# wiseflow/core/scrapers/general_scraper.py

# -*- coding: utf-8 -*-
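# General-purpose scraper for a source site: it tries the dedicated crawlers first
# (mp_crawler for WeChat articles, simple_crawler otherwise) and falls back to an
# LLM-based HTML parser (llm_crawler) when they cannot extract an article.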
import os
from urllib.parse import urlparse
import re
from .simple_crawler import simple_crawler
from .mp_crawler import mp_crawler
import httpx
from bs4 import BeautifulSoup
from bs4.element import Comment
from llms.openai_wrapper import openai_llm
# from llms.siliconflow_wrapper import sfa_llm
from datetime import datetime, date
from requests.compat import urljoin
import chardet
from utils.general_utils import extract_and_convert_dates
import asyncio
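# Model used by the LLM fallback parser; override via the HTML_PARSE_MODEL environment variable.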
model = os.environ.get('HTML_PARSE_MODEL', 'gpt-3.5-turbo')
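# Desktop-browser User-Agent sent with every outgoing request.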
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}
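# Filter for soup.find_all(string=True): keep only strings that would be visible on
# the rendered page (skip script/style/metadata text and HTML comments).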
def tag_visible(element) -> bool:
    if element.parent.name in ["style", "script", "head", "title", "meta", "[document]"]:
        return False
    if isinstance(element, Comment):
        return False
    return True
def text_from_soup(soup: BeautifulSoup) -> str:
    # Collect all visible text nodes and join them into one newline-separated string.
    res = []
    texts = soup.find_all(string=True)
    visible_texts = filter(tag_visible, texts)
    for v in visible_texts:
        res.append(v)
    text = "\n".join(res)
    return text.strip()
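# Parse the LLM reply (expected inside triple quotes as Title||Summary||Content||Date)
# into a dict with the keys title, abstract, content and publish_time.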
def parse_html_content(out: str) -> dict:
    dct = {'title': '', 'abstract': '', 'content': '', 'publish_time': ''}
    pattern = re.compile(r'\"\"\"(.*?)\"\"\"', re.DOTALL)
    result = pattern.findall(out)
    if not result:
        # The model did not wrap its answer in triple quotes.
        return dct
    result = result[0].strip()
    # The prompt asks for '||' separators, but a '|||' reply is handled as well;
    # try the longer separator first so it is not mis-split by '||'.
    dict_strs = result.split('|||')
    if len(dict_strs) < 3:
        dict_strs = result.split('||')
    if len(dict_strs) == 3:
        dct['title'] = dict_strs[0].strip()
        dct['content'] = dict_strs[1].strip()
    elif len(dict_strs) == 4:
        dct['title'] = dict_strs[0].strip()
        dct['abstract'] = dict_strs[1].strip()
        dct['content'] = dict_strs[2].strip()
    else:
        return dct
    date_str = extract_and_convert_dates(dict_strs[-1])
    if date_str:
        dct['publish_time'] = date_str
    else:
        # Fall back to today's date when no publish date could be recovered.
        dct['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")
    return dct
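# System prompt for the LLM fallback: the model must answer inside triple quotes,
# separating title, summary, content and publish date with '||'.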
sys_info = '''As an HTML parser, you'll receive a block of HTML code. Your task is to extract its title, summary, content, and publication date, with the date formatted as YYYY-MM-DD. Return the results in the following format (enclosed within triple quotes):
"""
Title||Summary||Content||Release Date YYYY-MM-DD
"""
'''
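# LLM-based fallback crawler: fetch the page, reduce it to visible text and ask the model
# to extract the article fields. Returns (flag, info) where 11 means success, 0 means the
# page was unusable or parsing failed, and -7 means the page could not be fetched.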
async def llm_crawler(url: str, logger) -> tuple[int, dict]:
    async with httpx.AsyncClient() as client:
        for retry in range(2):
            try:
                response = await client.get(url, headers=header, timeout=30)
                response.raise_for_status()
                break
            except Exception as e:
                if retry < 1:
                    logger.info(f"request {url} got error {e}\nwaiting 1min")
                    await asyncio.sleep(60)
                else:
                    logger.warning(f"request {url} got error {e}")
                    return -7, {}

    # Decode with the detected charset and reduce the page to its visible text.
    rawdata = response.content
    encoding = chardet.detect(rawdata)['encoding']
    text = rawdata.decode(encoding, errors='replace')
    soup = BeautifulSoup(text, "html.parser")
    html_text = text_from_soup(soup)
    html_lines = html_text.split('\n')
    html_lines = [line.strip() for line in html_lines if line.strip()]
    html_text = "\n".join(html_lines)
    if len(html_text) > 29999:
        logger.warning(f"{url} content too long for llm parsing")
        return 0, {}

    # Bail out on empty pages and on pages that start with a (Chinese) error message.
    if not html_text or html_text.startswith('服务器错误') or html_text.startswith('您访问的页面') \
            or html_text.startswith('403') or html_text.startswith('出错了'):
        logger.warning(f"can not get {url} from the Internet")
        return -7, {}

    messages = [
        {"role": "system", "content": sys_info},
        {"role": "user", "content": html_text}
    ]
    llm_output = openai_llm(messages, model=model, logger=logger)
    try:
        info = parse_html_content(llm_output)
    except Exception:
        msg = f"can not parse {llm_output}"
        logger.debug(msg)
        return 0, {}

    if len(info['title']) < 4 or len(info['content']) < 24:
        logger.debug(f"{info} not valid")
        return 0, {}

    info["url"] = url
    # Extract the picture links; the list stays empty if none can be extracted.
    image_links = []
    images = soup.find_all("img")
    for img in images:
        try:
            image_links.append(img["src"])
        except KeyError:
            continue
    info["images"] = image_links

    # Extract the author from the meta tag; empty string if it is not present.
    author_element = soup.find("meta", {"name": "author"})
    if author_element:
        info["author"] = author_element["content"]
    else:
        info["author"] = ""

    # Prefix the content (and the meta-description abstract) with the source site name.
    from_site = urlparse(url).netloc
    from_site = from_site.replace('www.', '')
    from_site = from_site.split('.')[0]
    info['content'] = f"[from {from_site}] {info['content']}"

    if not info['abstract']:
        meta_description = soup.find("meta", {"name": "description"})
        if meta_description:
            info['abstract'] = f"[from {from_site}] {meta_description['content'].strip()}"
        else:
            info['abstract'] = ''

    return 11, info
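# Scrape one source site. If the page exposes no links it is treated as a single article;
# otherwise every linked URL is crawled (dedicated crawlers first, llm_crawler as fallback).
# Links already in `existing` and articles older than `expiration` are skipped.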
async def general_scraper(site: str, expiration: date, existing: list[str], logger) -> list[dict]:
    async with httpx.AsyncClient() as client:
        for retry in range(2):
            try:
                response = await client.get(site, headers=header, timeout=30)
                response.raise_for_status()
                break
            except Exception as e:
                if retry < 1:
                    logger.info(f"request {site} got error {e}\nwaiting 1min")
                    await asyncio.sleep(60)
                else:
                    logger.warning(f"request {site} got error {e}")
                    return []

    page_source = response.text
    soup = BeautifulSoup(page_source, "html.parser")
    # Collect all outgoing links, resolved against the site's base URL.
    parsed_url = urlparse(site)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
    urls = [urljoin(base_url, link["href"]) for link in soup.find_all("a", href=True)]

    if not urls:
        # No links found -- maybe the site URL itself is an article page.
        logger.warning(f"can not find any link from {site}, maybe it's an article site...")
        if site in existing:
            logger.debug(f"{site} has been crawled before, skip it")
            return []

        if site.startswith('https://mp.weixin.qq.com') or site.startswith('http://mp.weixin.qq.com'):
            flag, result = await mp_crawler(site, logger)
        else:
            flag, result = await simple_crawler(site, logger)

        if flag == -7:
            # -7 means the html could not be fetched, so the other crawlers would not help either.
            return []

        if flag != 11:
            flag, result = await llm_crawler(site, logger)
            if flag != 11:
                return []

        publish_date = datetime.strptime(result['publish_time'], '%Y%m%d')
        if publish_date.date() < expiration:
            logger.debug(f"{site} is too old, skip it")
            return []
        else:
            return [result]

    # Then work through the linked articles one by one, still trying simple_crawler
    # (or mp_crawler) first and using llm_crawler only as a fallback.
    articles = []
    for url in urls:
        if url in existing:
            logger.debug(f"{url} has been crawled before, skip it")
            continue

        existing.append(url)
        if url.startswith('https://mp.weixin.qq.com') or url.startswith('http://mp.weixin.qq.com'):
            flag, result = await mp_crawler(url, logger)
        else:
            flag, result = await simple_crawler(url, logger)

        if flag == -7:
            # -7 means the html could not be fetched, so the other crawlers would not help either.
            continue

        if flag != 11:
            flag, result = await llm_crawler(url, logger)
            if flag != 11:
                continue

        publish_date = datetime.strptime(result['publish_time'], '%Y%m%d')
        if publish_date.date() < expiration:
            logger.debug(f"{url} is too old, skip it")
        else:
            articles.append(result)

    return articles