# wiseflow/core/scrapers/general_crawler.py
# -*- coding: utf-8 -*-
# When you use this general crawler, remember the following:
# a flag of -7 means the problem occurred in the HTML fetch process,
# a flag of 0 means the problem occurred during content parsing,
# and a flag of 11 means the article was parsed successfully.
from gne import GeneralNewsExtractor
import httpx
from bs4 import BeautifulSoup
from datetime import datetime
from urllib.parse import urlparse
from llms.openai_wrapper import openai_llm
# from llms.siliconflow_wrapper import sfa_llm
from bs4.element import Comment
import chardet
from utils.general_utils import extract_and_convert_dates
import asyncio
import json_repair
import os
model = os.environ.get('HTML_PARSE_MODEL', 'gpt-3.5-turbo')
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'
}
extractor = GeneralNewsExtractor()


def tag_visible(element) -> bool:
    # keep only text nodes that are actually rendered: drop nodes inside
    # script/style/head/etc. containers and drop HTML comments
    if element.parent.name in ["style", "script", "head", "title", "meta", "[document]"]:
        return False
    if isinstance(element, Comment):
        return False
    return True


def text_from_soup(soup: BeautifulSoup) -> str:
    res = []
    texts = soup.find_all(string=True)
    visible_texts = filter(tag_visible, texts)
    for v in visible_texts:
        res.append(v)
    text = "\n".join(res)
    return text.strip()


sys_info = '''Your role is to function as an HTML parser, tasked with analyzing a segment of HTML code. Extract the following metadata from the given HTML snippet: the document's title, summary or abstract, main content, and the publication date. Ensure that your response adheres to the JSON format outlined below, encapsulating the extracted information accurately:
```json
{
    "title": "The Document's Title",
    "abstract": "A concise overview or summary of the content",
    "content": "The primary textual content of the article",
    "publish_date": "The publication date in YYYY-MM-DD format"
}
```
Please structure your output precisely as demonstrated, with each field populated correspondingly to the details found within the HTML code.
'''


async def general_crawler(url: str, logger) -> tuple[int, dict]:
    """
    Fetch one article page and return a (flag, article_info) tuple.

    A negative flag means an error, 0 means no usable result, and 11 means success.

    Main workflow:
        1. fetch the page content with httpx
        2. try to extract the article with gne
        3. if that fails, fall back to an LLM to parse the HTML
    """
    async with httpx.AsyncClient() as client:
        for retry in range(2):
            try:
                response = await client.get(url, headers=header, timeout=30)
                response.raise_for_status()
                break
            except Exception as e:
                if retry < 1:
                    logger.info(f"request {url} got error {e}\nwaiting 1min")
                    await asyncio.sleep(60)
                else:
                    logger.warning(f"request {url} got error {e}")
                    return -7, {}

    # decode with the detected encoding; fall back to utf-8 when chardet cannot tell
    rawdata = response.content
    encoding = chardet.detect(rawdata)['encoding'] or 'utf-8'
    text = rawdata.decode(encoding, errors='replace')
    soup = BeautifulSoup(text, "html.parser")

    try:
        result = extractor.extract(text)
    except Exception as e:
        logger.info(f"gne extract error: {e}")
        result = None

    if result:
        # typical error-page markers; the Chinese strings mean "server error",
        # "the page you visited ..." and "something went wrong"
        if result['title'].startswith('服务器错误') or result['title'].startswith('您访问的页面') \
                or result['title'].startswith('403') \
                or result['content'].startswith('This website uses cookies') \
                or result['title'].startswith('出错了'):
            logger.warning(f"can not get {url} from the Internet")
            return -7, {}
        if len(result['title']) < 4 or len(result['content']) < 24:
            logger.info(f"gne extract not good: {result}")
            result = None

    if result:
        info = result
        abstract = ''
    else:
        html_text = text_from_soup(soup)
        html_lines = html_text.split('\n')
        html_lines = [line.strip() for line in html_lines if line.strip()]
        html_text = "\n".join(html_lines)
        if len(html_text) > 29999:
            logger.info(f"{url} content too long for llm parsing")
            return 0, {}
        if not html_text or html_text.startswith('服务器错误') or html_text.startswith('您访问的页面') \
                or html_text.startswith('403') or html_text.startswith('出错了'):
            logger.warning(f"can not get {url} from the Internet")
            return -7, {}

        messages = [
            {"role": "system", "content": sys_info},
            {"role": "user", "content": html_text}
        ]
        llm_output = openai_llm(messages, model=model, logger=logger)
        decoded_object = json_repair.repair_json(llm_output, return_objects=True)
        logger.debug(f"decoded_object: {decoded_object}")
        if not isinstance(decoded_object, dict):
            logger.debug("failed to parse from llm output")
            return 0, {}
        if 'title' not in decoded_object or 'content' not in decoded_object:
            logger.debug("llm parsed result not good")
            return 0, {}
        info = {'title': decoded_object['title'], 'content': decoded_object['content']}
        abstract = decoded_object.get('abstract', '')
        info['publish_time'] = decoded_object.get('publish_date', '')

    # Extract image links; the list stays empty when none can be found.
    image_links = []
    images = soup.find_all("img")
    for img in images:
        try:
            image_links.append(img["src"])
        except KeyError:
            continue
    info["images"] = image_links

    # Extract the author from the meta tag; empty string when it is missing.
    author_element = soup.find("meta", {"name": "author"})
    if author_element:
        info["author"] = author_element["content"]
    else:
        info["author"] = ""

    date_str = extract_and_convert_dates(info['publish_time'])
    if date_str:
        info['publish_time'] = date_str
    else:
        info['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")

    from_site = urlparse(url).netloc
    from_site = from_site.replace('www.', '')
    from_site = from_site.split('.')[0]
    info['content'] = f"[from {from_site}] {info['content']}"

    try:
        meta_description = soup.find("meta", {"name": "description"})
        if meta_description:
            info['abstract'] = f"[from {from_site}] {meta_description['content'].strip()}"
        elif abstract:
            info['abstract'] = f"[from {from_site}] {abstract.strip()}"
        else:
            info['abstract'] = ''
    except Exception:
        if abstract:
            info['abstract'] = f"[from {from_site}] {abstract.strip()}"
        else:
            info['abstract'] = ''

    info['url'] = url
    return 11, info
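

# --- usage sketch --------------------------------------------------------------
# A minimal, assumed example of how a caller might drive general_crawler from the
# command line; the logger configuration and the fallback URL are illustrative
# assumptions, not part of wiseflow's actual entry points.
if __name__ == "__main__":
    import logging
    import sys

    logging.basicConfig(level=logging.INFO)
    demo_logger = logging.getLogger("general_crawler_demo")
    demo_url = sys.argv[1] if len(sys.argv) > 1 else "https://example.com/news/1"

    flag, article = asyncio.run(general_crawler(demo_url, demo_logger))
    print(f"flag: {flag}")
    if flag == 11:
        print(article.get("title", ""))
        print(article.get("publish_time", ""))
        print(article.get("abstract", ""))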