# wiseflow/core/scrapers/general_crawler.py
# -*- coding: utf-8 -*-
# When you use this general crawler, keep the following in mind:
# - flag -7 means the problem occurred while fetching the HTML.
# - flag 0  means the problem occurred while parsing the content.
# - flag 1  means the input url is probably an article-list page; the returned set
#           contains the urls of the articles found on it.
# - flag 11 means success; the returned dict contains the title, content, url,
#           publish date, and source of the article.
# See the usage sketch at the bottom of this file for how a caller can branch on these flags.
from gne import GeneralNewsExtractor
import httpx
from bs4 import BeautifulSoup
from datetime import datetime
from urllib.parse import urlparse
from llms.openai_wrapper import openai_llm
# from llms.siliconflow_wrapper import sfa_llm
from bs4.element import Comment
from utils.general_utils import extract_and_convert_dates
import asyncio
import json_repair
import os
from typing import Union
from requests.compat import urljoin
from scrapers import scraper_map
model = os.environ.get('HTML_PARSE_MODEL', 'gpt-3.5-turbo')
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'
}
extractor = GeneralNewsExtractor()


def tag_visible(element: Comment) -> bool:
    if element.parent.name in ["style", "script", "head", "title", "meta", "[document]"]:
        return False
    if isinstance(element, Comment):
        return False
    return True


def text_from_soup(soup: BeautifulSoup) -> str:
    res = []
    texts = soup.find_all(string=True)
    visible_texts = filter(tag_visible, texts)
    for v in visible_texts:
        res.append(v)
    text = "\n".join(res)
    return text.strip()
sys_info = '''Your task is to operate as an HTML content extractor, focusing on parsing a provided HTML segment. Your objective is to retrieve the following details directly from the raw text within the HTML, without summarizing or altering the content:
- The document's title
- The complete main content, as it appears in the HTML, comprising all textual elements considered part of the core article body
- The publication time in its original format found within the HTML
Ensure your response fits the following JSON structure, accurately reflecting the extracted data without modification:
```json
{
    "title": "The Document's Exact Title",
    "content": "All the unaltered primary text content from the article",
    "publish_time": "Original Publication Time as per HTML"
}
```
It is essential that your output adheres strictly to this format, with each field filled based on the untouched information extracted directly from the HTML source.'''


async def general_crawler(url: str, logger) -> tuple[int, Union[set, dict]]:
    """
    Return a flag and the article information. A negative flag means an error, 0 means no result,
    1 means the url is an article-list page, and 11 means success.

    Main workflow:
    (weixin public-account articles, whose urls start with mp.weixin.qq, are handled by mp_crawler via scraper_map)
    1. fetch the page with httpx
    2. judge whether it is an article-list page (return all article urls with flag 1) or an article detail page
    3. try to extract the information with gne
    4. if that fails, fall back to an LLM to analyse the html
    """
    # 0. if there is a dedicated scraper for this domain, use it (such as mp.weixin.qq.com)
    parsed_url = urlparse(url)
    domain = parsed_url.netloc
    base_url = f"{parsed_url.scheme}://{domain}"
    if domain in scraper_map:
        return await scraper_map[domain](url, logger)
    # 1. fetch the page with httpx
    async with httpx.AsyncClient() as client:
        for retry in range(2):
            try:
                response = await client.get(url, headers=header, timeout=30)
                response.raise_for_status()
                break
            except Exception as e:
                if retry < 1:
                    logger.info(f"cannot reach {url}\n{e}\nwaiting 1 min")
                    await asyncio.sleep(60)
                else:
                    logger.error(e)
                    return -7, {}
    # 2. judge whether it is an article-list page (return all article urls with flag 1) or an article detail page
    page_source = response.text
    if page_source:
        text = page_source
    else:
        try:
            text = response.content.decode('utf-8')
        except UnicodeDecodeError:
            try:
                text = response.content.decode('gbk')
            except Exception as e:
                logger.error(f"cannot decode html: {e}")
                return -7, {}

    soup = BeautifulSoup(text, "html.parser")
    # Note: the scheme used here is very crude;
    # it is recommended to write a dedicated parser for specific business scenarios.
    # Parse all URLs
    if len(url) < 50:
        urls = set()
        for link in soup.find_all("a", href=True):
            absolute_url = urljoin(base_url, link["href"])
            format_url = urlparse(absolute_url)
            # only record links under the same domain
            if not format_url.netloc or format_url.netloc != domain:
                continue
            # remove the hash fragment but keep path, params and query
            absolute_url = format_url._replace(fragment='').geturl()
            if absolute_url != url:
                urls.add(absolute_url)

        if len(urls) > 24:
            logger.info(f"{url} is more like an article list page, found {len(urls)} urls with the same netloc")
            return 1, urls
    # 3. try to extract the information with gne
    try:
        result = extractor.extract(text)
        if 'meta' in result:
            del result['meta']

        if result['title'].startswith('服务器错误') or result['title'].startswith('您访问的页面') \
                or result['title'].startswith('403') or result['title'].startswith('出错了') \
                or result['content'].startswith('This website uses cookies'):
            logger.warning(f"cannot get {url} from the Internet")
            return -7, {}

        if len(result['title']) < 4 or len(result['content']) < 24:
            logger.info(f"gne extract not good: {result}")
            result = None
    except Exception as e:
        logger.info(f"gne extract error: {e}")
        result = None
    # 4. fall back to an LLM to analyse the html
    if not result:
        html_text = text_from_soup(soup)
        html_lines = html_text.split('\n')
        html_lines = [line.strip() for line in html_lines if line.strip()]
        html_text = "\n".join(html_lines)
        if len(html_text) > 29999:
            logger.info(f"{url} content too long for llm parsing")
            return 0, {}

        if not html_text or html_text.startswith('服务器错误') or html_text.startswith('您访问的页面') \
                or html_text.startswith('403') or html_text.startswith('出错了'):
            logger.warning(f"cannot get {url} from the Internet")
            return -7, {}

        messages = [
            {"role": "system", "content": sys_info},
            {"role": "user", "content": html_text}
        ]
        llm_output = openai_llm(messages, model=model, logger=logger, temperature=0.01)
        result = json_repair.repair_json(llm_output, return_objects=True)
        logger.debug(f"decoded_object: {result}")
        if not isinstance(result, dict):
            logger.debug("failed to parse a dict from the llm output")
            return 0, {}
        if 'title' not in result or 'content' not in result:
            logger.debug("llm parsed result is not good")
            return 0, {}
        # extract image links from the page; the list is empty if none can be found
        image_links = []
        images = soup.find_all("img")
        for img in images:
            try:
                image_links.append(urljoin(base_url, img["src"]))
            except KeyError:
                continue
        result["images"] = image_links

        # extract the author from the meta tag; empty string if it cannot be extracted
        author_element = soup.find("meta", {"name": "author"})
        if author_element:
            result["author"] = author_element["content"]
        else:
            result["author"] = ""
    # 5. post process
    publish_time = result.get('publish_time', '')
    date_str = extract_and_convert_dates(publish_time) if publish_time else None
    if date_str:
        result['publish_time'] = date_str
    else:
        result['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")

    from_site = domain.replace('www.', '')
    from_site = from_site.split('.')[0]
    result['content'] = f"[from {from_site}] {result['content']}"

    try:
        meta_description = soup.find("meta", {"name": "description"})
        if meta_description:
            result['abstract'] = f"[from {from_site}] {meta_description['content'].strip()}"
        else:
            result['abstract'] = ''
    except Exception:
        result['abstract'] = ''

    result['url'] = url
    return 11, result
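

# Usage sketch: a minimal example of how a caller might invoke general_crawler and
# branch on the returned flags. It assumes a plain logging.Logger and a throwaway
# demo URL passed on the command line; real callers wire in their own logger and
# task scheduling.
if __name__ == "__main__":
    import logging
    import sys

    logging.basicConfig(level=logging.INFO)
    demo_logger = logging.getLogger("general_crawler_demo")

    async def _demo(start_url: str) -> None:
        flag, payload = await general_crawler(start_url, demo_logger)
        if flag == -7:
            demo_logger.error("fetch failed")
        elif flag == 0:
            demo_logger.warning("fetched the page but could not parse an article")
        elif flag == 1:
            # payload is a set of same-domain article urls found on a list page
            demo_logger.info(f"list page with {len(payload)} candidate article urls")
        elif flag == 11:
            # payload is the parsed article dict
            demo_logger.info(f"article: {payload['title']} ({payload['publish_time']})")

    asyncio.run(_demo(sys.argv[1] if len(sys.argv) > 1 else "https://example.com"))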