wiseflow/core/scrapers/general_crawler.py

# -*- coding: utf-8 -*-
# when you use this general crawler, remember the following:
# flag -7 means something went wrong while fetching the HTML
# flag 0 means something went wrong while parsing the content
# flag 1 means the input url is probably an article-list page; the result is a list of the article urls found on it
# flag 11 means success; the result is a dict containing the title, content, url, date and source of the article
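# illustrative caller sketch (an assumption for readability, not part of this module; real call sites live elsewhere in wiseflow):
#     flag, result = await general_crawler(url, logger)
#     if flag == 1:        # article-list page: result is a list of candidate article urls
#         ...
#     elif flag == 11:     # article page: result is the parsed article dict
#         ...
#     else:                # -7 fetch failure / 0 parse failure: result is an empty dict
#         ...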
from gne import GeneralNewsExtractor
import httpx
from bs4 import BeautifulSoup
from datetime import datetime
from urllib.parse import urlparse
from llms.openai_wrapper import openai_llm
# from llms.siliconflow_wrapper import sfa_llm
from bs4.element import Comment
from utils.general_utils import extract_and_convert_dates
import asyncio
import json_repair
import os
from typing import Union
from requests.compat import urljoin
from scrapers import scraper_map

model = os.environ.get('HTML_PARSE_MODEL', 'gpt-3.5-turbo')
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}
extractor = GeneralNewsExtractor()


def tag_visible(element: Comment) -> bool:
    # drop text nodes that live inside non-visible containers, and drop HTML comments
    if element.parent.name in ["style", "script", "head", "title", "meta", "[document]"]:
        return False
    if isinstance(element, Comment):
        return False
    return True


def text_from_soup(soup: BeautifulSoup) -> str:
    # collect the human-visible text nodes of the page, one per line
    res = []
    texts = soup.find_all(string=True)
    visible_texts = filter(tag_visible, texts)
    for v in visible_texts:
        res.append(v)
    text = "\n".join(res)
    return text.strip()

sys_info = '''Your task is to operate as an HTML content extractor, focusing on parsing a provided HTML segment. Your objective is to retrieve the following details directly from the raw text within the HTML, without summarizing or altering the content:
- The document's title
- The complete main content, as it appears in the HTML, comprising all textual elements considered part of the core article body
- The publication time in its original format found within the HTML
Ensure your response fits the following JSON structure, accurately reflecting the extracted data without modification:
```json
{
"title": "The Document's Exact Title",
"content": "All the unaltered primary text content from the article",
"publish_time": "Original Publication Time as per HTML"
}
```
It is essential that your output adheres strictly to this format, with each field filled based on the untouched information extracted directly from the HTML source.'''


async def general_crawler(url: str, logger) -> tuple[int, Union[list, dict]]:
    """
    Return a flag and the article information: a negative flag is an error, 0 means no result,
    1 means the url is an article-list page, 11 means success.

    main work flow:
    (weixin public account articles, whose urls start with mp.weixin.qq, are handled by mp_crawler)
    first fetch the page with httpx
    then judge whether it is an article-list page (return all article urls with flag 1) or an article detail page
    then try to extract the information with gne
    if that fails, fall back to an llm to analyse the html
    """
    # 0. if there's a dedicated scraper for this domain, use it (such as mp.weixin.qq.com)
    parsed_url = urlparse(url)
    domain = parsed_url.netloc
    if domain in scraper_map:
        return await scraper_map[domain](url, logger)

    # 1. fetch the page with httpx
    async with httpx.AsyncClient() as client:
        for retry in range(2):
            try:
                response = await client.get(url, headers=header, timeout=30)
                response.raise_for_status()
                break
            except Exception as e:
                if retry < 1:
                    logger.info(f"can not reach\n{e}\nwaiting 1min")
                    await asyncio.sleep(60)
                else:
                    logger.error(e)
                    return -7, {}

    # 2. judge whether it is an article-list page (return all article urls with flag 1) or an article detail page
    page_source = response.text
    if page_source:
        text = page_source
    else:
        # the decoded text is empty; retry decoding the raw bytes as utf-8, then gbk
        try:
            text = response.content.decode('utf-8')
        except UnicodeDecodeError:
            try:
                text = response.content.decode('gbk')
            except Exception as e:
                logger.error(f"can not decode html {e}")
                return -7, {}

    soup = BeautifulSoup(text, "html.parser")
    # Note: the scheme used here is very crude; it is recommended to write a dedicated parser for specific business scenarios
    # collect all same-domain urls; a short url pointing at a page full of such links is treated as an article-list page
    if len(url) < 50:
        base_url = f"{parsed_url.scheme}://{domain}"
        urls = set()
        for link in soup.find_all("a", href=True):
            absolute_url = urljoin(base_url, link["href"]).rstrip('/')
            if urlparse(absolute_url).netloc == domain and absolute_url != url:
                urls.add(absolute_url)

        if len(urls) > 30:
            logger.info(f"{url} is more like an article list page, find {len(urls)} urls with the same netloc")
            return 1, list(urls)

    # 3. try to extract the information with gne
    try:
        result = extractor.extract(text)
        if 'meta' in result:
            del result['meta']

        # bail out on common error pages and cookie walls that gne mistakes for content
        if result['title'].startswith('服务器错误') or result['title'].startswith('您访问的页面') \
                or result['title'].startswith('403') or result['title'].startswith('出错了') \
                or result['content'].startswith('This website uses cookies'):
            logger.warning(f"can not get {url} from the Internet")
            return -7, {}

        if len(result['title']) < 4 or len(result['content']) < 24:
            logger.info(f"gne extract not good: {result}")
            result = None
    except Exception as e:
        logger.info(f"gne extract error: {e}")
        result = None

    # 4. fall back to an llm to analyse the html
    if not result:
        html_text = text_from_soup(soup)
        html_lines = html_text.split('\n')
        html_lines = [line.strip() for line in html_lines if line.strip()]
        html_text = "\n".join(html_lines)
        if len(html_text) > 29999:
            logger.info(f"{url} content too long for llm parsing")
            return 0, {}

        if not html_text or html_text.startswith('服务器错误') or html_text.startswith('您访问的页面') \
                or html_text.startswith('403') or html_text.startswith('出错了'):
            logger.warning(f"can not get {url} from the Internet")
            return -7, {}

        messages = [
            {"role": "system", "content": sys_info},
            {"role": "user", "content": html_text}
        ]
        llm_output = openai_llm(messages, model=model, logger=logger, temperature=0.01)
        # repair_json(..., return_objects=True) turns a possibly malformed json reply into a Python object
        result = json_repair.repair_json(llm_output, return_objects=True)
        logger.debug(f"decoded_object: {result}")

        if not isinstance(result, dict):
            logger.debug("failed to parse from llm output")
            return 0, {}

        if 'title' not in result or 'content' not in result:
            logger.debug("llm parsed result not good")
            return 0, {}

    # Extract the image links; the list will be empty if none can be extracted.
    image_links = []
    images = soup.find_all("img")
    for img in images:
        try:
            image_links.append(img["src"])
        except KeyError:
            continue
    result["images"] = image_links

    # Extract the author information; it will be empty if it cannot be extracted.
    author_element = soup.find("meta", {"name": "author"})
    if author_element:
        result["author"] = author_element["content"]
    else:
        result["author"] = ""

    # 5. post process
    # the llm fallback is only required to return title and content, so guard against a missing publish_time
    date_str = extract_and_convert_dates(result['publish_time']) if result.get('publish_time') else None
    if date_str:
        result['publish_time'] = date_str
    else:
        result['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")

    from_site = domain.replace('www.', '')
    from_site = from_site.split('.')[0]
    result['content'] = f"[from {from_site}] {result['content']}"

    try:
        meta_description = soup.find("meta", {"name": "description"})
        if meta_description:
            result['abstract'] = f"[from {from_site}] {meta_description['content'].strip()}"
        else:
            result['abstract'] = ''
    except Exception:
        result['abstract'] = ''

    result['url'] = url
    return 11, result
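

if __name__ == '__main__':
    # Minimal manual-test sketch; an illustrative assumption, not part of the wiseflow pipeline.
    # It only exercises this module, using Python's standard logging module as the logger argument.
    import logging
    import sys

    logging.basicConfig(level=logging.INFO)
    _logger = logging.getLogger('general_crawler_test')
    # the fallback url below is a placeholder, not a real test target
    test_url = sys.argv[1] if len(sys.argv) > 1 else 'https://example.com/some-article'
    flag, parsed = asyncio.run(general_crawler(test_url, _logger))
    print(flag, parsed)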