# wiseflow/core/scrapers/general_scraper.py

# -*- coding: utf-8 -*-
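# General-purpose scraper for a source site: it tries the dedicated crawlers first
# (mp_crawler for WeChat articles, simple_crawler otherwise) and falls back to an
# LLM-based HTML parser (llm_crawler) when they cannot extract an article.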
import os
from urllib.parse import urlparse
import re
from .simple_crawler import simple_crawler
from .mp_crawler import mp_crawler
import httpx
from bs4 import BeautifulSoup
from bs4.element import Comment
from llms.openai_wrapper import openai_llm
# from llms.siliconflow_wrapper import sfa_llm
from datetime import datetime, date
from requests.compat import urljoin
import chardet
from utils.general_utils import extract_and_convert_dates
import asyncio
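# Model used by the LLM fallback parser; override via the HTML_PARSE_MODEL environment variable.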
model = os.environ.get('HTML_PARSE_MODEL', 'gpt-3.5-turbo')
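# Desktop-browser User-Agent sent with every outgoing request.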
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}
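# Filter for soup.find_all(string=True): keep only strings that would be visible on
# the rendered page (skip script/style/metadata text and HTML comments).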
def tag_visible(element) -> bool:
    if element.parent.name in ["style", "script", "head", "title", "meta", "[document]"]:
        return False
    if isinstance(element, Comment):
        return False
    return True
def text_from_soup(soup: BeautifulSoup) -> str:
    # Collect all visible text nodes and join them into one newline-separated string.
    res = []
    texts = soup.find_all(string=True)
    visible_texts = filter(tag_visible, texts)
    for v in visible_texts:
        res.append(v)
    text = "\n".join(res)
    return text.strip()
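# Parse the LLM reply (expected inside triple quotes as Title||Summary||Content||Date)
# into a dict with the keys title, abstract, content and publish_time.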
def parse_html_content(out: str) -> dict:
    dct = {'title': '', 'abstract': '', 'content': '', 'publish_time': ''}
    pattern = re.compile(r'\"\"\"(.*?)\"\"\"', re.DOTALL)
    result = pattern.findall(out)
    if not result:
        # The model did not wrap its answer in triple quotes.
        return dct
    result = result[0].strip()
    # The prompt asks for '||' separators, but a '|||' reply is handled as well;
    # try the longer separator first so it is not mis-split by '||'.
    dict_strs = result.split('|||')
    if len(dict_strs) < 3:
        dict_strs = result.split('||')
    if len(dict_strs) == 3:
        dct['title'] = dict_strs[0].strip()
        dct['content'] = dict_strs[1].strip()
    elif len(dict_strs) == 4:
        dct['title'] = dict_strs[0].strip()
        dct['abstract'] = dict_strs[1].strip()
        dct['content'] = dict_strs[2].strip()
    else:
        return dct
    date_str = extract_and_convert_dates(dict_strs[-1])
    if date_str:
        dct['publish_time'] = date_str
    else:
        # Fall back to today's date when no publish date could be recovered.
        dct['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")
    return dct
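# System prompt for the LLM fallback: the model must answer inside triple quotes,
# separating title, summary, content and publish date with '||'.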
sys_info = '''As an HTML parser, you'll receive a block of HTML code. Your task is to extract its title, summary, content, and publication date, with the date formatted as YYYY-MM-DD. Return the results in the following format (enclosed within triple quotes):
"""
Title||Summary||Content||Release Date YYYY-MM-DD
"""
'''
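# LLM-based fallback crawler: fetch the page, reduce it to visible text and ask the model
# to extract the article fields. Returns (flag, info) where 11 means success, 0 means the
# page was unusable or parsing failed, and -7 means the page could not be fetched.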
async def llm_crawler(url: str, logger) -> tuple[int, dict]:
    async with httpx.AsyncClient() as client:
        for retry in range(2):
            try:
                response = await client.get(url, headers=header, timeout=30)
                response.raise_for_status()
                break
            except Exception as e:
                if retry < 1:
                    logger.info(f"request {url} got error {e}\nwaiting 1min")
                    await asyncio.sleep(60)
                else:
                    logger.warning(f"request {url} got error {e}")
                    return -7, {}

    # Decode with the detected charset and reduce the page to its visible text.
    rawdata = response.content
    encoding = chardet.detect(rawdata)['encoding']
    text = rawdata.decode(encoding, errors='replace')
    soup = BeautifulSoup(text, "html.parser")
    html_text = text_from_soup(soup)
    html_lines = html_text.split('\n')
    html_lines = [line.strip() for line in html_lines if line.strip()]
    html_text = "\n".join(html_lines)
    if len(html_text) > 29999:
        logger.warning(f"{url} content too long for llm parsing")
        return 0, {}

    # Bail out on empty pages and on pages that start with a (Chinese) error message.
    if not html_text or html_text.startswith('服务器错误') or html_text.startswith('您访问的页面') \
            or html_text.startswith('403') or html_text.startswith('出错了'):
        logger.warning(f"can not get {url} from the Internet")
        return -7, {}

    messages = [
        {"role": "system", "content": sys_info},
        {"role": "user", "content": html_text}
    ]
    llm_output = openai_llm(messages, model=model, logger=logger)
    try:
        info = parse_html_content(llm_output)
    except Exception:
        msg = f"can not parse {llm_output}"
        logger.debug(msg)
        return 0, {}

    if len(info['title']) < 4 or len(info['content']) < 24:
        logger.debug(f"{info} not valid")
        return 0, {}

    info["url"] = url
    # Extract the picture links; the list stays empty if none can be extracted.
    image_links = []
    images = soup.find_all("img")
    for img in images:
        try:
            image_links.append(img["src"])
        except KeyError:
            continue
    info["images"] = image_links

    # Extract the author from the meta tag; empty string if it is not present.
    author_element = soup.find("meta", {"name": "author"})
    if author_element:
        info["author"] = author_element["content"]
    else:
        info["author"] = ""

    # Prefix the content (and the meta-description abstract) with the source site name.
    from_site = urlparse(url).netloc
    from_site = from_site.replace('www.', '')
    from_site = from_site.split('.')[0]
    info['content'] = f"[from {from_site}] {info['content']}"

    if not info['abstract']:
        meta_description = soup.find("meta", {"name": "description"})
        if meta_description:
            info['abstract'] = f"[from {from_site}] {meta_description['content'].strip()}"
        else:
            info['abstract'] = ''

    return 11, info
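# Scrape one source site. If the page exposes no links it is treated as a single article;
# otherwise every linked URL is crawled (dedicated crawlers first, llm_crawler as fallback).
# Links already in `existing` and articles older than `expiration` are skipped.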
async def general_scraper(site: str, expiration: date, existing: list[str], logger) -> list[dict]:
    async with httpx.AsyncClient() as client:
        for retry in range(2):
            try:
                response = await client.get(site, headers=header, timeout=30)
                response.raise_for_status()
                break
            except Exception as e:
                if retry < 1:
                    logger.info(f"request {site} got error {e}\nwaiting 1min")
                    await asyncio.sleep(60)
                else:
                    logger.warning(f"request {site} got error {e}")
                    return []

    page_source = response.text
    soup = BeautifulSoup(page_source, "html.parser")
    # Collect all outgoing links, resolved against the site's base URL.
    parsed_url = urlparse(site)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
    urls = [urljoin(base_url, link["href"]) for link in soup.find_all("a", href=True)]

    if not urls:
        # No links found -- maybe the site URL itself is an article page.
        logger.warning(f"can not find any link from {site}, maybe it's an article site...")
        if site in existing:
            logger.debug(f"{site} has been crawled before, skip it")
            return []

        if site.startswith('https://mp.weixin.qq.com') or site.startswith('http://mp.weixin.qq.com'):
            flag, result = await mp_crawler(site, logger)
        else:
            flag, result = await simple_crawler(site, logger)

        if flag == -7:
            # -7 means the html could not be fetched, so the other crawlers would not help either.
            return []

        if flag != 11:
            flag, result = await llm_crawler(site, logger)
            if flag != 11:
                return []

        publish_date = datetime.strptime(result['publish_time'], '%Y%m%d')
        if publish_date.date() < expiration:
            logger.debug(f"{site} is too old, skip it")
            return []
        else:
            return [result]

    # Then work through the linked articles one by one, still trying simple_crawler
    # (or mp_crawler) first and using llm_crawler only as a fallback.
    articles = []
    for url in urls:
        if url in existing:
            logger.debug(f"{url} has been crawled before, skip it")
            continue

        existing.append(url)
        if url.startswith('https://mp.weixin.qq.com') or url.startswith('http://mp.weixin.qq.com'):
            flag, result = await mp_crawler(url, logger)
        else:
            flag, result = await simple_crawler(url, logger)

        if flag == -7:
            # -7 means the html could not be fetched, so the other crawlers would not help either.
            continue

        if flag != 11:
            flag, result = await llm_crawler(url, logger)
            if flag != 11:
                continue

        publish_date = datetime.strptime(result['publish_time'], '%Y%m%d')
        if publish_date.date() < expiration:
            logger.debug(f"{url} is too old, skip it")
        else:
            articles.append(result)

    return articles