from pathlib import Path
from urllib.parse import urlparse
import re
from .simple_crawler import simple_crawler
import httpx
from bs4 import BeautifulSoup
from bs4.element import Comment
from llms.dashscope_wrapper import dashscope_llm
# from llms.openai_wrapper import openai_llm
# from llms.siliconflow_wrapper import sfa_llm
from datetime import datetime, date
from requests.compat import urljoin
import chardet
from utils.general_utils import extract_and_convert_dates

model = "qwen-long"
# model = "deepseek-chat"
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}


def tag_visible(element) -> bool:
    # `element` is a bs4 text node; drop text inside non-content tags and HTML comments
    if element.parent.name in ["style", "script", "head", "title", "meta", "[document]"]:
        return False
    if isinstance(element, Comment):
        return False
    return True


def text_from_soup(soup: BeautifulSoup) -> str:
    res = []
    texts = soup.find_all(string=True)
    visible_texts = filter(tag_visible, texts)
    for v in visible_texts:
        res.append(v)
    text = "\n".join(res)
    return text.strip()

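# Example (illustrative): text_from_soup(BeautifulSoup("<p>Hi</p><script>x()</script>", "html.parser"))
# returns "Hi" -- text inside script/style/head/title/meta tags and HTML comments is dropped by tag_visible.
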
def parse_html_content(out: str) -> dict:
    dct = {'title': '', 'abstract': '', 'content': '', 'publish_time': ''}
    pattern = re.compile(r'\"\"\"(.*?)\"\"\"', re.DOTALL)
    result = pattern.findall(out)
    if not result:
        # the model did not wrap its answer in triple quotes
        return dct
    result = result[0].strip()
    # the model may emit '||' or '|||' as the field separator; since '|||' contains '||', check for it first
    dict_strs = result.split('|||') if '|||' in result else result.split('||')
    if len(dict_strs) == 3:
        dct['title'] = dict_strs[0].strip()
        dct['content'] = dict_strs[1].strip()
    elif len(dict_strs) == 4:
        dct['title'] = dict_strs[0].strip()
        dct['abstract'] = dict_strs[1].strip()
        dct['content'] = dict_strs[2].strip()
    else:
        return dct
    date_str = extract_and_convert_dates(dict_strs[-1])
    if date_str:
        dct['publish_time'] = date_str
    else:
        dct['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")
    return dct

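# Illustrative example, assuming the model follows the prompt format defined below and that
# extract_and_convert_dates() returns a YYYYMMDD string (the format callers later parse with '%Y%m%d'):
#   parse_html_content('"""Some Title||A short abstract||The article body||2024-04-25"""')
#   -> {'title': 'Some Title', 'abstract': 'A short abstract',
#       'content': 'The article body', 'publish_time': '20240425'}
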
# qwen1.5-72b is too error-prone when asked to emit JSON -- web pages vary too much (for example they often
# contain bare English double quotes), so a later json.loads would break easily. The prompt therefore asks for
# a '||'-delimited record wrapped in triple quotes, which parse_html_content() splits.
# The prompt is kept in Chinese because the target model is qwen; it reads roughly: "You are an HTML parser.
# You will receive HTML code; extract the title, abstract, content and publish date (YYYY-MM-DD) and return
# them as title||abstract||content||publish date, with the whole answer wrapped in triple quotes."
sys_info = '''你是一个html解析器,你将接收一段html代码,请解析出其标题、摘要、内容和发布日期,发布日期格式为YYYY-MM-DD。
结果请按照以下格式返回(整体用三引号包裹):
"""
标题||摘要||内容||发布日期XXXX-XX-XX
"""
'''


def llm_crawler(url: str | Path, logger) -> tuple[int, dict]:
    """
    Returns (flag, article-info dict): a negative flag means an error occurred,
    0 means no usable result and 11 means success.
    Reference: https://mp.weixin.qq.com/s/4J-kofsfFDiV1FxGlTJLfA
    """
    # Fetch the page over HTTP and decode it with the detected charset
    try:
        with httpx.Client() as client:
            response = client.get(url, headers=header, timeout=30)
            rawdata = response.content
            encoding = chardet.detect(rawdata)['encoding']
            text = rawdata.decode(encoding)
    except Exception as e:
        logger.error(e)
        return -7, {}

    # Parse the HTML with BeautifulSoup and keep only the visible text
    soup = BeautifulSoup(text, "html.parser")
    html_text = text_from_soup(soup)
    html_lines = html_text.split('\n')
    html_lines = [line.strip() for line in html_lines if line.strip()]
    html_text = "\n".join(html_lines)
    if len(html_text) > 29999:
        logger.warning(f"{url} content too long for llm parsing")
        return 0, {}

    # Bail out on common error pages (the Chinese literals below match "server error" / "page not found" pages)
    if not html_text or html_text.startswith('服务器错误') or html_text.startswith('您访问的页面') or html_text.startswith('403') \
            or html_text.startswith('出错了'):
        logger.warning(f"can not get {url} from the Internet")
        return -7, {}

    # Ask the LLM to extract title / abstract / content / publish date from the visible text
    messages = [
        {"role": "system", "content": sys_info},
        {"role": "user", "content": html_text}
    ]
    llm_output = dashscope_llm(messages, model=model, logger=logger)
    # llm_output = openai_llm(messages, model=model, logger=logger)
    try:
        info = parse_html_content(llm_output)
    except Exception:
        msg = f"can not parse {llm_output}"
        logger.debug(msg)
        return 0, {}

    if len(info['title']) < 4 or len(info['content']) < 24:
        logger.debug(f"{info} not valid")
        return 0, {}

info["url"] = str(url)
|
||
# 提取图片链接,提取不到就空着
|
||
image_links = []
|
||
images = soup.find_all("img")
|
||
|
||
for img in images:
|
||
try:
|
||
# 不是所有网站都是这个结构,比如公众号文章的就不行。
|
||
image_links.append(img["src"])
|
||
except KeyError:
|
||
continue
|
||
info["images"] = image_links
|
||
|
||
    # Extract author info; leave empty if not found
    author_element = soup.find("meta", {"name": "author"})
    if author_element:
        info["author"] = author_element["content"]
    else:
        info["author"] = ""

    # Fall back to the page's meta description when the LLM did not return an abstract
    if not info['abstract']:
        meta_description = soup.find("meta", {"name": "description"})
        if meta_description:
            info['abstract'] = meta_description["content"]
        else:
            info['abstract'] = ''

    return 11, info


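# Minimal usage sketch (hypothetical; any object exposing .error/.warning/.debug works as `logger`):
#   import logging
#   logger = logging.getLogger("scraper")
#   flag, article = llm_crawler("https://example.com/some-article", logger)
#   if flag == 11:
#       print(article["title"], article["publish_time"])
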
def general_scraper(site: str, expiration: date, existing: list[str], logger) -> list[dict]:
    try:
        with httpx.Client() as client:
            response = client.get(site, headers=header, timeout=30)
    except Exception as e:
        logger.error(e)
        return []

    page_source = response.text
    soup = BeautifulSoup(page_source, "html.parser")
    # Collect every hyperlink on the page, resolved against the site's base URL
    parsed_url = urlparse(site)
    base_url = parsed_url.scheme + '://' + parsed_url.netloc
    urls = [urljoin(base_url, link["href"]) for link in soup.find_all("a", href=True)]
    if not urls:
        # No links found -- the page is probably an article itself, so crawl it directly:
        # try simple_crawler first and fall back to llm_crawler
        logger.warning(f"can not find any link from {site}, maybe it's an article site...")
        if site in existing:
            logger.debug(f"{site} has been crawled before, skip it")
            return []
        flag, result = simple_crawler(site, logger)
        if flag != 11:
            flag, result = llm_crawler(site, logger)
            if flag != 11:
                return []
        publish_date = datetime.strptime(result['publish_time'], '%Y%m%d')
        if publish_date.date() < expiration:
            logger.debug(f"{site} is too old, skip it")
            return []
        else:
            return [result]
    # Then parse each linked article, again trying simple_crawler first and falling back to llm_crawler
    articles = []
    for url in urls:
        if url in existing:
            logger.debug(f"{url} has been crawled before, skip it")
            continue
        existing.append(url)
        flag, result = simple_crawler(url, logger)
        if flag != 11:
            flag, result = llm_crawler(url, logger)
            if flag != 11:
                continue
        publish_date = datetime.strptime(result['publish_time'], '%Y%m%d')
        if publish_date.date() < expiration:
            logger.debug(f"{url} is too old, skip it")
        else:
            articles.append(result)

    return articles
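

# Hypothetical end-to-end sketch (not part of the original module). Because of the relative import of
# simple_crawler above, run it as a module (python -m <package>.<module>); the site URL and expiration
# date below are made-up examples.
if __name__ == "__main__":
    import logging

    logging.basicConfig(level=logging.DEBUG)
    _logger = logging.getLogger("general_scraper")
    _existing: list[str] = []
    for _article in general_scraper("https://example.com/news", date(2024, 1, 1), _existing, _logger):
        print(_article["url"], _article.get("publish_time", ""))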