stricter crawler filter

bigbrother666 2024-04-08 17:58:29 +08:00
parent 53fc3d0646
commit 25abb316b3
7 changed files with 101 additions and 30 deletions

View File

@@ -178,16 +178,16 @@ class BackendService:
return self.build_out(-2, 'insight not found')
article_ids = insight[0]['articles']
if not article_ids:
self.logger.error(f'insight {insight_id} has no articles')
return self.build_out(-2, 'can not find articles for insight')
if article_ids:
article_list = [self.pb.read('articles', fields=['url'], filter=f'id="{_id}"') for _id in article_ids]
url_list = [_article[0]['url'] for _article in article_list if _article]
else:
url_list = []
article_list = [self.pb.read('articles', fields=['url'], filter=f'id="{_id}"') for _id in article_ids]
url_list = [_article[0]['url'] for _article in article_list if _article]
flag, search_result = search_insight(insight[0]['content'], url_list, logger=self.logger)
if flag <= 0:
self.logger.debug('no search result, nothing happen')
return self.build_out(flag, '')
return self.build_out(flag, 'search engine error or no result')
for item in search_result:
new_article_id = self.pb.add(collection_name='articles', body=item)
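
For reference, the tightened lookup flow in this hunk can be sketched on its own as below. The pb object stands in for the project's PocketBase-style client; only the read(collection, fields, filter) call visible in the diff is assumed, and collect_urls is an illustrative name, not a function in the repository.

def collect_urls(pb, insight: dict, logger=None) -> list:
    # Bail out early when the insight has no linked articles, otherwise resolve
    # every article id to its stored url (read() returns a list of matching records).
    article_ids = insight.get('articles', [])
    if not article_ids:
        if logger:
            logger.error('insight has no articles')
        return []
    article_list = [pb.read('articles', fields=['url'], filter=f'id="{_id}"') for _id in article_ids]
    return [_article[0]['url'] for _article in article_list if _article]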

View File

@@ -16,8 +16,9 @@ else:
urls = []
sp = ServiceProcesser()
sp(sites=urls)
'''
def task():
sp(sites=urls)
@@ -28,3 +29,6 @@ schedule.every().day.at("01:17").do(task)
while True:
schedule.run_pending()
time.sleep(60)
site1 = https://www.welivesecurity.com/en/
site2 = https://www.scmagazine.com/
'''
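
For context, the scheduling pattern used in this file boils down to the short standalone example below; the 01:17 run time mirrors the diff, while the task body is a placeholder for sp(sites=urls).

import time
import schedule

def task():
    print("running scheduled crawl")  # placeholder for sp(sites=urls)

schedule.every().day.at("01:17").do(task)  # run once per day at 01:17

while True:
    schedule.run_pending()
    time.sleep(60)  # poll the scheduler once a minute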

View File

@ -1,7 +1,7 @@
# Wrapper around the Aliyun DashScope API
# Non-streaming interface
# For compatibility, both input and output use the message format, consistent with the OpenAI SDK
import time
from http import HTTPStatus
import dashscope
import random
@@ -62,7 +62,6 @@ def dashscope_llm(messages: list,
if __name__ == '__main__':
import time
from pprint import pprint
# logging.basicConfig(level=logging.DEBUG)
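
As the header comments describe, this wrapper makes non-streaming DashScope calls with OpenAI-style message input and output. A minimal sketch of such a call is shown below; the model name is illustrative and a DASHSCOPE_API_KEY environment variable is assumed to be set.

from http import HTTPStatus
import dashscope

messages = [{'role': 'user', 'content': 'hello'}]
response = dashscope.Generation.call(model='qwen-turbo',       # illustrative model name
                                     messages=messages,
                                     result_format='message')  # OpenAI-compatible output shape
if response.status_code == HTTPStatus.OK:
    print(response.output.choices[0]['message']['content'])
else:
    print(f"request failed: {response.code} {response.message}")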

View File

@@ -9,6 +9,7 @@ from bs4.element import Comment
from llms.dashscope_wrapper import dashscope_llm
from datetime import datetime, date
from requests.compat import urljoin
import chardet
header = {
@@ -35,6 +36,9 @@ def text_from_soup(soup: BeautifulSoup) -> str:
def parse_html_content(out: str) -> dict:
# The LLM output sometimes has \n or \t outside the quotes around keys or values; to be safe, strip them all, since the later LLM analysis does not look at line breaks in the content anyway
pattern = re.compile(r'\"\"\"(.*?)\"\"\"', re.DOTALL)
result = pattern.findall(out)
out = result[0]
dict_str = out.strip("```").strip("python").strip("json").strip()
dict_str = dict_str.replace("\n", "").replace("\t", "")
# First use a regex to pull out the content inside the {}
@@ -46,19 +50,24 @@ def parse_html_content(out: str) -> dict:
if date_str:
dct['publish_time'] = date_str[0].replace("-", "")
else:
date_str = re.findall(r"\d{4}\d{2}\d{2}", dct['publish_time'])
date_str = re.findall(r"\d{4}\.\d{2}\.\d{2}", dct['publish_time'])
if date_str:
dct['publish_time'] = date_str[0]
dct['publish_time'] = date_str[0].replace(".", "")
else:
dct['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")
date_str = re.findall(r"\d{4}\d{2}\d{2}", dct['publish_time'])
if date_str:
dct['publish_time'] = date_str[0]
else:
dct['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")
return dct
sys_info = """给你一段从网页html文件中提取的所有文本请尝试输出其标题、摘要、内容和发布日期。
sys_info = '''你是一个html网页解析器你将接收一段用户从网页html文件中提取的文本请解析出其标题、摘要、内容和发布日期。
发布日期的格式为XXXX-XX-XX如果找不到则为空内容不要包含标题作者和发布日期
输出格式为Python字典key分别为titleabstractcontent和publish_time
如果你识别出给定文本大部分既不是中文也不是英文那么请输出无法解析
请务必按照Python字典的格式输出key和value使用双引号包裹key分别为titleabstractcontent和publish_time输出结果请整体用三引号包裹如下所示
"""
{"title": "解析出的标题", "abstract": "解析出的摘要", "content": "解析出的内容", "publish_time": "解析出的发布日期XXXX-XX-XX"}
"""'''
def llm_crawler(url: str | Path, logger=None) -> (int, dict):
@@ -85,13 +94,38 @@ def llm_crawler(url: str | Path, logger=None) -> (int, dict):
else:
print(f"cannot connect {url}")
return -7, {}
# todo: both llm_crawler and simple_crawler need a check here to filter out pages that are garbled or contain very little text
rawdata = response.content
encoding = chardet.detect(rawdata)['encoding']
if encoding is not None and encoding.lower() == 'utf-8':
try:
text = rawdata.decode(encoding)
except:
if logger:
logger.error(f"{url} decode error, aborting")
else:
print(f"{url} decode error, aborting")
return 0, {}
else:
if logger:
logger.error(f"{url} undetected coding, aborting")
else:
print(f"{url} undetected coding, aborting")
return 0, {}
# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(response.text, "html.parser")
soup = BeautifulSoup(text, "html.parser")
html_text = text_from_soup(soup)
html_lines = html_text.split('\n')
html_lines = [line.strip() for line in html_lines if line.strip()]
html_text = "\n".join(html_lines)
if len(html_text) > 29999:
if logger:
logger.warning(f"{url} content too long for llm parsing")
else:
print(f"{url} content too long for llm parsing")
return 0, {}
if not html_text or html_text.startswith('服务器错误') or html_text.startswith('您访问的页面') or html_text.startswith('403'):
if logger:
logger.warning(f"can not get {url} from the Internet")
@@ -114,10 +148,14 @@ def llm_crawler(url: str | Path, logger=None) -> (int, dict):
print(msg)
return 0, {}
info["url"] = str(url)
if not info["title"] or not info["content"]:
if len(info['title']) < 5 or len(info['content']) < 24:
if logger:
logger.warning(f"{info} not valid")
else:
print(f"{info} not valid")
return 0, {}
info["url"] = str(url)
# Extract image links; leave empty if none can be found
image_links = []
images = soup.find_all("img")
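
Taken on its own, the new pre-filter in llm_crawler (charset detection, UTF-8-only decoding, and a length cap before LLM parsing) looks roughly like the sketch below. fetch_utf8_text is an illustrative name, soup.get_text stands in for the project's text_from_soup helper, and the timeout value is an assumption.

import chardet
import requests
from bs4 import BeautifulSoup

def fetch_utf8_text(url: str) -> str | None:
    # Returns cleaned page text, or None when the page should be skipped.
    response = requests.get(url, timeout=30)
    rawdata = response.content
    encoding = chardet.detect(rawdata)['encoding']
    if encoding is None or encoding.lower() != 'utf-8':
        return None  # undetected or non-UTF-8 encoding: abort, as in the diff
    try:
        text = rawdata.decode(encoding)
    except UnicodeDecodeError:
        return None  # decode error: abort
    soup = BeautifulSoup(text, "html.parser")
    lines = [line.strip() for line in soup.get_text("\n").split("\n") if line.strip()]
    cleaned = "\n".join(lines)
    return None if len(cleaned) > 29999 else cleaned  # too long for LLM parsing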

View File

@@ -4,6 +4,7 @@ from bs4 import BeautifulSoup
from datetime import datetime
from pathlib import Path
import re
import chardet
extractor = GeneralNewsExtractor()
@@ -31,15 +32,39 @@ def simple_crawler(url: str | Path, logger=None) -> (int, dict):
print(f"cannot connect {url}")
return -7, {}
text = response.text
rawdata = response.content
encoding = chardet.detect(rawdata)['encoding']
if encoding is not None and encoding.lower() == 'utf-8':
try:
text = rawdata.decode(encoding)
except:
if logger:
logger.error(f"{url} decode error, aborting")
else:
print(f"{url} decode error, aborting")
return 0, {}
else:
if logger:
logger.error(f"{url} undetected coding, aborting")
else:
print(f"{url} undetected coding, aborting")
return 0, {}
result = extractor.extract(text)
if not result or not result['title'] or not result['content']:
if not result:
if logger:
logger.error(f"gne cannot extract {url}")
else:
print(f"gne cannot extract {url}")
return 0, {}
if len(result['title']) < 5 or len(result['content']) < 24:
if logger:
logger.warning(f"{result} not valid")
else:
print(f"{result} not valid")
return 0, {}
if result['title'].startswith('服务器错误') or result['title'].startswith('您访问的页面') or result['title'].startswith('403'):
if logger:
logger.warning(f"can not get {url} from the Internet")
@@ -51,11 +76,15 @@ def simple_crawler(url: str | Path, logger=None) -> (int, dict):
if date_str:
result['publish_time'] = date_str[0].replace("-", "")
else:
date_str = re.findall(r"\d{4}\d{2}\d{2}", result['publish_time'])
date_str = re.findall(r"\d{4}\.\d{2}\.\d{2}", result['publish_time'])
if date_str:
result['publish_time'] = date_str[0]
result['publish_time'] = date_str[0].replace(".", "")
else:
result['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")
date_str = re.findall(r"\d{4}\d{2}\d{2}", result['publish_time'])
if date_str:
result['publish_time'] = date_str[0]
else:
result['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")
soup = BeautifulSoup(text, "html.parser")
try:
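
The same publish_time cascade appears in both llm_crawler and simple_crawler: try YYYY-MM-DD, then YYYY.MM.DD, then a bare eight-digit date, and finally fall back to today. In isolation it can be written as the sketch below (the hyphenated first pattern is inferred from the surrounding context; normalize_publish_time is an illustrative name, not a function in the repository).

import re
from datetime import datetime

def normalize_publish_time(raw: str) -> str:
    # Normalize whatever date string the extractor produced to YYYYMMDD.
    for pattern, sep in ((r"\d{4}-\d{2}-\d{2}", "-"),
                         (r"\d{4}\.\d{2}\.\d{2}", "."),
                         (r"\d{4}\d{2}\d{2}", "")):
        found = re.findall(pattern, raw)
        if found:
            return found[0].replace(sep, "") if sep else found[0]
    return datetime.strftime(datetime.today(), "%Y%m%d")

# e.g. normalize_publish_time("posted on 2024.04.08") returns "20240408"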

View File

@@ -76,8 +76,6 @@ class ServiceProcesser:
for value in new_articles:
if not value:
continue
if not value['title'] or not value['content']:
continue
from_site = urlparse(value['url']).netloc
from_site = from_site.replace('www.', '')
from_site = from_site.split('.')[0]
@@ -137,7 +135,10 @@ class ServiceProcesser:
self.logger.info('generate_insights-warning: no insights and no more than 25 articles so use article title as insights')
for key, value in cache.items():
if value['title']:
text_for_insight = text_translate([value['title']], logger=self.logger)
if is_chinese(value['title']):
text_for_insight = value['title']
else:
text_for_insight = text_translate([value['title']], logger=self.logger)
if text_for_insight:
insight_id = self.pb.add(collection_name='insights',
body={'content': text_for_insight[0], 'articles': [key]})
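
The new branch above skips translation when the article title is already Chinese. The is_chinese helper itself is not part of this diff; a plausible minimal heuristic for such a check might look like the following (an assumption, not the project's implementation).

def is_chinese(text: str) -> bool:
    # Assumed heuristic: call the string Chinese when more than half of its
    # non-space characters fall in the CJK Unified Ideographs block.
    chars = [ch for ch in text if not ch.isspace()]
    if not chars:
        return False
    han = sum(1 for ch in chars if '\u4e00' <= ch <= '\u9fff')
    return han / len(chars) > 0.5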

View File

@@ -9,6 +9,6 @@ bad_sample = 黑客组织活动最近频发
report_type = 网络安全情报
[sites]
site1 = https://www.welivesecurity.com/en/
site2 = https://www.scmagazine.com/
site3 = https://business.sohu.com/
site3 = https://www.hackread.com/
site2 = http://sh.people.com.cn/
site1 = https://www.xuexi.cn/
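
The [sites] keys above are plain INI entries whose values are the crawler's start URLs. Assuming the project loads them with Python's standard configparser (an assumption about the consuming code; the filename is illustrative), collecting the URLs would look roughly like this:

from configparser import ConfigParser

config = ConfigParser()
config.read('config.ini', encoding='utf-8')
# every site1/site2/... key under [sites] maps to one start URL
urls = [url for _key, url in config.items('sites')] if config.has_section('sites') else []
print(urls)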