mirror of https://github.com/TeamWiseFlow/wiseflow.git
synced 2025-02-02 18:28:46 +08:00

Commit cf709d7f05: update manual

asset/wiseflow_arch.png (new binary file, 64 KiB; binary content not shown)
@@ -11,4 +11,4 @@ docker-compose.yaml
 Dockerfile
 README.md
 backend/__pycache__
-backend/AWtest
+backend/WStest
client/.gitignore (vendored, 2 lines changed)
@@ -1,4 +1,4 @@
 .env
 .venv/
 pb/pb_data/
-backend/AWtest/
+backend/WStest/
@@ -18,16 +18,21 @@
 - character: the persona the LLM should take when mining leads (this determines its focus and standpoint)
 - focus: which aspects of leads to pay attention to
 - focus_type: the type of leads
-- good_samples: the lead-description pattern you want the LLM to follow (give two samples)
+- good_samples1: the lead-description pattern you want the LLM to follow (give two samples)
+- good_samples2: the lead-description pattern you want the LLM to follow (give two samples)
 - bad_samples: lead-description patterns to avoid
 - report_type: the report type

-- [sites]: list your sources under this section, one URL per line.
+### 4. Edit the sites.txt file
+
+This file specifies the sources to be monitored locally, one URL per line. It can be changed at any time; the latest version is read before each task run.
+
 If you only crawl sources that already have dedicated scrapers configured, you can edit scraper_map in scrapers/__init__.py directly and leave this file empty.

 See backend/scrapers/README.md for details on dedicated scrapers.

+**Note: although the wiseflow client ships with a general-purpose crawler that can crawl and parse static news pages reasonably well, we strongly recommend using our data subscription service or writing a dedicated scraper yourself.**
+
 ## Reference: registration addresses for each service

 - Alibaba DashScope (Lingji) LLM API: https://dashscope.aliyun.com/
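The hunk above tells users to register dedicated scrapers in scraper_map inside scrapers/__init__.py, or to leave it empty. As a rough sketch only (the actual contents of scrapers/__init__.py are not part of this commit, and the function and URL below are made-up placeholders), the mapping presumably looks like this:

# scrapers/__init__.py, hypothetical sketch, not from this commit

def example_site_scraper(expiration, existings):
    # Placeholder for a dedicated scraper you write yourself; see backend/scrapers/README.md.
    return []

# Map a source URL to its dedicated scraper; leave the dict empty to rely on the general crawler.
scraper_map = {
    "https://example-news-site.com/": example_site_scraper,
}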
@@ -4,22 +4,13 @@
 import schedule
 import time
 from work_process import ServiceProcesser
-import configparser
-
-config = configparser.ConfigParser()
-config.read('../config.ini')
-
-if config.has_section('sites'):
-    web_pages = config['sites']
-    urls = [value for key, value in web_pages.items()]
-else:
-    urls = []

 sp = ServiceProcesser()
-sp(sites=urls)
-
-'''
+
 def task():
+    with open('../sites.txt', 'r', encoding='utf-8') as f:
+        urls = [line.strip() for line in f.readlines() if line.strip()]
     sp(sites=urls)

@@ -29,6 +20,3 @@ schedule.every().day.at("01:17").do(task)
 while True:
     schedule.run_pending()
     time.sleep(60)
-site1 = https://www.welivesecurity.com/en/
-site2 = https://www.scmagazine.com/
-'''
@@ -125,6 +125,25 @@ def is_chinese(string):
     return (non_chinese_count/len(string)) < 0.68


+def extract_and_convert_dates(input_string):
+    # regular expressions for the supported date formats
+    patterns = [
+        r'(\d{4})-(\d{2})-(\d{2})',    # YYYY-MM-DD
+        r'(\d{4})/(\d{2})/(\d{2})',    # YYYY/MM/DD
+        r'(\d{4})\.(\d{2})\.(\d{2})',  # YYYY.MM.DD
+        r'(\d{4})\\(\d{2})\\(\d{2})',  # YYYY\MM\DD
+        r'(\d{4})(\d{2})(\d{2})'       # YYYYMMDD
+    ]
+
+    matches = []
+    for pattern in patterns:
+        matches = re.findall(pattern, input_string)
+        if matches:
+            break
+    if matches:
+        return ''.join(matches[0])
+    return None
+
 """
 # from InternLM/huixiangdou
 # another awesome work
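A quick usage sketch of the helper added above (the sample strings are made up; the results follow from the patterns as written, which normalize any recognized date to YYYYMMDD and otherwise return None):

from general_utils import extract_and_convert_dates  # import path as used elsewhere in this commit

print(extract_and_convert_dates("published 2024-03-01 10:00"))  # -> '20240301'
print(extract_and_convert_dates("updated 2024/03/01"))          # -> '20240301'
print(extract_and_convert_dates("no date here"))                # -> None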
@@ -85,7 +85,7 @@ def search_insight(keyword: str, exist_urls: list[Union[str, Path]], knowledge:
         if url in exist_urls:
             continue
         exist_urls.append(url)
-        flag, value = simple_crawler(url, logger)
+        flag, value = simple_crawler(url)
         if flag != 11:
             continue
         from_site = urlparse(url).netloc
@@ -15,7 +15,6 @@
 Input:
 - expiration: a datetime date.date() object; the scraper should only fetch articles published on or after this date
 - existings: [str], a list of article URLs already in the database; the scraper should ignore URLs in this list
-- logger: the logger object of the main process; if the scraper needs its own logger, it can ignore this one

 Output:
 - [dict], a list of results, one dict per article, in the following format:
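To make the contract above concrete, here is a minimal sketch of a dedicated scraper conforming to it. This is an illustration only: the function name, the hard-coded sample item, and the exact result-dict keys (url, title, abstract, content, publish_time) are assumptions inferred from the crawlers elsewhere in this commit, not part of the README itself.

from datetime import date

def example_site_scraper(expiration: date, existings: list[str]) -> list[dict]:
    # Stand-in for the site-specific fetching and parsing a real scraper would do.
    raw_items = [
        {'url': 'https://example.com/post-1', 'title': 'Sample article',
         'content': 'Full article text ...', 'publish_date': date(2024, 3, 1)},
    ]
    results = []
    for item in raw_items:
        if item['url'] in existings:           # ignore URLs already in the database
            continue
        if item['publish_date'] < expiration:  # only keep articles on or after the expiration date
            continue
        results.append({
            'url': item['url'],
            'title': item['title'],
            'abstract': item['content'][:100],
            'content': item['content'],
            'publish_time': item['publish_date'].strftime('%Y%m%d'),
        })
    return results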
@@ -2,18 +2,22 @@ from pathlib import Path
 from urllib.parse import urlparse
 import re
 from .simple_crawler import simple_crawler
-import json
-import requests
+import httpx
 from bs4 import BeautifulSoup
 from bs4.element import Comment
 from llms.dashscope_wrapper import dashscope_llm
 from datetime import datetime, date
 from requests.compat import urljoin
 import chardet
+from general_utils import extract_and_convert_dates
+from get_logger import get_logger
+import os


 header = {
     'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}
+
+project_dir = os.environ.get("PROJECT_DIR", "")
+logger = get_logger(name='general_scraper', file=os.path.join(project_dir, f'general_scraper.log'))


 def tag_visible(element: Comment) -> bool:
@@ -35,83 +39,56 @@ def text_from_soup(soup: BeautifulSoup) -> str:


 def parse_html_content(out: str) -> dict:
-    # the llm sometimes emits \n or \t outside the quotes of keys or values; strip them all to be safe, since later analysis ignores line breaks in the content anyway
+    dct = {'title': '', 'abstract': '', 'content': '', 'publish_time': ''}
     pattern = re.compile(r'\"\"\"(.*?)\"\"\"', re.DOTALL)
     result = pattern.findall(out)
-    out = result[0]
-    dict_str = out.strip("```").strip("python").strip("json").strip()
-    dict_str = dict_str.replace("\n", "").replace("\t", "")
-    # first pull out the content between {} with a regex
-    dict_str = re.findall(r'{(.*?)}', dict_str)
-    # dict_str = dict_str[0].replace("'", '"')  # too aggressive, can corrupt the content
-    # json.loads requires double quotes, and \n etc. must be escaped
-    dct = json.loads('{' + dict_str[0] + '}')
-    date_str = re.findall(r"\d{4}-\d{2}-\d{2}", dct['publish_time'])
-    if date_str:
-        dct['publish_time'] = date_str[0].replace("-", "")
-    else:
-        date_str = re.findall(r"\d{4}\.\d{2}\.\d{2}", dct['publish_time'])
-        if date_str:
-            dct['publish_time'] = date_str[0].replace(".", "")
-        else:
-            date_str = re.findall(r"\d{4}\d{2}\d{2}", dct['publish_time'])
-            if date_str:
-                dct['publish_time'] = date_str[0]
-            else:
-                dct['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")
+    result = result[0].strip()
+    dict_strs = result.split('||')
+    if not dict_strs:
+        dict_strs = result.split('|||')
+        if not dict_strs:
+            return dct
+    if len(dict_strs) == 3:
+        dct['title'] = dict_strs[0].strip()
+        dct['content'] = dict_strs[1].strip()
+    elif len(dict_strs) == 4:
+        dct['title'] = dict_strs[0].strip()
+        dct['content'] = dict_strs[2].strip()
+        dct['abstract'] = dict_strs[1].strip()
+    else:
+        return dct
+    date_str = extract_and_convert_dates(dict_strs[-1])
+    if date_str:
+        dct['publish_time'] = date_str
+    else:
+        dct['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")
     return dct


-sys_info = '''You are an HTML page parser. You will receive text extracted from an HTML page; parse out its title, abstract, content and publish date.
-The publish date must be in the format XXXX-XX-XX, or empty if none is found. The content must not include the title, author or publish date.
-You must output a Python dict with keys and values wrapped in double quotes; the keys are title, abstract, content and publish_time. Wrap the whole output in triple quotes, like this:
+# qwen1.5-72b is too error-prone when asked to produce JSON: web pages vary too much, e.g. they often contain plain English double quotes, which then breaks json.loads...
+sys_info = '''You are an HTML page parser. You will receive text extracted from an HTML page; parse out its title, abstract, content and publish date, with the publish date in the format YYYY-MM-DD.
+Return the result in the following format (wrap the whole output in triple quotes):
 """
-{"title": "parsed title", "abstract": "parsed abstract", "content": "parsed content", "publish_time": "parsed publish date XXXX-XX-XX"}
-"""'''
+title||abstract||content||publish date XXXX-XX-XX
+"""
+'''


-def llm_crawler(url: str | Path, logger=None) -> (int, dict):
+def llm_crawler(url: str | Path) -> (int, dict):
     """
     Returns the article-info dict and a flag: negative means error, 0 means no result, 11 means success.
     Reference: https://mp.weixin.qq.com/s/4J-kofsfFDiV1FxGlTJLfA
-    Test URLs:
-    url = "https://so.html5.qq.com/page/real/search_news?docid=70000021_40665eb6afe80152"
-    url = "https://mp.weixin.qq.com/s?src=11&timestamp=1709999167&ver=5128&signature=e0Tssc4COc*p-RkKaPwUMrGePUxko8N621VxARnI8uKDg*l5C7Z8gBC6RDUAnyGqvmzJ5WEzvaO-T7GvMRw9LwNaJS3Hh2tyaITdmsaVtY9JsSmsidX6u4SqxirGsRdo&new=1"
     """
     # send an HTTP request to fetch the page content
     try:
-        response = requests.get(url, timeout=60)
-    except:
-        if logger:
-            logger.error(f"cannot connect {url}")
-        else:
-            print(f"cannot connect {url}")
-        return -7, {}
-
-    if response.status_code != 200:
-        if logger:
-            logger.error(f"cannot connect {url}")
-        else:
-            print(f"cannot connect {url}")
-        return -7, {}
-
-    rawdata = response.content
-    encoding = chardet.detect(rawdata)['encoding']
-    if encoding is not None and encoding.lower() == 'utf-8':
-        try:
-            text = rawdata.decode(encoding)
-        except:
-            if logger:
-                logger.error(f"{url} decode error, aborting")
-            else:
-                print(f"{url} decode error, aborting")
-            return 0, {}
-    else:
-        if logger:
-            logger.error(f"{url} undetected coding, aborting")
-        else:
-            print(f"{url} undetected coding, aborting")
-        return 0, {}
+        with httpx.Client() as client:
+            response = client.get(url, headers=header, timeout=30)
+        rawdata = response.content
+        encoding = chardet.detect(rawdata)['encoding']
+        text = rawdata.decode(encoding)
+    except Exception as e:
+        logger.error(e)
+        return -7, {}

     # parse the HTML content with BeautifulSoup
     soup = BeautifulSoup(text, "html.parser")
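The hunk above switches parse_html_content from JSON parsing to a '||'-delimited format. A small usage sketch, assuming parse_html_content from the hunk above is in scope; the input string is a made-up example of what the LLM is now prompted to return:

llm_output = '"""Sample article title||A one-line abstract||The full article text ...||2024-03-01"""'
print(parse_html_content(llm_output))
# -> {'title': 'Sample article title', 'abstract': 'A one-line abstract',
#     'content': 'The full article text ...', 'publish_time': '20240301'}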
@@ -120,17 +97,12 @@ def llm_crawler(url: str | Path, logger=None) -> (int, dict):
     html_lines = [line.strip() for line in html_lines if line.strip()]
     html_text = "\n".join(html_lines)
     if len(html_text) > 29999:
-        if logger:
-            logger.warning(f"{url} content too long for llm parsing")
-        else:
-            print(f"{url} content too long for llm parsing")
+        logger.warning(f"{url} content too long for llm parsing")
         return 0, {}

-    if not html_text or html_text.startswith('服务器错误') or html_text.startswith('您访问的页面') or html_text.startswith('403'):
-        if logger:
-            logger.warning(f"can not get {url} from the Internet")
-        else:
-            print(f"can not get {url} from the Internet")
+    if not html_text or html_text.startswith('服务器错误') or html_text.startswith('您访问的页面') or html_text.startswith('403')\
+            or html_text.startswith('出错了'):
+        logger.warning(f"can not get {url} from the Internet")
         return -7, {}

     messages = [
@@ -140,19 +112,13 @@ def llm_crawler(url: str | Path, logger=None) -> (int, dict):
     llm_output = dashscope_llm(messages, "qwen1.5-72b-chat", logger=logger)
     try:
         info = parse_html_content(llm_output)
-    except Exception as e:
+    except Exception:
         msg = f"can not parse {llm_output}"
-        if logger:
-            logger.warning(msg)
-        else:
-            print(msg)
+        logger.debug(msg)
         return 0, {}

-    if len(info['title']) < 5 or len(info['content']) < 24:
-        if logger:
-            logger.warning(f"{info} not valid")
-        else:
-            print(f"{info} not valid")
+    if len(info['title']) < 4 or len(info['content']) < 24:
+        logger.debug(f"{info} not valid")
         return 0, {}

     info["url"] = str(url)
@@ -185,21 +151,12 @@ def llm_crawler(url: str | Path, logger=None) -> (int, dict):
     return 11, info


-def general_scraper(site: str, expiration: date, existing: list[str], logger=None) -> list[dict]:
+def general_scraper(site: str, expiration: date, existing: list[str]) -> list[dict]:
     try:
-        response = requests.get(site, header, timeout=60)
-    except:
-        if logger:
-            logger.error(f"cannot connect {site}")
-        else:
-            print(f"cannot connect {site}")
-        return []
-
-    if response.status_code != 200:
-        if logger:
-            logger.error(f"cannot connect {site}")
-        else:
-            print(f"cannot connect {site}")
+        with httpx.Client() as client:
+            response = client.get(site, headers=header, timeout=30)
+    except Exception as e:
+        logger.error(e)
         return []

     page_source = response.text
@@ -209,25 +166,18 @@ def general_scraper(site: str, expiration: date, existing: list[str], logger=None) -> list[dict]:
     base_url = parsed_url.scheme + '://' + parsed_url.netloc
     urls = [urljoin(base_url, link["href"]) for link in soup.find_all("a", href=True)]
     if not urls:
-        if logger:
-            logger.warning(f"can not find any link from {site}, maybe it's an article site...")
+        logger.warning(f"can not find any link from {site}, maybe it's an article site...")
         if site in existing:
-            if logger:
-                logger.warning(f"{site} has been crawled before, skip it")
-            else:
-                print(f"{site} has been crawled before, skip it")
+            logger.debug(f"{site} has been crawled before, skip it")
             return []
-        flag, result = simple_crawler(site, logger=logger)
+        flag, result = simple_crawler(site)
         if flag != 11:
-            flag, result = llm_crawler(site, logger=logger)
+            flag, result = llm_crawler(site)
             if flag != 11:
                 return []
         publish_date = datetime.strptime(result['publish_time'], '%Y%m%d')
         if publish_date.date() < expiration:
-            if logger:
-                logger.warning(f"{site} is too old, skip it")
-            else:
-                print(f"{site} is too old, skip it")
+            logger.debug(f"{site} is too old, skip it")
             return []
         else:
             return [result]
@@ -235,23 +185,17 @@ def general_scraper(site: str, expiration: date, existing: list[str], logger=None) -> list[dict]:
     articles = []
     for url in urls:
         if url in existing:
-            if logger:
-                logger.warning(f"{url} has been crawled before, skip it")
-            else:
-                print(f"{url} has been crawled before, skip it")
+            logger.debug(f"{url} has been crawled before, skip it")
             continue
         existing.append(url)
-        flag, result = simple_crawler(url, logger=logger)
+        flag, result = simple_crawler(url)
         if flag != 11:
-            flag, result = llm_crawler(url, logger=logger)
+            flag, result = llm_crawler(url)
             if flag != 11:
                 continue
         publish_date = datetime.strptime(result['publish_time'], '%Y%m%d')
         if publish_date.date() < expiration:
-            if logger:
-                logger.warning(f"{url} is too old, skip it")
-            else:
-                print(f"{url} is too old, skip it")
+            logger.debug(f"{url} is too old, skip it")
         else:
             articles.append(result)
@@ -1,88 +1,52 @@
 from gne import GeneralNewsExtractor
-import requests
+import httpx
 from bs4 import BeautifulSoup
 from datetime import datetime
 from pathlib import Path
-import re
+from general_utils import extract_and_convert_dates
 import chardet
+from get_logger import get_logger
+import os

 extractor = GeneralNewsExtractor()
 header = {
     'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}

+project_dir = os.environ.get("PROJECT_DIR", "")
+logger = get_logger(name='simple_crawler', file=os.path.join(project_dir, f'simple_crawler.log'))

-def simple_crawler(url: str | Path, logger=None) -> (int, dict):
+
+def simple_crawler(url: str | Path) -> (int, dict):
     """
     Returns the article-info dict and a flag: negative means error, 0 means no result, 11 means success.
     """
     try:
-        response = requests.get(url, header, timeout=60)
-    except:
-        if logger:
-            logger.error(f"cannot connect {url}")
-        else:
-            print(f"cannot connect {url}")
-        return -7, {}
-
-    if response.status_code != 200:
-        if logger:
-            logger.error(f"cannot connect {url}")
-        else:
-            print(f"cannot connect {url}")
-        return -7, {}
-
-    rawdata = response.content
-    encoding = chardet.detect(rawdata)['encoding']
-    if encoding is not None and encoding.lower() == 'utf-8':
-        try:
-            text = rawdata.decode(encoding)
-        except:
-            if logger:
-                logger.error(f"{url} decode error, aborting")
-            else:
-                print(f"{url} decode error, aborting")
-            return 0, {}
-    else:
-        if logger:
-            logger.error(f"{url} undetected coding, aborting")
-        else:
-            print(f"{url} undetected coding, aborting")
-        return 0, {}
+        with httpx.Client() as client:
+            response = client.get(url, headers=header, timeout=30)
+        rawdata = response.content
+        encoding = chardet.detect(rawdata)['encoding']
+        text = rawdata.decode(encoding)
+    except Exception as e:
+        logger.error(e)
+        return -7, {}

     result = extractor.extract(text)
     if not result:
-        if logger:
-            logger.error(f"gne cannot extract {url}")
-        else:
-            print(f"gne cannot extract {url}")
+        logger.error(f"gne cannot extract {url}")
         return 0, {}

-    if len(result['title']) < 5 or len(result['content']) < 24:
-        if logger:
-            logger.warning(f"{result} not valid")
-        else:
-            print(f"{result} not valid")
+    if len(result['title']) < 4 or len(result['content']) < 24:
+        logger.info(f"{result} not valid")
         return 0, {}

-    if result['title'].startswith('服务器错误') or result['title'].startswith('您访问的页面') or result['title'].startswith('403'):
-        if logger:
-            logger.warning(f"can not get {url} from the Internet")
-        else:
-            print(f"can not get {url} from the Internet")
+    if result['title'].startswith('服务器错误') or result['title'].startswith('您访问的页面') or result['title'].startswith('403')\
+            or result['content'].startswith('This website uses cookies') or result['title'].startswith('出错了'):
+        logger.warning(f"can not get {url} from the Internet")
         return -7, {}

-    date_str = re.findall(r"\d{4}-\d{2}-\d{2}", result['publish_time'])
+    date_str = extract_and_convert_dates(result['publish_time'])
     if date_str:
-        result['publish_time'] = date_str[0].replace("-", "")
+        result['publish_time'] = date_str
     else:
-        date_str = re.findall(r"\d{4}\.\d{2}\.\d{2}", result['publish_time'])
-        if date_str:
-            result['publish_time'] = date_str[0].replace(".", "")
-        else:
-            date_str = re.findall(r"\d{4}\d{2}\d{2}", result['publish_time'])
-            if date_str:
-                result['publish_time'] = date_str[0]
-            else:
-                result['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")
+        result['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")

@@ -93,7 +57,7 @@ def simple_crawler(url: str | Path, logger=None) -> (int, dict):
             result['abstract'] = meta_description["content"]
         else:
             result['abstract'] = ''
-    except:
+    except Exception:
         result['abstract'] = ''

     result['url'] = str(url)
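A short usage sketch of the flag contract documented in the docstring above (the URL is a placeholder; flag 11 means success, 0 means no result, negative means an error):

flag, article = simple_crawler("https://example.com/some-post")
if flag == 11:
    print(article['title'], article['publish_time'])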
@@ -59,13 +59,13 @@ class ServiceProcesser:
         # Build the list of sources to scan: default to everything in scraper_map if none are specified; a specified source may also be absent from scraper_map, in which case the general scraper should be used
         sources = sites if sites else list(scraper_map.keys())
         new_articles = []
-        with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
+        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
             futures = []
             for site in sources:
                 if site in scraper_map:
-                    futures.append(executor.submit(scraper_map[site], expiration, existings, self.logger))
+                    futures.append(executor.submit(scraper_map[site], expiration, existings))
                 else:
-                    futures.append(executor.submit(general_scraper, site, expiration, existings, self.logger))
+                    futures.append(executor.submit(general_scraper, site, expiration, existings))
             concurrent.futures.wait(futures)
             for future in futures:
                 try:
|
@ -7,8 +7,3 @@ good_sample1 = 黑客组织Rhysida声称已入侵中国国有能源公司
|
|||||||
good_sample2 = 差不多一百万份包含未成年人数据(包括家庭地址和照片)的文件对互联网上的任何人都开放,对孩子构成威胁
|
good_sample2 = 差不多一百万份包含未成年人数据(包括家庭地址和照片)的文件对互联网上的任何人都开放,对孩子构成威胁
|
||||||
bad_sample = 黑客组织活动最近频发
|
bad_sample = 黑客组织活动最近频发
|
||||||
report_type = 网络安全情报
|
report_type = 网络安全情报
|
||||||
|
|
||||||
[sites]
|
|
||||||
site3 = https://www.hackread.com/
|
|
||||||
site2 = http://sh.people.com.cn/
|
|
||||||
site1 = https://www.xuexi.cn/
|
|
client/sites.txt (new file, 5 lines)
@@ -0,0 +1,5 @@
+https://www.hackread.com/
+http://sh.people.com.cn/
+https://www.xuexi.cn/
+https://www.defensenews.com/
+https://www.meritalk.com
@@ -1 +1 @@
-v0.2.0
+v0.2.1