Mirror of https://github.com/TeamWiseFlow/wiseflow.git (synced 2025-01-23 02:20:20 +08:00)

Commit fd633ae54d ("web dashboard")
Parent: 6805be7d14
4  .gitignore (vendored)
@@ -4,3 +4,7 @@
.DS_Store
.idea/
__pycache__
.env
.venv/
core/pb/pb_data/
core/WStest/

Binary file not shown (before: 64 KiB).
4  client/.gitignore (vendored)
@@ -1,4 +0,0 @@
.env
.venv/
pb/pb_data/
backend/WStest/
@@ -1,19 +0,0 @@
FROM python:3.10-slim

RUN apt-get update && \
    apt-get install -yq tzdata build-essential

RUN ln -fs /usr/share/zoneinfo/Asia/Shanghai /etc/localtime

WORKDIR /app

COPY backend/requirements.txt requirements.txt
RUN pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple

COPY backend .

EXPOSE 7777

CMD tail -f /dev/null

# ENTRYPOINT ["bash", "docker_entrypoint.sh"]
@@ -1,35 +0,0 @@
FROM node:20-slim as builder

WORKDIR /app

COPY web ./
RUN npm install -g pnpm
RUN pnpm install
RUN pnpm build


FROM alpine:latest

ARG PB_VERSION=0.21.1

RUN apk add --no-cache unzip ca-certificates tzdata && \
    ln -fs /usr/share/zoneinfo/Asia/Shanghai /etc/localtime


# download and unzip PocketBase
ADD https://github.com/pocketbase/pocketbase/releases/download/v${PB_VERSION}/pocketbase_${PB_VERSION}_linux_amd64.zip /tmp/pb.zip
RUN unzip /tmp/pb.zip -d /pb/

RUN mkdir -p /pb

COPY ./pb/pb_migrations /pb/pb_migrations
COPY ./pb/pb_hooks /pb/pb_hooks
COPY --from=builder /app/dist /pb/pb_public

WORKDIR /pb

EXPOSE 8090

CMD tail -f /dev/null

# CMD ["/pb/pocketbase", "serve", "--http=0.0.0.0:8090"]
@@ -1,86 +0,0 @@
# WiseFlow Client Backend

# For developers

## Deployment

1. Create a fresh environment if possible; **Python 3.10** is required.

2. Install the dependencies from requirements.txt.

## Starting the database on its own (download the PocketBase binary for your platform first, or build and start the web Docker image separately)

PocketBase [download page](https://pocketbase.io/docs/)

Put the binary in the backend/pb directory, then:

```bash
chmod +x pocketbase
./pocketbase serve
```

Afterwards, set the PocketBase service address as the environment variable PB_API_BASE.

(If you prefer Docker, see the Docker files under the client folder.)

Note: keep the pb_migrations folder under pb in sync with the repo; the database then creates the collections this project needs automatically. If they are out of sync, later steps may fail.

pb_data is the database's data directory. If you change the admin password, remember to update .env as well.
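A quick way to confirm the database is reachable before starting the backend is sketched below. This snippet is not part of the repo; it assumes PB_API_BASE points at a running PocketBase instance and that PocketBase's standard `/api/health` endpoint is available.

```python
import os

import requests

# PB_API_BASE is the PocketBase address configured above, e.g. "http://127.0.0.1:8090"
pb_base = os.environ.get("PB_API_BASE", "http://127.0.0.1:8090")

try:
    # a 200 response from the health endpoint means the service is up
    resp = requests.get(f"{pb_base.rstrip('/')}/api/health", timeout=5)
    print("PocketBase reachable:", resp.status_code == 200)
except requests.RequestException as e:
    print("PocketBase not reachable:", e)
```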
## Scripts

- tasks.sh  # starts the scheduled background tasks (optional if you only debug the backend locally)
- backend.sh  # starts the backend service (defaults to localhost:7777; see http://localhost:7777/docs/ for the API details)

Note: the backend service always returns a dict of the form `{"flag": int, "result": [{"type": "text", "content": "xxx"}]}`.

Agreed flag codes:

| flag | meaning |
|------|---------|
| -11  | LLM error / exception |
| -7   | network request failed |
| -6   | translation API failed |
| -5   | invalid input format |
| -4   | embedding model error |
| -3   | (reserved) |
| -2   | pb database API failed |
| -1   | unknown error |
| 0    | normal return |
| 1    | (reserved) |
| 2    | (reserved) |
| 3    | (reserved) |
| 11   | the user flow finished normally |
| 21   | a new file was generated |

Notes:

1. An HTTP 200 on the request only means the submission was accepted, not that the backend processed it successfully. **Only flag 11 means the flow finished normally** and every computation succeeded.

2. Flag 0 usually means everything ran but produced nothing: no new file and no new data written to the database.

3. The translation endpoint works in batches, so partial success is possible (some translations are written to the database and linked while others fail). Even if you do not receive flag 11, it is worth re-reading the data from pb. A client sketch follows below.
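For illustration, here is a minimal client sketch (not part of the repo) that calls the backend and interprets the flag according to the table above. The endpoint path and payload are placeholders; check http://localhost:7777/docs/ for the actual API.

```python
import requests

# hypothetical endpoint and payload -- see http://localhost:7777/docs/ for the real ones
resp = requests.post("http://localhost:7777/some_endpoint", json={"some": "payload"}, timeout=300)
resp.raise_for_status()  # HTTP 200 only means the request was accepted

data = resp.json()  # {"flag": int, "result": [{"type": "text", "content": "..."}]}

if data["flag"] == 11:
    # the flow finished normally and every computation succeeded
    for item in data["result"]:
        print(item["type"], item["content"])
elif data["flag"] == 0:
    print("ran to completion but produced no new files or database entries")
elif data["flag"] < 0:
    print(f"backend error, flag={data['flag']} (see the table above)")
```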
## Directory structure

```
backend
├── llms                       # LLM wrappers
├── scrapers                   # crawler library
|——   __init__.py              # to add a crawler for a specific site, put the script next to this file and register it in the scraper_map dict
|——   general_scraper.py       # generic web page parser
|——   simple_crawler.py        # fast single-page parser based on gne
|—— __init__.py                # backend entry functions
├── background_task.py         # background task main program; edit this file to define additional background tasks
├── main.py                    # backend service main program (FastAPI)
├── tasks.sh                   # startup script for the background service
|—— backend.sh                 # startup script for the backend service
├── embedding.py               # embedding model service
├── pb_api.py                  # pb database interface
├── general_utils.py           # utility library
├── get_insight.py             # clue analysis and refinement module
├── get_logger.py              # logger configuration
├── get_report.py              # report generation module
├── get_search.py              # search implementation based on Sogou
├── work_process.py            # main flow of the background service (crawling and refining)
├── tranlsation_volcengine.py  # translation module based on the Volcengine API
```
@@ -1,41 +0,0 @@
"""
通过编辑这个脚本,可以自定义需要的后台任务
"""
import schedule
import time
from get_insight import pb, logger
from work_process import ServiceProcesser


sp = ServiceProcesser()
counter = 0


# 每小时唤醒一次,如果pb的sites表中有信源,会挑取符合周期的信源执行,如果没有的话,则每24小时执行专有爬虫一次
def task():
    global counter
    sites = pb.read('sites', filter='activated=True')
    urls = []
    for site in sites:
        if not site['per_hours'] or not site['url']:
            continue
        if counter % site['per_hours'] == 0:
            urls.append(site['url'])
    logger.info(f'\033[0;32m task execute loop {counter}\033[0m')
    logger.info(urls)
    if urls:
        sp(sites=urls)
    else:
        if counter % 24 == 0:
            sp()
        else:
            print('\033[0;33mno work for this loop\033[0m')
    counter += 1


schedule.every().hour.at(":38").do(task)

task()
while True:
    schedule.run_pending()
    time.sleep(60)
@@ -1,5 +0,0 @@
#!/bin/bash
set -o allexport
set +o allexport
exec uvicorn main:app --reload --host 0.0.0.0 --port 7777 &
exec python background_task.py
@@ -1,17 +0,0 @@
from BCEmbedding.tools.langchain import BCERerank
from langchain_community.embeddings import HuggingFaceEmbeddings
import os


embedding_model_name = os.environ.get('EMBEDDING_MODEL_PATH', "")
rerank_model_name = os.environ.get('RERANKER_MODEL_PATH', "")

if not embedding_model_name or not rerank_model_name:
    raise Exception("请设置 EMBEDDING_MODEL_PATH 和 RERANKER_MODEL_PATH")

device = os.environ.get('DEVICE', 'cpu')
embedding_model_kwargs = {'device': device}
embedding_encode_kwargs = {'batch_size': 32, 'normalize_embeddings': True, 'show_progress_bar': False}
reranker_args = {'model': rerank_model_name, 'top_n': 5, 'device': device}
embed_model = HuggingFaceEmbeddings(model_name=embedding_model_name, model_kwargs=embedding_model_kwargs, encode_kwargs=embedding_encode_kwargs)
reranker = BCERerank(**reranker_args)
@ -1,286 +0,0 @@
|
||||
from embeddings import embed_model, reranker
|
||||
from langchain_community.vectorstores import FAISS
|
||||
from langchain_core.documents import Document
|
||||
from langchain_community.vectorstores.utils import DistanceStrategy
|
||||
from langchain.retrievers import ContextualCompressionRetriever
|
||||
from llms.dashscope_wrapper import dashscope_llm
|
||||
from general_utils import isChinesePunctuation, is_chinese
|
||||
from tranlsation_volcengine import text_translate
|
||||
import time
|
||||
import re
|
||||
import os
|
||||
from general_utils import get_logger_level
|
||||
from loguru import logger
|
||||
from pb_api import PbTalker
|
||||
|
||||
project_dir = os.environ.get("PROJECT_DIR", "")
|
||||
os.makedirs(project_dir, exist_ok=True)
|
||||
logger_file = os.path.join(project_dir, 'scanning_task.log')
|
||||
dsw_log = get_logger_level()
|
||||
|
||||
logger.add(
|
||||
logger_file,
|
||||
level=dsw_log,
|
||||
backtrace=True,
|
||||
diagnose=True,
|
||||
rotation="50 MB"
|
||||
)
|
||||
pb = PbTalker(logger)
|
||||
|
||||
|
||||
max_tokens = 4000
|
||||
relation_theshold = 0.525
|
||||
|
||||
role_config = pb.read(collection_name='roleplays', filter=f'activated=True')
|
||||
_role_config_id = ''
|
||||
if role_config:
|
||||
character = role_config[0]['character']
|
||||
focus = role_config[0]['focus']
|
||||
focus_type = role_config[0]['focus_type']
|
||||
good_sample1 = role_config[0]['good_sample1']
|
||||
good_sample2 = role_config[0]['good_sample2']
|
||||
bad_sample = role_config[0]['bad_sample']
|
||||
_role_config_id = role_config[0]['id']
|
||||
else:
|
||||
character, good_sample1, focus, focus_type, good_sample2, bad_sample = '', '', '', '', '', ''
|
||||
|
||||
if not character:
|
||||
character = input('\033[0;32m 请为首席情报官指定角色设定(eg. 来自中国的网络安全情报专家):\033[0m\n')
|
||||
_role_config_id = pb.add(collection_name='roleplays', body={'character': character, 'activated': True})
|
||||
|
||||
if not _role_config_id:
|
||||
raise Exception('pls check pb data, 无法获取角色设定')
|
||||
|
||||
if not (focus and focus_type and good_sample1 and good_sample2 and bad_sample):
|
||||
focus = input('\033[0;32m 请为首席情报官指定关注点(eg. 中国关注的网络安全新闻):\033[0m\n')
|
||||
focus_type = input('\033[0;32m 请为首席情报官指定关注点类型(eg. 网络安全新闻):\033[0m\n')
|
||||
good_sample1 = input('\033[0;32m 请给出一个你期望的情报描述示例(eg. 黑客组织Rhysida声称已入侵中国国有能源公司): \033[0m\n')
|
||||
good_sample2 = input('\033[0;32m 请再给出一个理想示例(eg. 差不多一百万份包含未成年人数据(包括家庭地址和照片)的文件对互联网上的任何人都开放,对孩子构成威胁): \033[0m\n')
|
||||
bad_sample = input('\033[0;32m 请给出一个你不期望的情报描述示例(eg. 黑客组织活动最近频发): \033[0m\n')
|
||||
_ = pb.update(collection_name='roleplays', id=_role_config_id, body={'focus': focus, 'focus_type': focus_type, 'good_sample1': good_sample1, 'good_sample2': good_sample2, 'bad_sample': bad_sample})
|
||||
|
||||
# 实践证明,如果强调让llm挖掘我国值得关注的线索,则挖掘效果不好(容易被新闻内容误导,错把别的国家当成我国,可能这时新闻内有我国这样的表述)
|
||||
# step by step 如果是内心独白方式,输出格式包含两种,难度增加了,qwen-max不能很好的适应,也许可以改成两步,第一步先输出线索列表,第二步再会去找对应的新闻编号
|
||||
# 但从实践来看,这样做的性价比并不高,且会引入新的不确定性。
|
||||
_first_stage_prompt = f'''你是一名{character},你将被给到一个新闻列表,新闻文章用XML标签分隔。请对此进行分析,挖掘出特别值得{focus}线索。你给出的线索应该足够具体,而不是同类型新闻的归类描述,好的例子如:
|
||||
"""{good_sample1}"""
|
||||
不好的例子如:
|
||||
"""{bad_sample}"""
|
||||
|
||||
请从头到尾仔细阅读每一条新闻的内容,不要遗漏,然后列出值得关注的线索,每条线索都用一句话进行描述,最终按一条一行的格式输出,并整体用三引号包裹,如下所示:
|
||||
"""
|
||||
{good_sample1}
|
||||
{good_sample2}
|
||||
"""
|
||||
|
||||
不管新闻列表是何种语言,请仅用中文输出分析结果。'''
|
||||
|
||||
_rewrite_insight_prompt = f'''你是一名{character},你将被给到一个新闻列表,新闻文章用XML标签分隔。请对此进行分析,从中挖掘出一条最值得关注的{focus_type}线索。你给出的线索应该足够具体,而不是同类型新闻的归类描述,好的例子如:
|
||||
"""{good_sample1}"""
|
||||
不好的例子如:
|
||||
"""{bad_sample}"""
|
||||
|
||||
请保证只输出一条最值得关注的线索,线索请用一句话描述,并用三引号包裹输出,如下所示:
|
||||
"""{good_sample1}"""
|
||||
|
||||
不管新闻列表是何种语言,请仅用中文输出分析结果。'''
|
||||
|
||||
|
||||
def _parse_insight(article_text: str, cache: dict) -> (bool, dict):
|
||||
input_length = len(cache)
|
||||
result = dashscope_llm([{'role': 'system', 'content': _first_stage_prompt}, {'role': 'user', 'content': article_text}],
|
||||
'qwen1.5-72b-chat', logger=logger)
|
||||
if result:
|
||||
pattern = re.compile(r'\"\"\"(.*?)\"\"\"', re.DOTALL)
|
||||
result = pattern.findall(result)
|
||||
else:
|
||||
logger.warning('1st-stage llm generate failed: no result')
|
||||
|
||||
if result:
|
||||
try:
|
||||
results = result[0].split('\n')
|
||||
results = [_.strip() for _ in results if _.strip()]
|
||||
to_del = []
|
||||
to_add = []
|
||||
for element in results:
|
||||
if ";" in element:
|
||||
to_del.append(element)
|
||||
to_add.extend(element.split(';'))
|
||||
for element in to_del:
|
||||
results.remove(element)
|
||||
results.extend(to_add)
|
||||
results = list(set(results))
|
||||
for text in results:
|
||||
logger.debug(f'parse result: {text}')
|
||||
# qwen-72b-chat 特例
|
||||
# potential_insight = re.sub(r'编号[^:]*:', '', text)
|
||||
potential_insight = text.strip()
|
||||
if len(potential_insight) < 2:
|
||||
logger.debug(f'parse failed: not enough potential_insight: {potential_insight}')
|
||||
continue
|
||||
if isChinesePunctuation(potential_insight[-1]):
|
||||
potential_insight = potential_insight[:-1]
|
||||
if potential_insight in cache:
|
||||
continue
|
||||
else:
|
||||
cache[potential_insight] = []
|
||||
except Exception as e:
|
||||
logger.debug(f'parse failed: {e}')
|
||||
|
||||
output_length = len(cache)
|
||||
if input_length == output_length:
|
||||
return True, cache
|
||||
return False, cache
|
||||
|
||||
|
||||
def _rewrite_insight(context: str) -> (bool, str):
|
||||
result = dashscope_llm([{'role': 'system', 'content': _rewrite_insight_prompt}, {'role': 'user', 'content': context}],
|
||||
'qwen1.5-72b-chat', logger=logger)
|
||||
if result:
|
||||
pattern = re.compile(r'\"\"\"(.*?)\"\"\"', re.DOTALL)
|
||||
result = pattern.findall(result)
|
||||
else:
|
||||
logger.warning(f'insight rewrite process llm generate failed: no result')
|
||||
|
||||
if not result:
|
||||
return True, ''
|
||||
try:
|
||||
results = result[0].split('\n')
|
||||
text = results[0].strip()
|
||||
logger.debug(f'parse result: {text}')
|
||||
if len(text) < 2:
|
||||
logger.debug(f'parse failed: not enough potential_insight: {text}')
|
||||
return True, ''
|
||||
if isChinesePunctuation(text[-1]):
|
||||
text = text[:-1]
|
||||
except Exception as e:
|
||||
logger.debug(f'parse failed: {e}')
|
||||
return True, ''
|
||||
return False, text
|
||||
|
||||
|
||||
def get_insight(articles: dict, titles: dict) -> list:
|
||||
context = ''
|
||||
cache = {}
|
||||
for value in articles.values():
|
||||
if value['abstract']:
|
||||
text = value['abstract']
|
||||
else:
|
||||
if value['title']:
|
||||
text = value['title']
|
||||
else:
|
||||
if value['content']:
|
||||
text = value['content']
|
||||
else:
|
||||
continue
|
||||
# 这里不使用long context是因为阿里灵积经常检查出输入敏感词,但又不给敏感词反馈,对应批次只能放弃,用long context风险太大
|
||||
# 另外long context中间部分llm可能会遗漏
|
||||
context += f"<article>{text}</article>\n"
|
||||
if len(context) < max_tokens:
|
||||
continue
|
||||
|
||||
flag, cache = _parse_insight(context, cache)
|
||||
if flag:
|
||||
logger.warning(f'following articles may not be completely analyzed: \n{context}')
|
||||
|
||||
context = ''
|
||||
# 据说频繁调用会引发性能下降,每次调用后休息1s。现在轮替调用qwen-72b和max,所以不必了。
|
||||
time.sleep(1)
|
||||
if context:
|
||||
flag, cache = _parse_insight(context, cache)
|
||||
if flag:
|
||||
logger.warning(f'following articles may not be completely analyzed: \n{context}')
|
||||
|
||||
if not cache:
|
||||
logger.warning('no insights found')
|
||||
return []
|
||||
|
||||
# second stage: 匹配insights和article_titles
|
||||
title_list = [Document(page_content=key, metadata={}) for key, value in titles.items()]
|
||||
retriever = FAISS.from_documents(title_list, embed_model,
|
||||
distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT).as_retriever(search_type="similarity",
|
||||
search_kwargs={"score_threshold": relation_theshold, "k": 10})
|
||||
compression = ContextualCompressionRetriever(base_compressor=reranker, base_retriever=retriever)
|
||||
|
||||
for key in cache.keys():
|
||||
logger.debug(f'searching related articles for insight: {key}')
|
||||
rerank_results = compression.get_relevant_documents(key)
|
||||
for i in range(len(rerank_results)):
|
||||
if rerank_results[i].metadata['relevance_score'] < relation_theshold:
|
||||
break
|
||||
cache[key].append(titles[rerank_results[i].page_content])
|
||||
if titles[rerank_results[i].page_content] not in articles:
|
||||
articles[titles[rerank_results[i].page_content]] = {'title': rerank_results[i].page_content}
|
||||
logger.info(f'{key} - {cache[key]}')
|
||||
|
||||
# third stage:对于对应文章重叠率超过25%的合并,然后对于有多个文章的,再次使用llm生成insight
|
||||
# 因为实践中发现,第一次insight召回的文章标题可能都很相关,但是汇总起来却指向另一个角度的insight
|
||||
def calculate_overlap(list1, list2):
|
||||
# 计算两个列表的交集长度
|
||||
intersection_length = len(set(list1).intersection(set(list2)))
|
||||
# 计算重合率
|
||||
overlap_rate = intersection_length / min(len(list1), len(list2))
|
||||
return overlap_rate >= 0.75
|
||||
|
||||
merged_dict = {}
|
||||
for key, value in cache.items():
|
||||
if not value:
|
||||
continue
|
||||
merged = False
|
||||
for existing_key, existing_value in merged_dict.items():
|
||||
if calculate_overlap(value, existing_value):
|
||||
merged_dict[existing_key].extend(value)
|
||||
merged = True
|
||||
break
|
||||
if not merged:
|
||||
merged_dict[key] = value
|
||||
|
||||
cache = {}
|
||||
for key, value in merged_dict.items():
|
||||
value = list(set(value))
|
||||
if len(value) > 1:
|
||||
context = ''
|
||||
for _id in value:
|
||||
context += f"<article>{articles[_id]['title']}</article>\n"
|
||||
if len(context) >= max_tokens:
|
||||
break
|
||||
if not context:
|
||||
continue
|
||||
|
||||
flag, new_insight = _rewrite_insight(context)
|
||||
if flag:
|
||||
logger.warning(f'insight {key} may contain wrong')
|
||||
cache[key] = value
|
||||
else:
|
||||
if cache:
|
||||
title_list = [Document(page_content=key, metadata={}) for key, value in cache.items()]
|
||||
retriever = FAISS.from_documents(title_list, embed_model,
|
||||
distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT).as_retriever(
|
||||
search_type="similarity",
|
||||
search_kwargs={"score_threshold": 0.85, "k": 1})
|
||||
compression = ContextualCompressionRetriever(base_compressor=reranker, base_retriever=retriever)
|
||||
rerank_results = compression.get_relevant_documents(new_insight)
|
||||
if rerank_results and rerank_results[0].metadata['relevance_score'] > 0.85:
|
||||
logger.debug(f"{new_insight} is too similar to {rerank_results[0].page_content}, merging")
|
||||
cache[rerank_results[0].page_content].extend(value)
|
||||
cache[rerank_results[0].page_content] = list(set(cache[rerank_results[0].page_content]))
|
||||
else:
|
||||
cache[new_insight] = value
|
||||
else:
|
||||
cache[new_insight] = value
|
||||
else:
|
||||
cache[key] = value
|
||||
|
||||
# 排序,对应articles越多的越靠前
|
||||
# sorted_cache = sorted(cache.items(), key=lambda x: len(x[1]), reverse=True)
|
||||
logger.info('re-ranking result:')
|
||||
new_cache = []
|
||||
for key, value in cache.items():
|
||||
if not is_chinese(key):
|
||||
translate_text = text_translate([key], target_language='zh', logger=logger)
|
||||
if translate_text:
|
||||
key = translate_text[0]
|
||||
logger.info(f'{key} - {value}')
|
||||
new_cache.append({'content': key, 'articles': value})
|
||||
|
||||
return new_cache
|
@ -1,93 +0,0 @@
|
||||
# 使用aliyun dashscope的api封装
|
||||
# 非流式接口
|
||||
# 为了兼容性,输入输出都使用message格式(与openai SDK格式一致)
|
||||
import time
|
||||
from http import HTTPStatus
|
||||
import dashscope
|
||||
import random
|
||||
import os
|
||||
|
||||
|
||||
DASHSCOPE_KEY = os.getenv("LLM_API_KEY")
|
||||
if not DASHSCOPE_KEY:
|
||||
raise ValueError("请指定LLM_API_KEY的环境变量")
|
||||
dashscope.api_key = DASHSCOPE_KEY
|
||||
|
||||
|
||||
def dashscope_llm(messages: list, model: str, logger=None, **kwargs) -> str:
|
||||
|
||||
if logger:
|
||||
logger.debug(f'messages:\n {messages}')
|
||||
logger.debug(f'model: {model}')
|
||||
logger.debug(f'kwargs:\n {kwargs}')
|
||||
|
||||
response = dashscope.Generation.call(
|
||||
messages=messages,
|
||||
model=model,
|
||||
result_format='message', # set the result to be "message" format.
|
||||
**kwargs
|
||||
)
|
||||
|
||||
for i in range(2):
|
||||
if response.status_code == HTTPStatus.OK:
|
||||
break
|
||||
if response.message == "Input data may contain inappropriate content.":
|
||||
break
|
||||
|
||||
if logger:
|
||||
logger.warning(f"request failed. code: {response.code}, message:{response.message}\nretrying...")
|
||||
else:
|
||||
print(f"request failed. code: {response.code}, message:{response.message}\nretrying...")
|
||||
|
||||
time.sleep(1 + i*30)
|
||||
kwargs['seed'] = random.randint(1, 10000)
|
||||
response = dashscope.Generation.call(
|
||||
messages=messages,
|
||||
model=model,
|
||||
result_format='message', # set the result to be "message" format.
|
||||
**kwargs
|
||||
)
|
||||
|
||||
if response.status_code != HTTPStatus.OK:
|
||||
if logger:
|
||||
logger.warning(f"request failed. code: {response.code}, message:{response.message}\nabort after multiple retries...")
|
||||
else:
|
||||
print(f"request failed. code: {response.code}, message:{response.message}\naborted after multiple retries...")
|
||||
return ''
|
||||
|
||||
if logger:
|
||||
logger.debug(f'result:\n {response.output.choices[0]}')
|
||||
logger.debug(f'usage:\n {response.usage}')
|
||||
|
||||
return response.output.choices[0]['message']['content']
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
from pprint import pprint
|
||||
|
||||
# logging.basicConfig(level=logging.DEBUG)
|
||||
system_content = ''
|
||||
user_content = '''你是一名优秀的翻译,请帮我把如下新闻标题逐条(一行为一条)翻译为中文,你的输出也必须为一条一行的格式。
|
||||
|
||||
The New York Times reported on 2021-01-01 that the COVID-19 cases in China are increasing.
|
||||
Cyber ops linked to Israel-Hamas conflict largely improvised, researchers say
|
||||
Russian hackers disrupted Ukrainian electrical grid last year
|
||||
Reform bill would overhaul controversial surveillance law
|
||||
GitHub disables pro-Russian hacktivist DDoS pages
|
||||
Notorious Russian hacking group appears to resurface with fresh cyberattacks on Ukraine
|
||||
Russian hackers attempted to breach petroleum refining company in NATO country, researchers say
|
||||
As agencies move towards multi-cloud networks, proactive security is key
|
||||
Keeping a competitive edge in the cybersecurity ‘game’
|
||||
Mud, sweat and data: The hard work of democratizing data at scale
|
||||
SEC sues SolarWinds and CISO for fraud
|
||||
Cyber workforce demand is outpacing supply, survey finds
|
||||
Four dozen countries declare they won
|
||||
White House executive order on AI seeks to address security risks
|
||||
malware resembling NSA code
|
||||
CISA budget cuts would be
|
||||
Hackers that breached Las Vegas casinos rely on violent threats, research shows'''
|
||||
|
||||
data = [{'role': 'user', 'content': user_content}]
|
||||
start_time = time.time()
|
||||
pprint(dashscope_llm(data, 'qwen-72b-chat'))
|
||||
print(f'time cost: {time.time() - start_time}')
|
@ -1,79 +0,0 @@
|
||||
# 使用lmdepoly_wrapper的api封装
|
||||
# 非流式接口
|
||||
# 为了兼容性,输入输出都使用message格式(与openai SDK格式一致)
|
||||
from lagent.llms.meta_template import INTERNLM2_META as META
|
||||
from lagent.llms.lmdepoly_wrapper import LMDeployClient
|
||||
from requests import ConnectionError
|
||||
import os
|
||||
|
||||
|
||||
def lmdeploy_llm(messages: list[dict],
|
||||
model: str = "qwen-7b",
|
||||
seed: int = 1234,
|
||||
max_tokens: int = 2000,
|
||||
temperature: float = 1,
|
||||
stop: list = None,
|
||||
enable_search: bool = False,
|
||||
logger=None) -> str:
|
||||
|
||||
if logger:
|
||||
logger.debug(f'messages:\n {messages}')
|
||||
logger.debug(f'params:\n model: {model}, max_tokens: {max_tokens}, temperature: {temperature}, stop: {stop},'
|
||||
f'enable_search: {enable_search}, seed: {seed}')
|
||||
|
||||
top_p = 0.7
|
||||
url = os.environ.get('LLM_API_BASE', "http://127.0.0.1:6003")
|
||||
api_client = LMDeployClient(model_name=model,
|
||||
url=url,
|
||||
meta_template=META,
|
||||
max_new_tokens=max_tokens,
|
||||
top_p=top_p,
|
||||
top_k=100,
|
||||
temperature=temperature,
|
||||
repetition_penalty=1.0,
|
||||
stop_words=['<|im_end|>'])
|
||||
response = ""
|
||||
for i in range(3):
|
||||
try:
|
||||
response = api_client.chat(messages)
|
||||
break
|
||||
except ConnectionError:
|
||||
if logger:
|
||||
logger.warning(f'ConnectionError, url:{url}')
|
||||
else:
|
||||
print(f"ConnectionError, url:{url}")
|
||||
return ""
|
||||
|
||||
return response
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
import time
|
||||
from pprint import pprint
|
||||
|
||||
# logging.basicConfig(level=logging.DEBUG)
|
||||
system_content = ''
|
||||
user_content = '''你是一名优秀的翻译,请帮我把如下新闻标题逐条(一行为一条)翻译为中文,你的输出也必须为一条一行的格式。
|
||||
|
||||
The New York Times reported on 2021-01-01 that the COVID-19 cases in China are increasing.
|
||||
Cyber ops linked to Israel-Hamas conflict largely improvised, researchers say
|
||||
Russian hackers disrupted Ukrainian electrical grid last year
|
||||
Reform bill would overhaul controversial surveillance law
|
||||
GitHub disables pro-Russian hacktivist DDoS pages
|
||||
Notorious Russian hacking group appears to resurface with fresh cyberattacks on Ukraine
|
||||
Russian hackers attempted to breach petroleum refining company in NATO country, researchers say
|
||||
As agencies move towards multi-cloud networks, proactive security is key
|
||||
Keeping a competitive edge in the cybersecurity ‘game’
|
||||
Mud, sweat and data: The hard work of democratizing data at scale
|
||||
SEC sues SolarWinds and CISO for fraud
|
||||
Cyber workforce demand is outpacing supply, survey finds
|
||||
Four dozen countries declare they won
|
||||
White House executive order on AI seeks to address security risks
|
||||
malware resembling NSA code
|
||||
CISA budget cuts would be
|
||||
Hackers that breached Las Vegas casinos rely on violent threats, research shows'''
|
||||
|
||||
data = [{'role': 'user', 'content': user_content}]
|
||||
start_time = time.time()
|
||||
pprint(lmdeploy_llm(data, 'qwen-7b'))
|
||||
print(f'time cost: {time.time() - start_time}')
|
@@ -1,15 +0,0 @@
#!/bin/sh

docker run -d --runtime nvidia --gpus all \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
    --env "LMDEPLOY_USE_MODELSCOPE=True" \
    --env "TOKENIZERS_PARALLELISM=False" \
    --name qwen1.5-7b-service \
    -p 6003:6003 \
    --restart=always \
    --ipc=host \
    openmmlab/lmdeploy:v0.2.5 \
    pip install modelscope & \
    lmdeploy serve api_server Qwen/Qwen1.5-7B-Chat \
    --server-name 0.0.0.0 --server-port 6003 --tp 1 --rope-scaling-factor 1 --backend pytorch
@@ -1,19 +0,0 @@
fastapi
pydantic
uvicorn
dashscope #optional(使用阿里灵积时安装)
openai #optional(使用兼容openai sdk的llm服务时安装)
volcengine #optional(使用火山翻译时安装)
python-docx
BCEmbedding==0.1.3
langchain==0.1.0
langchain-community==0.0.9
langchain-core==0.1.7
langsmith==0.0.77
# faiss-gpu for gpu environment
faiss-cpu # for cpu-only environment
pocketbase==0.10.0
gne
chardet
schedule
loguru
@@ -1,4 +0,0 @@


scraper_map = {
}
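For context, the backend README above describes registering site-specific crawlers in this scraper_map. A hypothetical registration might look like the sketch below; the module and function names are made up, and the expected call signature is inferred from how work_process.py submits `scraper_map[site]` with `(expiration, existings)`.

```python
# scrapers/__init__.py -- hypothetical example of registering a dedicated crawler
from .my_site_scraper import my_site_scraper  # your own module, placed next to this file

# key: the site url configured in pb's sites collection
# value: a callable taking (expiration: date, existing_urls: list[str]) and
#        returning a list of article dicts, mirroring general_scraper's output
scraper_map = {
    "https://example.com": my_site_scraper,
}
```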
@@ -1,4 +0,0 @@
set -o allexport
source ../.env
set +o allexport
python background_task.py
@ -1,158 +0,0 @@
|
||||
import os
|
||||
import json
|
||||
import requests
|
||||
from datetime import datetime, timedelta, date
|
||||
from scrapers import scraper_map
|
||||
from scrapers.general_scraper import general_scraper
|
||||
from urllib.parse import urlparse
|
||||
from get_insight import get_insight, pb, logger
|
||||
from general_utils import is_chinese
|
||||
from tranlsation_volcengine import text_translate
|
||||
import concurrent.futures
|
||||
|
||||
|
||||
# 一般用于第一次爬虫时,避免抓取过多太久的文章,同时超过这个天数的数据库文章也不会再用来匹配insight
|
||||
expiration_date = datetime.now() - timedelta(days=90)
|
||||
expiration_date = expiration_date.date()
|
||||
expiration_str = expiration_date.strftime("%Y%m%d")
|
||||
|
||||
|
||||
class ServiceProcesser:
|
||||
def __init__(self, record_snapshot: bool = False):
|
||||
self.project_dir = os.environ.get("PROJECT_DIR", "")
|
||||
# 1. base initialization
|
||||
self.cache_url = os.path.join(self.project_dir, 'scanning_task')
|
||||
os.makedirs(self.cache_url, exist_ok=True)
|
||||
# 2. load the llm
|
||||
# self.llm = LocalLlmWrapper() # if you use the local-llm
|
||||
|
||||
if record_snapshot:
|
||||
snap_short_server = os.environ.get('SNAPSHOTS_SERVER', '')
|
||||
if not snap_short_server:
|
||||
raise Exception('SNAPSHOTS_SERVER is not set.')
|
||||
self.snap_short_server = f"http://{snap_short_server}"
|
||||
else:
|
||||
self.snap_short_server = None
|
||||
|
||||
logger.info('scanning task init success.')
|
||||
|
||||
def __call__(self, expiration: date = expiration_date, sites: list[str] = None):
|
||||
# 先清空一下cache
|
||||
logger.info(f'wake, prepare to work, now is {datetime.now()}')
|
||||
cache = {}
|
||||
logger.debug(f'clear cache -- {cache}')
|
||||
# 从pb数据库中读取所有文章url
|
||||
# 这里publish_time用int格式,综合考虑下这个是最容易操作的模式,虽然糙了点
|
||||
existing_articles = pb.read(collection_name='articles', fields=['id', 'title', 'url'], filter=f'publish_time>{expiration_str}')
|
||||
all_title = {}
|
||||
existings = []
|
||||
for article in existing_articles:
|
||||
all_title[article['title']] = article['id']
|
||||
existings.append(article['url'])
|
||||
|
||||
# 定义扫描源列表,如果不指定就默认遍历scraper_map, 另外这里还要考虑指定的source不在scraper_map的情况,这时应该使用通用爬虫
|
||||
sources = sites if sites else list(scraper_map.keys())
|
||||
new_articles = []
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
|
||||
futures = []
|
||||
for site in sources:
|
||||
if site in scraper_map:
|
||||
futures.append(executor.submit(scraper_map[site], expiration, existings))
|
||||
else:
|
||||
futures.append(executor.submit(general_scraper, site, expiration, existings, logger))
|
||||
concurrent.futures.wait(futures)
|
||||
for future in futures:
|
||||
try:
|
||||
new_articles.extend(future.result())
|
||||
except Exception as e:
|
||||
logger.error(f'error when scraping-- {e}')
|
||||
|
||||
for value in new_articles:
|
||||
if not value:
|
||||
continue
|
||||
from_site = urlparse(value['url']).netloc
|
||||
from_site = from_site.replace('www.', '')
|
||||
from_site = from_site.split('.')[0]
|
||||
if value['abstract']:
|
||||
value['abstract'] = f"({from_site} 报道){value['abstract']}"
|
||||
value['content'] = f"({from_site} 报道){value['content']}"
|
||||
value['images'] = json.dumps(value['images'])
|
||||
|
||||
article_id = pb.add(collection_name='articles', body=value)
|
||||
|
||||
if article_id:
|
||||
cache[article_id] = value
|
||||
all_title[value['title']] = article_id
|
||||
else:
|
||||
logger.warning(f'add article {value} failed, writing to cache_file')
|
||||
with open(os.path.join(self.cache_url, 'cache_articles.json'), 'a', encoding='utf-8') as f:
|
||||
json.dump(value, f, ensure_ascii=False, indent=4)
|
||||
|
||||
if not cache:
|
||||
logger.warning(f'no new articles. now is {datetime.now()}')
|
||||
return
|
||||
|
||||
# insight 流程
|
||||
new_insights = get_insight(cache, all_title)
|
||||
if new_insights:
|
||||
for insight in new_insights:
|
||||
if not insight['content']:
|
||||
continue
|
||||
insight_id = pb.add(collection_name='insights', body=insight)
|
||||
if not insight_id:
|
||||
logger.warning(f'write insight {insight} to pb failed, writing to cache_file')
|
||||
with open(os.path.join(self.cache_url, 'cache_insights.json'), 'a', encoding='utf-8') as f:
|
||||
json.dump(insight, f, ensure_ascii=False, indent=4)
|
||||
for article_id in insight['articles']:
|
||||
raw_article = pb.read(collection_name='articles', fields=['abstract', 'title', 'translation_result'], filter=f'id="{article_id}"')
|
||||
if not raw_article or not raw_article[0]:
|
||||
logger.warning(f'get article {article_id} failed, skipping')
|
||||
continue
|
||||
if raw_article[0]['translation_result']:
|
||||
continue
|
||||
if is_chinese(raw_article[0]['title']):
|
||||
continue
|
||||
translate_text = text_translate([raw_article[0]['title'], raw_article[0]['abstract']], target_language='zh', logger=logger)
|
||||
if translate_text:
|
||||
related_id = pb.add(collection_name='article_translation', body={'title': translate_text[0], 'abstract': translate_text[1], 'raw': article_id})
|
||||
if not related_id:
|
||||
logger.warning(f'write article_translation {article_id} failed')
|
||||
else:
|
||||
_ = pb.update(collection_name='articles', id=article_id, body={'translation_result': related_id})
|
||||
if not _:
|
||||
logger.warning(f'update article {article_id} failed')
|
||||
else:
|
||||
logger.warning(f'translate article {article_id} failed')
|
||||
else:
|
||||
# 尝试把所有文章的title作为insigts,这是备选方案
|
||||
if len(cache) < 25:
|
||||
logger.info('generate_insights-warning: no insights and no more than 25 articles so use article title as insights')
|
||||
for key, value in cache.items():
|
||||
if value['title']:
|
||||
if is_chinese(value['title']):
|
||||
text_for_insight = value['title']
|
||||
else:
|
||||
text_for_insight = text_translate([value['title']], logger=logger)
|
||||
if text_for_insight:
|
||||
insight_id = pb.add(collection_name='insights', body={'content': text_for_insight[0], 'articles': [key]})
|
||||
if not insight_id:
|
||||
logger.warning(f'write insight {text_for_insight[0]} to pb failed, writing to cache_file')
|
||||
with open(os.path.join(self.cache_url, 'cache_insights.json'), 'a',
|
||||
encoding='utf-8') as f:
|
||||
json.dump({'content': text_for_insight[0], 'articles': [key]}, f, ensure_ascii=False, indent=4)
|
||||
else:
|
||||
logger.warning('generate_insights-error: can not generate insights, pls re-try')
|
||||
logger.info(f'work done, now is {datetime.now()}')
|
||||
|
||||
if self.snap_short_server:
|
||||
logger.info(f'now starting article snapshot with {self.snap_short_server}')
|
||||
for key, value in cache.items():
|
||||
if value['url']:
|
||||
try:
|
||||
snapshot = requests.get(f"{self.snap_short_server}/zip", {'url': value['url']}, timeout=60)
|
||||
file = open(snapshot.text, 'rb')
|
||||
_ = pb.upload('articles', key, 'snapshot', key, file)
|
||||
file.close()
|
||||
except Exception as e:
|
||||
logger.warning(f'error when snapshot {value["url"]}, {e}')
|
||||
logger.info(f'now snapshot done, now is {datetime.now()}')
|
@@ -1,31 +0,0 @@
services:
  web:
    build:
      dockerfile: Dockerfile.web
    image: wiseflow/web
    ports:
      - 8090:8090
    # env_file:
    #   - .env
    volumes:
      - ./pb/pb_data:/pb/pb_data
      # - ./${PROJECT_DIR}:/pb/${PROJECT_DIR}
    entrypoint: /pb/pocketbase serve --http=0.0.0.0:8090

  api:
    build:
      dockerfile: Dockerfile.api
    image: wiseflow/api
    tty: true
    stdin_open: true
    entrypoint: bash docker_entrypoint.sh
    env_file:
      - .env
    ports:
      - 7777:7777
    volumes:
      - ./${PROJECT_DIR}:/app/${PROJECT_DIR}
      - ${EMBEDDING_MODEL_PATH}:${EMBEDDING_MODEL_PATH}
      - ${RERANKER_MODEL_PATH}:${RERANKER_MODEL_PATH}
    depends_on:
      - web
@@ -1,14 +0,0 @@
export LLM_API_KEY=""
export LLM_API_BASE="" ##使用本地模型服务或者使用openai_wrapper调用非openai服务时用
export VOLC_KEY="AK|SK"

#**for embedding model**
export EMBEDDING_MODEL_PATH="" ##填写完整的绝对路径
export RERANKER_MODEL_PATH="" ##填写完整的绝对路径
export DEVICE="cpu" ##cuda用户填写 "cuda:0"

#**for processer**
export PROJECT_DIR="work_dir"
export PB_API_AUTH="test@example.com|123467890"
export PB_API_BASE="web:8090" ##可以参考https://stackoverflow.com/questions/70151702/how-to-network-2-separate-docker-containers-to-communicate-with-eachother
export WS_LOG="verbose" ##如果需要详细的log,观察系统的每一步动作填写此项,正常使用无需
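Note that VOLC_KEY and PB_API_AUTH each pack two values into a single variable separated by `|`, as the sample values above show. A minimal sketch of reading them in Python follows; this is only an illustration of the format, the actual parsing lives in the backend's own modules.

```python
import os

# both variables use "value1|value2" packing, per the sample values above
volc_ak, volc_sk = os.environ.get("VOLC_KEY", "AK|SK").split("|", 1)
pb_email, pb_password = os.environ.get("PB_API_AUTH", "test@example.com|123467890").split("|", 1)

print("Volcengine AK:", volc_ak)
print("PocketBase admin account:", pb_email)
```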
@@ -1 +0,0 @@
v0.2.1
47  core/dm.py (Normal file)
@@ -0,0 +1,47 @@
|
||||
import asyncio
|
||||
import websockets
|
||||
import concurrent.futures
|
||||
import json
|
||||
from insights import pipeline
|
||||
|
||||
|
||||
async def get_public_msg():
|
||||
uri = "ws://127.0.0.1:8066/ws/publicMsg"
|
||||
reconnect_attempts = 0
|
||||
max_reconnect_attempts = 3 # 可以根据需要设置最大重连次数
|
||||
|
||||
while True:
|
||||
try:
|
||||
async with websockets.connect(uri, max_size=10 * 1024 * 1024) as websocket:
|
||||
loop = asyncio.get_running_loop()
|
||||
with concurrent.futures.ThreadPoolExecutor() as pool:
|
||||
while True:
|
||||
response = await websocket.recv()
|
||||
datas = json.loads(response)
|
||||
for data in datas["data"]:
|
||||
if data["IsSender"] != "0":
|
||||
print('self-send message, pass')
|
||||
print(data)
|
||||
continue
|
||||
input_data = {
|
||||
"user_id": data["StrTalker"],
|
||||
"type": "publicMsg",
|
||||
"content": data["Content"],
|
||||
"addition": data["MsgSvrID"]
|
||||
}
|
||||
await loop.run_in_executor(pool, pipeline, input_data)
|
||||
except websockets.exceptions.ConnectionClosedError as e:
|
||||
print(f"Connection closed with exception: {e}")
|
||||
reconnect_attempts += 1
|
||||
if reconnect_attempts <= max_reconnect_attempts:
|
||||
print(f"Reconnecting attempt {reconnect_attempts}...")
|
||||
await asyncio.sleep(5) # 等待一段时间后重试
|
||||
else:
|
||||
print("Max reconnect attempts reached. Exiting.")
|
||||
break
|
||||
except Exception as e:
|
||||
print(f"An unexpected error occurred: {e}")
|
||||
break
|
||||
|
||||
# 使用asyncio事件循环运行get_public_msg coroutine
|
||||
asyncio.run(get_public_msg())
|
164  core/insights/__init__.py (Normal file)
@@ -0,0 +1,164 @@
|
||||
from scrapers import *
|
||||
from utils.general_utils import extract_urls, compare_phrase_with_list
|
||||
from insights.get_info import get_info, pb, project_dir, logger
|
||||
from insights.rewrite import info_rewrite
|
||||
import os
|
||||
import json
|
||||
from datetime import datetime, timedelta
|
||||
from urllib.parse import urlparse
|
||||
import re
|
||||
import time
|
||||
|
||||
|
||||
# 用正则不用xml解析方案是因为公众号消息提取出来的xml代码存在异常字符
|
||||
item_pattern = re.compile(r'<item>(.*?)</item>', re.DOTALL)
|
||||
url_pattern = re.compile(r'<url><!\[CDATA\[(.*?)]]></url>')
|
||||
summary_pattern = re.compile(r'<summary><!\[CDATA\[(.*?)]]></summary>', re.DOTALL)
|
||||
|
||||
expiration_days = 3
|
||||
existing_urls = [url['url'] for url in pb.read(collection_name='articles', fields=['url']) if url['url']]
|
||||
|
||||
|
||||
def pipeline(_input: dict):
|
||||
cache = {}
|
||||
source = _input['user_id'].split('@')[-1]
|
||||
logger.debug(f"received new task, user: {source}, MsgSvrID: {_input['addition']}")
|
||||
|
||||
if _input['type'] == 'publicMsg':
|
||||
items = item_pattern.findall(_input["content"])
|
||||
# 遍历所有<item>内容,提取<url>和<summary>
|
||||
for item in items:
|
||||
url_match = url_pattern.search(item)
|
||||
url = url_match.group(1) if url_match else None
|
||||
if not url:
|
||||
logger.warning(f"can not find url in \n{item}")
|
||||
continue
|
||||
# url处理,http换成https, 去掉chksm之后的部分
|
||||
url = url.replace('http://', 'https://')
|
||||
cut_off_point = url.find('chksm=')
|
||||
if cut_off_point != -1:
|
||||
url = url[:cut_off_point-1]
|
||||
if url in cache:
|
||||
logger.debug(f"{url} already find in item")
|
||||
continue
|
||||
summary_match = summary_pattern.search(item)
|
||||
summary = summary_match.group(1) if summary_match else None
|
||||
cache[url] = summary
|
||||
urls = list(cache.keys())
|
||||
|
||||
elif _input['type'] == 'text':
|
||||
urls = extract_urls(_input['content'])
|
||||
if not urls:
|
||||
logger.debug(f"can not find any url in\n{_input['content']}\npass...")
|
||||
return
|
||||
elif _input['type'] == 'url':
|
||||
urls = []
|
||||
pass
|
||||
else:
|
||||
return
|
||||
|
||||
global existing_urls
|
||||
|
||||
for url in urls:
|
||||
# 0、先检查是否已经爬取过
|
||||
if url in existing_urls:
|
||||
logger.debug(f"{url} has been crawled, skip")
|
||||
continue
|
||||
|
||||
logger.debug(f"fetching {url}")
|
||||
# 1、选择合适的爬虫fetch article信息
|
||||
if url.startswith('https://mp.weixin.qq.com') or url.startswith('http://mp.weixin.qq.com'):
|
||||
flag, article = mp_crawler(url, logger)
|
||||
if flag == -7:
|
||||
# 对于mp爬虫,-7 的大概率是被微信限制了,等待1min即可
|
||||
logger.info(f"fetch {url} failed, try to wait 1min and try again")
|
||||
time.sleep(60)
|
||||
flag, article = mp_crawler(url, logger)
|
||||
else:
|
||||
parsed_url = urlparse(url)
|
||||
domain = parsed_url.netloc
|
||||
if domain in scraper_map:
|
||||
flag, article = scraper_map[domain](url, logger)
|
||||
else:
|
||||
flag, article = simple_crawler(url, logger)
|
||||
|
||||
if flag == -7:
|
||||
# -7 代表网络不同,用其他爬虫也没有效果
|
||||
logger.info(f"cannot fetch {url}")
|
||||
continue
|
||||
|
||||
if flag != 11:
|
||||
logger.info(f"{url} failed with mp_crawler and simple_crawler")
|
||||
flag, article = llm_crawler(url, logger)
|
||||
if flag != 11:
|
||||
logger.info(f"{url} failed with llm_crawler")
|
||||
continue
|
||||
|
||||
# 2、判断是否早于 当日- expiration_days ,如果是的话,舍弃
|
||||
expiration_date = datetime.now() - timedelta(days=expiration_days)
|
||||
expiration_date = expiration_date.strftime('%Y-%m-%d')
|
||||
article_date = int(article['publish_time'])
|
||||
if article_date < int(expiration_date.replace('-', '')):
|
||||
logger.info(f"publish date is {article_date}, too old, skip")
|
||||
continue
|
||||
|
||||
article['source'] = source
|
||||
if cache[url]:
|
||||
article['abstract'] = cache[url]
|
||||
|
||||
# 3、使用content从中提炼信息
|
||||
insights = get_info(f"标题:{article['title']}\n\n内容:{article['content']}")
|
||||
# 提炼info失败的article不入库,不然在existing里面后面就再也不会处理了,但提炼成功没有insight的article需要入库,后面不再分析。
|
||||
|
||||
# 4、article入库
|
||||
try:
|
||||
article_id = pb.add(collection_name='articles', body=article)
|
||||
except Exception as e:
|
||||
logger.error(f'add article failed, writing to cache_file - {e}')
|
||||
with open(os.path.join(project_dir, 'cache_articles.json'), 'a', encoding='utf-8') as f:
|
||||
json.dump(article, f, ensure_ascii=False, indent=4)
|
||||
continue
|
||||
|
||||
existing_urls.append(url)
|
||||
|
||||
if not insights:
|
||||
continue
|
||||
# insight 比对去重与合并, article打标签,insight入库
|
||||
article_tags = set()
|
||||
# 从数据库中读取过去expiration_days的insight记录,避免重复
|
||||
old_insights = pb.read(collection_name='insights', filter=f"updated>'{expiration_date}'", fields=['id', 'tag', 'content', 'articles'])
|
||||
for insight in insights:
|
||||
article_tags.add(insight['tag'])
|
||||
insight['articles'] = [article_id]
|
||||
# 从old_insights 中挑出相同tag的insight,组成 content: id 的反查字典
|
||||
old_insight_dict = {i['content']: i for i in old_insights if i['tag'] == insight['tag']}
|
||||
# 因为要比较的是抽取出来的信息短语是否讲的是一个事情,用向量模型计算相似度未必适合且过重
|
||||
# 因此这里使用一个简化的方案,直接使用jieba分词器,计算两个短语之间重叠的词语是否超过90%
|
||||
similar_insights = compare_phrase_with_list(insight['content'], list(old_insight_dict.keys()), 0.65)
|
||||
if similar_insights:
|
||||
to_rewrite = similar_insights + [insight['content']]
|
||||
new_info_content = info_rewrite(to_rewrite, logger)
|
||||
if not new_info_content:
|
||||
continue
|
||||
insight['content'] = new_info_content
|
||||
# 合并关联article、删除旧insight
|
||||
for old_insight in similar_insights:
|
||||
insight['articles'].extend(old_insight_dict[old_insight]['articles'])
|
||||
pb.delete(collection_name='insights', id=old_insight_dict[old_insight]['id'])
|
||||
old_insights.remove(old_insight_dict[old_insight])
|
||||
|
||||
try:
|
||||
insight['id'] = pb.add(collection_name='insights', body=insight)
|
||||
# old_insights.append(insight)
|
||||
except Exception as e:
|
||||
logger.error(f'add insight failed, writing to cache_file - {e}')
|
||||
with open(os.path.join(project_dir, 'cache_insights.json'), 'a', encoding='utf-8') as f:
|
||||
json.dump(insight, f, ensure_ascii=False, indent=4)
|
||||
|
||||
try:
|
||||
pb.update(collection_name='articles', id=article_id, body={'tag': list(article_tags)})
|
||||
except Exception as e:
|
||||
logger.error(f'update article failed - article_id: {article_id}\n{e}')
|
||||
article['tag'] = list(article_tags)
|
||||
with open(os.path.join(project_dir, 'cache_articles.json'), 'a', encoding='utf-8') as f:
|
||||
json.dump(article, f, ensure_ascii=False, indent=4)
|
94  core/insights/get_info.py (Normal file)
@@ -0,0 +1,94 @@
|
||||
# from llms.dashscope_wrapper import dashscope_llm
|
||||
from llms.openai_wrapper import openai_llm
|
||||
# from llms.siliconflow_wrapper import sfa_llm
|
||||
import re
|
||||
from utils.general_utils import get_logger_level
|
||||
from loguru import logger
|
||||
from utils.pb_api import PbTalker
|
||||
import os
|
||||
|
||||
|
||||
project_dir = os.environ.get("PROJECT_DIR", "")
|
||||
if project_dir:
|
||||
os.makedirs(project_dir, exist_ok=True)
|
||||
logger_file = os.path.join(project_dir, 'insights.log')
|
||||
dsw_log = get_logger_level()
|
||||
logger.add(
|
||||
logger_file,
|
||||
level=dsw_log,
|
||||
backtrace=True,
|
||||
diagnose=True,
|
||||
rotation="50 MB"
|
||||
)
|
||||
|
||||
pb = PbTalker(logger)
|
||||
|
||||
model = "glm-4-flash"
|
||||
focus_data = pb.read(collection_name='tags', filter=f'activated=True')
|
||||
focus_list = [item["name"] for item in focus_data if item["name"]]
|
||||
focus_dict = {item["name"]: item["id"] for item in focus_data if item["name"]}
|
||||
|
||||
system_prompt = f'''请仔细阅读用户输入的新闻内容,并根据所提供的类型列表进行分析。类型列表如下:
|
||||
{focus_list}
|
||||
|
||||
如果新闻中包含上述任何类型的信息,请使用以下格式标记信息的类型,并提供仅包含时间、地点、人物和事件的一句话信息摘要:
|
||||
<tag>类型名称</tag>仅包含时间、地点、人物和事件的一句话信息摘要
|
||||
|
||||
如果新闻中包含多个信息,请逐一分析并按一条一行的格式输出,如果新闻不涉及任何类型的信息,则直接输出:无。
|
||||
务必注意:1、严格忠于新闻原文,不得提供原文中不包含的信息;2、对于同一事件,仅选择一个最贴合的tag,不要重复输出;3、仅用一句话做信息摘要,且仅包含时间、地点、人物和事件;4、严格遵循给定的格式输出。'''
|
||||
|
||||
# pattern = re.compile(r'\"\"\"(.*?)\"\"\"', re.DOTALL)
|
||||
|
||||
|
||||
def get_info(article_content: str) -> list[dict]:
|
||||
# logger.debug(f'receive new article_content:\n{article_content}')
|
||||
result = openai_llm([{'role': 'system', 'content': system_prompt}, {'role': 'user', 'content': article_content}],
|
||||
model=model, logger=logger)
|
||||
|
||||
# results = pattern.findall(result)
|
||||
texts = result.split('<tag>')
|
||||
texts = [_.strip() for _ in texts if '</tag>' in _.strip()]
|
||||
if not texts:
|
||||
logger.info(f'can not find info, llm result:\n{result}')
|
||||
return []
|
||||
|
||||
cache = []
|
||||
for text in texts:
|
||||
try:
|
||||
strings = text.split('</tag>')
|
||||
tag = strings[0]
|
||||
tag = tag.strip()
|
||||
if tag not in focus_list:
|
||||
logger.info(f'tag not in focus_list: {tag}, aborting')
|
||||
continue
|
||||
info = ''.join(strings[1:])
|
||||
info = info.strip()
|
||||
except Exception as e:
|
||||
logger.info(f'parse error: {e}')
|
||||
tag = ''
|
||||
info = ''
|
||||
|
||||
if not info or not tag:
|
||||
logger.info(f'parse failed-{text}')
|
||||
continue
|
||||
|
||||
if len(info) < 7:
|
||||
logger.info(f'info too short, possible invalid: {info}')
|
||||
continue
|
||||
|
||||
if info.startswith('无相关信息') or info.startswith('该新闻未提及') or info.startswith('未提及'):
|
||||
logger.info(f'no relevant info: {text}')
|
||||
continue
|
||||
|
||||
while info.endswith('"'):
|
||||
info = info[:-1]
|
||||
info = info.strip()
|
||||
|
||||
# 拼接下来源信息
|
||||
sources = re.findall(r'内容:\((.*?) 文章\)', article_content)
|
||||
if sources and sources[0]:
|
||||
info = f"【{sources[0]} 公众号】 {info}"
|
||||
|
||||
cache.append({'content': info, 'tag': focus_dict[tag]})
|
||||
|
||||
return cache
|
24  core/insights/rewrite.py (Normal file)
@@ -0,0 +1,24 @@
# from llms.openai_wrapper import openai_llm
from llms.dashscope_wrapper import dashscope_llm
# from llms.siliconflow_wrapper import sfa_llm


rewrite_prompt = '''请综合给到的内容,提炼总结为一个新闻摘要。
给到的内容会用XML标签分隔。
请仅输出总结出的摘要,不要输出其他的信息。'''

model = "qwen2-7b-instruct"


def info_rewrite(contents: list[str], logger=None) -> str:
    context = f"<content>{'</content><content>'.join(contents)}</content>"
    try:
        result = dashscope_llm([{'role': 'system', 'content': rewrite_prompt}, {'role': 'user', 'content': context}],
                               model=model, temperature=0.1, logger=logger)
        return result.strip()
    except Exception as e:
        if logger:
            logger.warning(f'rewrite process llm generate failed: {e}')
        else:
            print(f'rewrite process llm generate failed: {e}')
        return ''
@@ -7,13 +7,13 @@ import os
from openai import OpenAI


token = os.environ.get('LLM_API_KEY', "")
if not token:
    raise ValueError('请设置环境变量LLM_API_KEY')

base_url = os.environ.get('LLM_API_BASE', "")
token = os.environ.get('LLM_API_KEY', "")

client = OpenAI(api_key=token, base_url=base_url)
if token:
    client = OpenAI(api_key=token, base_url=base_url)
else:
    client = OpenAI(base_url=base_url)


def openai_llm(messages: list, model: str, logger=None, **kwargs) -> str:
70  core/llms/siliconflow_wrapper.py (Normal file)
@@ -0,0 +1,70 @@
|
||||
"""
|
||||
siliconflow api wrapper
|
||||
https://siliconflow.readme.io/reference/chat-completions-1
|
||||
"""
|
||||
import os
|
||||
import requests
|
||||
|
||||
|
||||
token = os.environ.get('LLM_API_KEY', "")
|
||||
if not token:
|
||||
raise ValueError('请设置环境变量LLM_API_KEY')
|
||||
|
||||
url = "https://api.siliconflow.cn/v1/chat/completions"
|
||||
|
||||
|
||||
def sfa_llm(messages: list, model: str, logger=None, **kwargs) -> str:
|
||||
|
||||
if logger:
|
||||
logger.debug(f'messages:\n {messages}')
|
||||
logger.debug(f'model: {model}')
|
||||
logger.debug(f'kwargs:\n {kwargs}')
|
||||
|
||||
payload = {
|
||||
"model": model,
|
||||
"messages": messages
|
||||
}
|
||||
|
||||
payload.update(kwargs)
|
||||
|
||||
headers = {
|
||||
"accept": "application/json",
|
||||
"content-type": "application/json",
|
||||
"authorization": f"Bearer {token}"
|
||||
}
|
||||
|
||||
for i in range(2):
|
||||
try:
|
||||
response = requests.post(url, json=payload, headers=headers)
|
||||
if response.status_code == 200:
|
||||
try:
|
||||
body = response.json()
|
||||
usage = body.get('usage', 'Field "usage" not found')
|
||||
choices = body.get('choices', 'Field "choices" not found')
|
||||
if logger:
|
||||
logger.debug(choices)
|
||||
logger.debug(usage)
|
||||
return choices[0]['message']['content']
|
||||
except ValueError:
|
||||
# 如果响应体不是有效的JSON格式
|
||||
if logger:
|
||||
logger.warning("Response body is not in JSON format.")
|
||||
else:
|
||||
print("Response body is not in JSON format.")
|
||||
except requests.exceptions.RequestException as e:
|
||||
if logger:
|
||||
logger.warning(f"A request error occurred: {e}")
|
||||
else:
|
||||
print(f"A request error occurred: {e}")
|
||||
|
||||
if logger:
|
||||
logger.info("retrying...")
|
||||
else:
|
||||
print("retrying...")
|
||||
|
||||
if logger:
|
||||
logger.error("After many time, finally failed to get response from API.")
|
||||
else:
|
||||
print("After many time, finally failed to get response from API.")
|
||||
|
||||
return ''
|
1016  core/pb/CHANGELOG.md (Normal file)
File diff suppressed because it is too large.
17  core/pb/LICENSE.md (Normal file)
@@ -0,0 +1,17 @@
The MIT License (MIT)
Copyright (c) 2022 - present, Gani Georgiev

Permission is hereby granted, free of charge, to any person obtaining a copy of this software
and associated documentation files (the "Software"), to deal in the Software without restriction,
including without limitation the rights to use, copy, modify, merge, publish, distribute,
sublicense, and/or sell copies of the Software, and to permit persons to whom the Software
is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or
substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
44  core/pb/pb_migrations/1714803585_updated_articles.js (Normal file)
@@ -0,0 +1,44 @@
|
||||
/// <reference path="../pb_data/types.d.ts" />
|
||||
migrate((db) => {
|
||||
const dao = new Dao(db)
|
||||
const collection = dao.findCollectionByNameOrId("lft7642skuqmry7")
|
||||
|
||||
// update
|
||||
collection.schema.addField(new SchemaField({
|
||||
"system": false,
|
||||
"id": "iorna912",
|
||||
"name": "content",
|
||||
"type": "text",
|
||||
"required": false,
|
||||
"presentable": false,
|
||||
"unique": false,
|
||||
"options": {
|
||||
"min": null,
|
||||
"max": null,
|
||||
"pattern": ""
|
||||
}
|
||||
}))
|
||||
|
||||
return dao.saveCollection(collection)
|
||||
}, (db) => {
|
||||
const dao = new Dao(db)
|
||||
const collection = dao.findCollectionByNameOrId("lft7642skuqmry7")
|
||||
|
||||
// update
|
||||
collection.schema.addField(new SchemaField({
|
||||
"system": false,
|
||||
"id": "iorna912",
|
||||
"name": "content",
|
||||
"type": "text",
|
||||
"required": true,
|
||||
"presentable": false,
|
||||
"unique": false,
|
||||
"options": {
|
||||
"min": null,
|
||||
"max": null,
|
||||
"pattern": ""
|
||||
}
|
||||
}))
|
||||
|
||||
return dao.saveCollection(collection)
|
||||
})
|
31  core/pb/pb_migrations/1714835361_updated_insights.js (Normal file)
@@ -0,0 +1,31 @@
|
||||
/// <reference path="../pb_data/types.d.ts" />
|
||||
migrate((db) => {
|
||||
const dao = new Dao(db)
|
||||
const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf")
|
||||
|
||||
// add
|
||||
collection.schema.addField(new SchemaField({
|
||||
"system": false,
|
||||
"id": "d13734ez",
|
||||
"name": "tag",
|
||||
"type": "text",
|
||||
"required": false,
|
||||
"presentable": false,
|
||||
"unique": false,
|
||||
"options": {
|
||||
"min": null,
|
||||
"max": null,
|
||||
"pattern": ""
|
||||
}
|
||||
}))
|
||||
|
||||
return dao.saveCollection(collection)
|
||||
}, (db) => {
|
||||
const dao = new Dao(db)
|
||||
const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf")
|
||||
|
||||
// remove
|
||||
collection.schema.removeField("d13734ez")
|
||||
|
||||
return dao.saveCollection(collection)
|
||||
})
|
31  core/pb/pb_migrations/1714955881_updated_articles.js (Normal file)
@@ -0,0 +1,31 @@
|
||||
/// <reference path="../pb_data/types.d.ts" />
|
||||
migrate((db) => {
|
||||
const dao = new Dao(db)
|
||||
const collection = dao.findCollectionByNameOrId("lft7642skuqmry7")
|
||||
|
||||
// add
|
||||
collection.schema.addField(new SchemaField({
|
||||
"system": false,
|
||||
"id": "pwy2iz0b",
|
||||
"name": "source",
|
||||
"type": "text",
|
||||
"required": false,
|
||||
"presentable": false,
|
||||
"unique": false,
|
||||
"options": {
|
||||
"min": null,
|
||||
"max": null,
|
||||
"pattern": ""
|
||||
}
|
||||
}))
|
||||
|
||||
return dao.saveCollection(collection)
|
||||
}, (db) => {
|
||||
const dao = new Dao(db)
|
||||
const collection = dao.findCollectionByNameOrId("lft7642skuqmry7")
|
||||
|
||||
// remove
|
||||
collection.schema.removeField("pwy2iz0b")
|
||||
|
||||
return dao.saveCollection(collection)
|
||||
})
|
51  core/pb/pb_migrations/1715823361_created_tags.js (Normal file)
@@ -0,0 +1,51 @@
|
||||
/// <reference path="../pb_data/types.d.ts" />
|
||||
migrate((db) => {
|
||||
const collection = new Collection({
|
||||
"id": "nvf6k0yoiclmytu",
|
||||
"created": "2024-05-16 01:36:01.108Z",
|
||||
"updated": "2024-05-16 01:36:01.108Z",
|
||||
"name": "tags",
|
||||
"type": "base",
|
||||
"system": false,
|
||||
"schema": [
|
||||
{
|
||||
"system": false,
|
||||
"id": "0th8uax4",
|
||||
"name": "name",
|
||||
"type": "text",
|
||||
"required": false,
|
||||
"presentable": false,
|
||||
"unique": false,
|
||||
"options": {
|
||||
"min": null,
|
||||
"max": null,
|
||||
"pattern": ""
|
||||
}
|
||||
},
|
||||
{
|
||||
"system": false,
|
||||
"id": "l6mm7m90",
|
||||
"name": "activated",
|
||||
"type": "bool",
|
||||
"required": false,
|
||||
"presentable": false,
|
||||
"unique": false,
|
||||
"options": {}
|
||||
}
|
||||
],
|
||||
"indexes": [],
|
||||
"listRule": null,
|
||||
"viewRule": null,
|
||||
"createRule": null,
|
||||
"updateRule": null,
|
||||
"deleteRule": null,
|
||||
"options": {}
|
||||
});
|
||||
|
||||
return Dao(db).saveCollection(collection);
|
||||
}, (db) => {
|
||||
const dao = new Dao(db);
|
||||
const collection = dao.findCollectionByNameOrId("nvf6k0yoiclmytu");
|
||||
|
||||
return dao.deleteCollection(collection);
|
||||
})
|
52  core/pb/pb_migrations/1715824265_updated_insights.js (Normal file)
@@ -0,0 +1,52 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
  const dao = new Dao(db)
  const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf")

  // remove
  collection.schema.removeField("d13734ez")

  // add
  collection.schema.addField(new SchemaField({
    "system": false,
    "id": "j65p3jji",
    "name": "tag",
    "type": "relation",
    "required": false,
    "presentable": false,
    "unique": false,
    "options": {
      "collectionId": "nvf6k0yoiclmytu",
      "cascadeDelete": false,
      "minSelect": null,
      "maxSelect": null,
      "displayFields": null
    }
  }))

  return dao.saveCollection(collection)
}, (db) => {
  const dao = new Dao(db)
  const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf")

  // add
  collection.schema.addField(new SchemaField({
    "system": false,
    "id": "d13734ez",
    "name": "tag",
    "type": "text",
    "required": false,
    "presentable": false,
    "unique": false,
    "options": {
      "min": null,
      "max": null,
      "pattern": ""
    }
  }))

  // remove
  collection.schema.removeField("j65p3jji")

  return dao.saveCollection(collection)
})
16
core/pb/pb_migrations/1715852342_updated_insights.js
Normal file
16
core/pb/pb_migrations/1715852342_updated_insights.js
Normal file
@ -0,0 +1,16 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
  const dao = new Dao(db)
  const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf")

  collection.listRule = "@request.auth.id != \"\" && @request.auth.tag:each ?~ tag:each"

  return dao.saveCollection(collection)
}, (db) => {
  const dao = new Dao(db)
  const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf")

  collection.listRule = null

  return dao.saveCollection(collection)
})
16
core/pb/pb_migrations/1715852638_updated_insights.js
Normal file
16
core/pb/pb_migrations/1715852638_updated_insights.js
Normal file
@ -0,0 +1,16 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
  const dao = new Dao(db)
  const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf")

  collection.viewRule = "@request.auth.id != \"\" && @request.auth.tag:each ?~ tag:each"

  return dao.saveCollection(collection)
}, (db) => {
  const dao = new Dao(db)
  const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf")

  collection.viewRule = null

  return dao.saveCollection(collection)
})
33
core/pb/pb_migrations/1715852847_updated_users.js
Normal file
33
core/pb/pb_migrations/1715852847_updated_users.js
Normal file
@ -0,0 +1,33 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
  const dao = new Dao(db)
  const collection = dao.findCollectionByNameOrId("_pb_users_auth_")

  // add
  collection.schema.addField(new SchemaField({
    "system": false,
    "id": "8d9woe75",
    "name": "tag",
    "type": "relation",
    "required": false,
    "presentable": false,
    "unique": false,
    "options": {
      "collectionId": "nvf6k0yoiclmytu",
      "cascadeDelete": false,
      "minSelect": null,
      "maxSelect": null,
      "displayFields": null
    }
  }))

  return dao.saveCollection(collection)
}, (db) => {
  const dao = new Dao(db)
  const collection = dao.findCollectionByNameOrId("_pb_users_auth_")

  // remove
  collection.schema.removeField("8d9woe75")

  return dao.saveCollection(collection)
})
33
core/pb/pb_migrations/1715852924_updated_articles.js
Normal file
33
core/pb/pb_migrations/1715852924_updated_articles.js
Normal file
@ -0,0 +1,33 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
  const dao = new Dao(db)
  const collection = dao.findCollectionByNameOrId("lft7642skuqmry7")

  // add
  collection.schema.addField(new SchemaField({
    "system": false,
    "id": "famdh2fv",
    "name": "tag",
    "type": "relation",
    "required": false,
    "presentable": false,
    "unique": false,
    "options": {
      "collectionId": "nvf6k0yoiclmytu",
      "cascadeDelete": false,
      "minSelect": null,
      "maxSelect": null,
      "displayFields": null
    }
  }))

  return dao.saveCollection(collection)
}, (db) => {
  const dao = new Dao(db)
  const collection = dao.findCollectionByNameOrId("lft7642skuqmry7")

  // remove
  collection.schema.removeField("famdh2fv")

  return dao.saveCollection(collection)
})
18
core/pb/pb_migrations/1715852932_updated_articles.js
Normal file
18
core/pb/pb_migrations/1715852932_updated_articles.js
Normal file
@ -0,0 +1,18 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
  const dao = new Dao(db)
  const collection = dao.findCollectionByNameOrId("lft7642skuqmry7")

  collection.listRule = "@request.auth.id != \"\" && @request.auth.tag:each ?~ tag:each"
  collection.viewRule = "@request.auth.id != \"\" && @request.auth.tag:each ?~ tag:each"

  return dao.saveCollection(collection)
}, (db) => {
  const dao = new Dao(db)
  const collection = dao.findCollectionByNameOrId("lft7642skuqmry7")

  collection.listRule = null
  collection.viewRule = null

  return dao.saveCollection(collection)
})
@ -0,0 +1,33 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
  const dao = new Dao(db)
  const collection = dao.findCollectionByNameOrId("bc3g5s66bcq1qjp")

  // add
  collection.schema.addField(new SchemaField({
    "system": false,
    "id": "lbxw5pra",
    "name": "tag",
    "type": "relation",
    "required": false,
    "presentable": false,
    "unique": false,
    "options": {
      "collectionId": "nvf6k0yoiclmytu",
      "cascadeDelete": false,
      "minSelect": null,
      "maxSelect": null,
      "displayFields": null
    }
  }))

  return dao.saveCollection(collection)
}, (db) => {
  const dao = new Dao(db)
  const collection = dao.findCollectionByNameOrId("bc3g5s66bcq1qjp")

  // remove
  collection.schema.removeField("lbxw5pra")

  return dao.saveCollection(collection)
})
@ -0,0 +1,18 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
  const dao = new Dao(db)
  const collection = dao.findCollectionByNameOrId("bc3g5s66bcq1qjp")

  collection.listRule = "@request.auth.id != \"\" && @request.auth.tag:each ?~ tag:each"
  collection.viewRule = "@request.auth.id != \"\" && @request.auth.tag:each ?~ tag:each"

  return dao.saveCollection(collection)
}, (db) => {
  const dao = new Dao(db)
  const collection = dao.findCollectionByNameOrId("bc3g5s66bcq1qjp")

  collection.listRule = null
  collection.viewRule = null

  return dao.saveCollection(collection)
})
44
core/pb/pb_migrations/1716165809_updated_tags.js
Normal file
44
core/pb/pb_migrations/1716165809_updated_tags.js
Normal file
@ -0,0 +1,44 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
  const dao = new Dao(db)
  const collection = dao.findCollectionByNameOrId("nvf6k0yoiclmytu")

  // update
  collection.schema.addField(new SchemaField({
    "system": false,
    "id": "0th8uax4",
    "name": "name",
    "type": "text",
    "required": true,
    "presentable": false,
    "unique": false,
    "options": {
      "min": null,
      "max": null,
      "pattern": ""
    }
  }))

  return dao.saveCollection(collection)
}, (db) => {
  const dao = new Dao(db)
  const collection = dao.findCollectionByNameOrId("nvf6k0yoiclmytu")

  // update
  collection.schema.addField(new SchemaField({
    "system": false,
    "id": "0th8uax4",
    "name": "name",
    "type": "text",
    "required": false,
    "presentable": false,
    "unique": false,
    "options": {
      "min": null,
      "max": null,
      "pattern": ""
    }
  }))

  return dao.saveCollection(collection)
})
48
core/pb/pb_migrations/1716168332_updated_insights.js
Normal file
48
core/pb/pb_migrations/1716168332_updated_insights.js
Normal file
@ -0,0 +1,48 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
  const dao = new Dao(db)
  const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf")

  // update
  collection.schema.addField(new SchemaField({
    "system": false,
    "id": "j65p3jji",
    "name": "tag",
    "type": "relation",
    "required": false,
    "presentable": false,
    "unique": false,
    "options": {
      "collectionId": "nvf6k0yoiclmytu",
      "cascadeDelete": false,
      "minSelect": null,
      "maxSelect": 1,
      "displayFields": null
    }
  }))

  return dao.saveCollection(collection)
}, (db) => {
  const dao = new Dao(db)
  const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf")

  // update
  collection.schema.addField(new SchemaField({
    "system": false,
    "id": "j65p3jji",
    "name": "tag",
    "type": "relation",
    "required": false,
    "presentable": false,
    "unique": false,
    "options": {
      "collectionId": "nvf6k0yoiclmytu",
      "cascadeDelete": false,
      "minSelect": null,
      "maxSelect": null,
      "displayFields": null
    }
  }))

  return dao.saveCollection(collection)
})
18
core/pb/pb_migrations/1717321896_updated_tags.js
Normal file
18
core/pb/pb_migrations/1717321896_updated_tags.js
Normal file
@ -0,0 +1,18 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
  const dao = new Dao(db)
  const collection = dao.findCollectionByNameOrId("nvf6k0yoiclmytu")

  collection.listRule = "@request.auth.id != \"\""
  collection.viewRule = "@request.auth.id != \"\""

  return dao.saveCollection(collection)
}, (db) => {
  const dao = new Dao(db)
  const collection = dao.findCollectionByNameOrId("nvf6k0yoiclmytu")

  collection.listRule = null
  collection.viewRule = null

  return dao.saveCollection(collection)
})
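Taken together, the rule migrations above mean a signed-in user only sees insights and articles whose `tag` relation overlaps the `tag` on their own user record, while the `tags` collection itself is readable by any authenticated user. A hedged sketch with the standard `pocketbase` Python client (the URL and credentials are placeholders, not part of this commit):

```python
# Sketch: what the tag-scoped list/view rules look like from the client side.
from pocketbase import PocketBase

client = PocketBase("http://127.0.0.1:8090")   # placeholder instance URL
client.collection("users").auth_with_password("user@example.com", "password")  # placeholder account

# Because listRule/viewRule check "@request.auth.tag:each ?~ tag:each",
# this call only returns insights that share a tag with the authenticated user.
res = client.collection("insights").get_list(1, 50, {"filter": ""})
for record in res.items:
    print(vars(record))
```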
6
core/scrapers/__init__.py
Normal file
6
core/scrapers/__init__.py
Normal file
@ -0,0 +1,6 @@
from .mp_crawler import mp_crawler
from .simple_crawler import simple_crawler
from .general_scraper import llm_crawler


scraper_map = {}
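`scraper_map` is left empty in this commit, so the registration convention is an assumption; one plausible scheme is keying dedicated scrapers by domain and falling back to the generic crawlers otherwise:

```python
# Hypothetical registration; the key format is not fixed anywhere in this commit.
from scrapers import scraper_map, mp_crawler, simple_crawler

scraper_map["mp.weixin.qq.com"] = mp_crawler

def crawl(url: str, logger):
    domain = url.split("//", 1)[-1].split("/", 1)[0]
    crawler = scraper_map.get(domain, simple_crawler)   # generic fallback
    return crawler(url, logger)
```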
@ -6,12 +6,16 @@ import httpx
from bs4 import BeautifulSoup
from bs4.element import Comment
from llms.dashscope_wrapper import dashscope_llm
# from llms.openai_wrapper import openai_llm
# from llms.siliconflow_wrapper import sfa_llm
from datetime import datetime, date
from requests.compat import urljoin
import chardet
from general_utils import extract_and_convert_dates
from utils.general_utils import extract_and_convert_dates


model = "qwen-long"
# model = "deepseek-chat"
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}

@ -62,7 +66,7 @@ def parse_html_content(out: str) -> dict:


# qwen1.5-72b is too error-prone when asked for JSON: real pages are messy (for example unescaped English quotes), so json.loads often fails on its output...
sys_info = '''你是一个html网页解析器,你将接收一段用户从网页html文件中提取的文本,请解析出其标题、摘要、内容和发布日期,发布日期格式为YYYY-MM-DD。
sys_info = '''你是一个html解析器,你将接收一段html代码,请解析出其标题、摘要、内容和发布日期,发布日期格式为YYYY-MM-DD。
结果请按照以下格式返回(整体用三引号包裹):
"""
标题||摘要||内容||发布日期XXXX-XX-XX
@ -105,7 +109,8 @@ def llm_crawler(url: str | Path, logger) -> (int, dict):
        {"role": "system", "content": sys_info},
        {"role": "user", "content": html_text}
    ]
    llm_output = dashscope_llm(messages, "qwen1.5-72b-chat", logger=logger)
    llm_output = dashscope_llm(messages, model=model, logger=logger)
    # llm_output = openai_llm(messages, model=model, logger=logger)
    try:
        info = parse_html_content(llm_output)
    except Exception:
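The revised prompt above asks the model for a triple-quoted `标题||摘要||内容||发布日期` block instead of JSON. `parse_html_content` itself sits outside this hunk, so the following is only a sketch of the parsing step that format implies:

```python
# Minimal sketch: split the '标题||摘要||内容||发布日期' block the prompt requests.
# The real parse_html_content in general_scraper.py may be stricter than this.
def parse_llm_block(out: str) -> dict:
    body = out.split('"""')[1] if '"""' in out else out     # drop the triple-quote wrapper
    parts = [p.strip() for p in body.split('||')]
    if len(parts) != 4:
        raise ValueError(f"unexpected LLM output: {out!r}")
    title, abstract, content, publish_time = parts
    return {"title": title, "abstract": abstract, "content": content,
            "publish_time": publish_time.replace("-", "")}
```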
109
core/scrapers/mp_crawler.py
Normal file
109
core/scrapers/mp_crawler.py
Normal file
@ -0,0 +1,109 @@
import httpx
from bs4 import BeautifulSoup
from datetime import datetime
import re


header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}


def mp_crawler(url: str, logger) -> (int, dict):
    if not url.startswith('https://mp.weixin.qq.com') and not url.startswith('http://mp.weixin.qq.com'):
        logger.warning(f'{url} is not a mp url, you should not use this function')
        return -5, {}

    url = url.replace("http://", "https://", 1)

    try:
        with httpx.Client() as client:
            response = client.get(url, headers=header, timeout=30)
    except Exception as e:
        logger.warning(f"cannot get content from {url}\n{e}")
        return -7, {}

    soup = BeautifulSoup(response.text, 'html.parser')

    # Get the original release date first
    pattern = r"var createTime = '(\d{4}-\d{2}-\d{2}) \d{2}:\d{2}'"
    match = re.search(pattern, response.text)

    if match:
        date_only = match.group(1)
        publish_time = date_only.replace('-', '')
    else:
        publish_time = datetime.strftime(datetime.today(), "%Y%m%d")

    # Get description content from <meta> tag
    try:
        meta_description = soup.find('meta', attrs={'name': 'description'})
        summary = meta_description['content'].strip() if meta_description else ''
        card_info = soup.find('div', id='img-content')
        # Parse the required content from the <div> tag
        rich_media_title = soup.find('h1', id='activity-name').text.strip() \
            if soup.find('h1', id='activity-name') \
            else soup.find('h1', class_='rich_media_title').text.strip()
        profile_nickname = card_info.find('strong', class_='profile_nickname').text.strip() \
            if card_info \
            else soup.find('div', class_='wx_follow_nickname').text.strip()
    except Exception as e:
        logger.warning(f"not mp format: {url}\n{e}")
        return -7, {}

    if not rich_media_title or not profile_nickname:
        logger.warning(f"failed to analysis {url}, no title or profile_nickname")
        # For mp.weixin.qq.com types, mp_crawler won't work, and most likely neither will the other two
        return -7, {}

    # Parse text and image links within the content interval
    # Todo: This scheme is compatible with picture-sharing MP articles, but the pictures of the content cannot be obtained,
    # because the structure of this part is completely different, and a separate analysis scheme needs to be written
    # (but the proportion of this type of article is not high).
    texts = []
    images = set()
    content_area = soup.find('div', id='js_content')
    if content_area:
        # extract text from the top-level sections
        for section in content_area.find_all(['section', 'p'], recursive=False):
            text = section.get_text(separator=' ', strip=True)
            if text and text not in texts:
                texts.append(text)

        for img in content_area.find_all('img', class_='rich_pages wxw-img'):
            img_src = img.get('data-src') or img.get('src')
            if img_src:
                images.add(img_src)
        cleaned_texts = [t for t in texts if t.strip()]
        content = '\n'.join(cleaned_texts)
    else:
        logger.warning(f"failed to analysis contents {url}")
        return 0, {}
    if content:
        content = f"({profile_nickname} 文章){content}"
    else:
        # If the content does not have it, but the summary has it, it means that it is an mp of the picture sharing type.
        # At this time, you can use the summary as the content.
        content = f"({profile_nickname} 文章){summary}"

    # Get links to images in meta property="og:image" and meta property="twitter:image"
    og_image = soup.find('meta', property='og:image')
    twitter_image = soup.find('meta', property='twitter:image')
    if og_image:
        images.add(og_image['content'])
    if twitter_image:
        images.add(twitter_image['content'])

    if rich_media_title == summary or not summary:
        abstract = ''
    else:
        abstract = f"({profile_nickname} 文章){rich_media_title}——{summary}"

    return 11, {
        'title': rich_media_title,
        'author': profile_nickname,
        'publish_time': publish_time,
        'abstract': abstract,
        'content': content,
        'images': list(images),
        'url': url,
    }
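A short usage sketch for `mp_crawler` (placeholder URL; the flag values follow the convention used throughout this commit: 11 success, -5 bad input, -7 fetch or format failure):

```python
from loguru import logger
from scrapers.mp_crawler import mp_crawler

flag, article = mp_crawler("https://mp.weixin.qq.com/s/xxxxxxxx", logger)   # placeholder URL
if flag == 11:
    print(article["title"], article["publish_time"])
elif flag == -7:
    print("fetch or page-format problem; get_search.py waits 60s and retries once in this case")
```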
@ -3,7 +3,7 @@ import httpx
from bs4 import BeautifulSoup
from datetime import datetime
from pathlib import Path
from general_utils import extract_and_convert_dates
from utils.general_utils import extract_and_convert_dates
import chardet

152
core/tasks.py
Normal file
152
core/tasks.py
Normal file
@ -0,0 +1,152 @@
"""
Edit this script to customize the background tasks you need.
"""
import schedule
import time
from topnews import pipeline
from loguru import logger
from utils.pb_api import PbTalker
import os
from utils.general_utils import get_logger_level
from datetime import datetime, timedelta
import pytz
import requests


project_dir = os.environ.get("PROJECT_DIR", "")
if project_dir:
    os.makedirs(project_dir, exist_ok=True)
logger_file = os.path.join(project_dir, 'tasks.log')
dsw_log = get_logger_level()
logger.add(
    logger_file,
    level=dsw_log,
    backtrace=True,
    diagnose=True,
    rotation="50 MB"
)

pb = PbTalker(logger)
utc_now = datetime.now(pytz.utc)
# subtract one day to get yesterday's UTC time
utc_yesterday = utc_now - timedelta(days=1)
utc_last = utc_yesterday.strftime("%Y-%m-%d %H:%M:%S")


def task():
    """
    global counter
    sites = pb.read('sites', filter='activated=True')
    urls = []
    for site in sites:
        if not site['per_hours'] or not site['url']:
            continue
        if counter % site['per_hours'] == 0:
            urls.append(site['url'])
    logger.info(f'\033[0;32m task execute loop {counter}\033[0m')
    logger.info(urls)
    if urls:
        sp(sites=urls)
    else:
        if counter % 24 == 0:
            sp()
        else:
            print('\033[0;33mno work for this loop\033[0m')
    counter += 1
    """
    global utc_last
    logger.debug(f'last_collect_time: {utc_last}')
    datas = pb.read(collection_name='insights', filter=f'updated>="{utc_last}"', fields=['id', 'content', 'tag', 'articles'])
    logger.debug(f"got {len(datas)} items")
    utc_last = datetime.now(pytz.utc).strftime("%Y-%m-%d %H:%M:%S")
    logger.debug(f'now_utc_time: {utc_last}')

    tags = pb.read(collection_name='tags', filter=f'activated=True')
    tags_dict = {item["id"]: item["name"] for item in tags if item["name"]}
    top_news = {}
    for id, name in tags_dict.items():
        logger.debug(f'tag: {name}')
        data = [item for item in datas if item['tag'] == id]
        topnew = pipeline(data, logger)
        if not topnew:
            logger.debug(f'no top news for {name}')
            continue

        top_news[id] = {}
        for content, articles in topnew.items():
            content_urls = [pb.read('articles', filter=f'id="{a}"', fields=['url'])[0]['url'] for a in articles]
            # drop overlapping items:
            # when two tags share an article, the tag holding more entries loses the overlapping one
            to_skip = False
            for k, v in top_news.items():
                to_del_key = None
                for c, u in v.items():
                    if not set(content_urls).isdisjoint(set(u)):
                        if len(topnew) > len(v):
                            to_skip = True
                        else:
                            to_del_key = c
                        break
                if to_del_key:
                    del top_news[k][to_del_key]
                if to_skip:
                    break
            if not to_skip:
                top_news[id][content] = content_urls

        if not top_news[id]:
            del top_news[id]

    if not top_news:
        logger.info("no top news today")
        return

    # serialize to a string
    top_news_text = {"#党建引领基层治理": [],
                     "#数字社区": [],
                     "#优秀活动案例": []}

    for id, v in top_news.items():
        # top_news[id] = {content: '\n\n'.join(urls) for content, urls in v.items()}
        top_news[id] = {content: urls[0] for content, urls in v.items()}
        if id == 's3kqj9ek8nvtthr':
            top_news_text["#数字社区"].append("\n".join(f"{content}\n{urls}" for content, urls in top_news[id].items()))
        elif id == 'qpcgotbqyz3a617':
            top_news_text["#优秀活动案例"].append("\n".join(f"{content}\n{urls}" for content, urls in top_news[id].items()))
        else:
            top_news_text["#党建引领基层治理"].append("\n".join(f"{content}\n{urls}" for content, urls in top_news[id].items()))

    top_news_text = {k: "\n".join(v) for k, v in top_news_text.items()}
    top_news_text = "\n\n".join(f"{k}\n{v}" for k, v in top_news_text.items())
    logger.info(top_news_text)

    data = {
        "wxid": "R:10860349446619856",
        "content": top_news_text
    }
    try:
        response = requests.post("http://localhost:8088/api/sendtxtmsg", json=data)
        if response.status_code == 200:
            logger.info("send message to wechat success")
            time.sleep(1)
            data = {
                "wxid": "R:10860349446619856",
                "content": "[太阳] 今日份的临小助内参来啦!",
                "atlist": ["@all"]
            }
            try:
                response = requests.post("http://localhost:8088/api/sendtxtmsg", json=data)
                if response.status_code == 200:
                    logger.info("send notify to wechat success")
            except Exception as e:
                logger.error(f"send notify to wechat failed: {e}")
    except Exception as e:
        logger.error(f"send message to wechat failed: {e}")


schedule.every().day.at("07:38").do(task)

task()
while True:
    schedule.run_pending()
    time.sleep(60)
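The module docstring invites editing `tasks.py`; as a hedged illustration only, the same `schedule` API supports other cadences than the hard-coded 07:38 daily run:

```python
# Alternative scheduling sketches (not part of the committed file); task() is the function defined above.
import os
import time
import schedule

schedule.every(4).hours.do(task)
# schedule.every().day.at(os.environ.get("TASK_AT", "07:38")).do(task)

while True:
    schedule.run_pending()
    time.sleep(60)
```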
0
core/utils/__init__.py
Normal file
0
core/utils/__init__.py
Normal file
@ -6,6 +6,7 @@ from urllib.parse import urlparse
import time
import os
import re
import jieba


def isURL(string):
@ -13,6 +14,15 @@ def isURL(string):
    return result.scheme != '' and result.netloc != ''


def extract_urls(text):
    url_pattern = re.compile(r'https?://[-A-Za-z0-9+&@#/%?=~_|!:.;]+[-A-Za-z0-9+&@#/%=~_|]')
    urls = re.findall(url_pattern, text)

    # filter out matches that only hit 'www.' with nothing after it, and try adding a default http prefix to each URL so it can be parsed
    cleaned_urls = [url for url in urls if isURL(url)]
    return cleaned_urls


def isChinesePunctuation(char):
    # Unicode ranges that cover Chinese punctuation
    chinese_punctuations = set(range(0x3000, 0x303F)) | set(range(0xFF00, 0xFFEF))
@ -162,6 +172,29 @@ def get_logger_level() -> str:
    return level_map.get(level, 'info')


def compare_phrase_with_list(target_phrase, phrase_list, threshold):
    """
    Compare a target phrase against every phrase in a list and measure similarity.

    :param target_phrase: target phrase (str)
    :param phrase_list: list of phrases (list of str)
    :param threshold: similarity threshold (float)
    :return: phrases that pass the similarity threshold (list of str)
    """
    # empty target phrase: nothing to compare
    if not target_phrase:
        return []

    # pre-processing: tokenize the target phrase and every candidate with jieba
    target_tokens = set(jieba.lcut(target_phrase))
    tokenized_phrases = {phrase: set(jieba.lcut(phrase)) for phrase in phrase_list}

    # compare and filter
    similar_phrases = [phrase for phrase, tokens in tokenized_phrases.items()
                       if len(target_tokens & tokens) / min(len(target_tokens), len(tokens)) > threshold]

    return similar_phrases

"""
# from InternLM/huixiangdou
# another awesome work
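Quick usage sketch for the two helpers added to `utils/general_utils.py` (sample strings are illustrative; the exact jieba tokenization decides what passes the threshold):

```python
from utils.general_utils import extract_urls, compare_phrase_with_list

text = "详见 https://github.com/TeamWiseFlow/wiseflow 以及 http://example.com/post?id=3"
print(extract_urls(text))
# -> ['https://github.com/TeamWiseFlow/wiseflow', 'http://example.com/post?id=3']

# keeps the phrases whose jieba-token overlap ratio with the target exceeds the threshold
print(compare_phrase_with_list("数字社区治理", ["数字社区", "优秀活动案例", "无关内容"], 0.5))
```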
@ -13,28 +13,28 @@ class PbTalker:
        self.client = PocketBase(url)
        auth = os.environ.get('PB_API_AUTH', '')
        if not auth or "|" not in auth:
            self.logger.warning(f"invalid email|password found, will handle with not auth, make sure you have set the collection rule by anyone")
            self.logger.warning("invalid email|password found, will handle with not auth, make sure you have set the collection rule by anyone")
        else:
            email, password = auth.split('|')
            _ = self.client.admins.auth_with_password(email, password)
            if _:
                self.logger.info(f"pocketbase ready authenticated as admin - {email}")
            else:
                raise Exception(f"pocketbase auth failed")
            try:
                admin_data = self.client.admins.auth_with_password(email, password)
                if admin_data:
                    self.logger.info(f"pocketbase ready authenticated as admin - {email}")
            except Exception:
                user_data = self.client.collection("users").auth_with_password(email, password)
                if user_data:
                    self.logger.info(f"pocketbase ready authenticated as user - {email}")
                else:
                    raise Exception("pocketbase auth failed")

    def read(self, collection_name: str, fields: list[str] = None, filter: str = '', skiptotal: bool = True) -> list:
        results = []
        for i in range(1, 10):
            try:
                if fields:
                    res = self.client.collection(collection_name).get_list(i, 500,
                                                                           {"filter": filter,
                                                                            "fields": ','.join(fields),
                                                                            "skiptotal": skiptotal})
                else:
                    res = self.client.collection(collection_name).get_list(i, 500,
                                                                           {"filter": filter,
                                                                            "skiptotal": skiptotal})
                res = self.client.collection(collection_name).get_list(i, 500,
                                                                       {"filter": filter,
                                                                        "fields": ','.join(fields) if fields else '',
                                                                        "skiptotal": skiptotal})

            except Exception as e:
                self.logger.error(f"pocketbase get list failed: {e}")
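Usage sketch for the consolidated `PbTalker.read()` call above (field and filter values are placeholders):

```python
from loguru import logger
from utils.pb_api import PbTalker

pb = PbTalker(logger)                 # PB_API_AUTH ("email|password") comes from the environment
articles = pb.read('articles', fields=['url', 'title'], filter='tag != ""')
for a in articles:
    print(a.get('title'), a.get('url'))
```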
@ -79,3 +79,11 @@ class PbTalker:
            self.logger.error(f"pocketbase update failed: {e}")
            return ''
        return res.id

    def view(self, collection_name: str, item_id: str, fields: list[str] = None) -> dict:
        try:
            res = self.client.collection(collection_name).get_one(item_id, {"fields": ','.join(fields) if fields else ''})
            return vars(res)
        except Exception as e:
            self.logger.error(f"pocketbase view item failed: {e}")
            return {}
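The new `PbTalker.view()` fetches a single record by id and returns `{}` on failure; continuing the sketch above:

```python
item = pb.view('articles', 'RECORD_ID_HERE', fields=['url', 'title'])   # placeholder id
if item:
    print(item.get('title'), item.get('url'))
else:
    print('not found, or the request failed')
```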
71
dashboard/README.md
Normal file
71
dashboard/README.md
Normal file
@ -0,0 +1,71 @@
**Included Web Dashboard Example**: This is optional. If you only use the data processing functions or have your own downstream task program, you can ignore everything in this folder!

## Main Features

1. Daily Insights Display
2. Daily Article Display
3. Appending Search for Specific Hot Topics (using the Sogou engine)
4. Generating Word Reports for Specific Hot Topics

**Note: The code here cannot be used directly. It is adapted to an older version of the backend. You need to study the latest backend code in the `core` folder and make changes, especially in parts related to database integration!**

-----------------------------------------------------------------

附带的web Dashboard 示例,并非必须,如果你只是使用数据处理功能,或者你有自己的下游任务程序,可以忽略这个文件夹内的一切!

## 主要功能

1. 每日insights展示
2. 每日文章展示
3. 指定热点追加搜索(使用sogou引擎)
4. 指定热点生成word报告

**注意:这里的代码并不能直接使用,它适配的是旧版本的后端程序,你需要研究core文件夹下的最新后端代码,进行更改,尤其是跟数据库对接的部分!**

-----------------------------------------------------------------

**付属のWebダッシュボードのサンプル**:これは必須ではありません。データ処理機能のみを使用する場合、または独自の下流タスクプログラムを持っている場合は、このフォルダ内のすべてを無視できます!

## 主な機能

1. 毎日のインサイト表示
2. 毎日の記事表示
3. 特定のホットトピックの追加検索(Sogouエンジンを使用)
4. 特定のホットトピックのWordレポートの生成

**注意:ここにあるコードは直接使用できません。古いバージョンのバックエンドに適合しています。`core`フォルダ内の最新のバックエンドコードを調べ、特にデータベースとの連携部分について変更を行う必要があります!**

-----------------------------------------------------------------

**Exemple de tableau de bord Web inclus** : Ceci est facultatif. Si vous n'utilisez que les fonctions de traitement des données ou si vous avez votre propre programme de tâches en aval, vous pouvez ignorer tout ce qui se trouve dans ce dossier !

## Fonctions principales

1. Affichage des insights quotidiens
2. Affichage des articles quotidiens
3. Recherche supplémentaire pour des sujets populaires spécifiques (en utilisant le moteur Sogou)
4. Génération de rapports Word pour des sujets populaires spécifiques

**Remarque : Le code ici ne peut pas être utilisé directement. Il est adapté à une version plus ancienne du backend. Vous devez étudier le code backend le plus récent dans le dossier `core` et apporter des modifications, en particulier dans les parties relatives à l'intégration de la base de données !**

-----------------------------------------------------------------

**Beispiel eines enthaltenen Web-Dashboards**: Dies ist optional. Wenn Sie nur die Datenverarbeitungsfunktionen verwenden oder Ihr eigenes Downstream-Aufgabenprogramm haben, können Sie alles in diesem Ordner ignorieren!

## Hauptfunktionen

1. Tägliche Einblicke anzeigen
2. Tägliche Artikel anzeigen
3. Angehängte Suche nach spezifischen Hot Topics (unter Verwendung der Sogou-Suchmaschine)
4. Erstellen von Word-Berichten für spezifische Hot Topics

**Hinweis: Der Code hier kann nicht direkt verwendet werden. Er ist an eine ältere Version des Backends angepasst. Sie müssen den neuesten Backend-Code im `core`-Ordner studieren und Änderungen vornehmen, insbesondere in den Teilen, die die Datenbankintegration betreffen!**
@ -2,7 +2,6 @@ import os
import time
import json
import uuid
from pb_api import PbTalker
from get_report import get_report, logger, pb
from get_search import search_insight
from tranlsation_volcengine import text_translate
@ -22,12 +21,6 @@ class BackendService:
        logger.info('backend service init success.')

    def report(self, insight_id: str, topics: list[str], comment: str) -> dict:
        """
        :param insight_id: id of the insight in pb
        :param topics: required; report title and outline. The first element is the title, the rest are section headings. An empty list is allowed, in which case the LLM improvises.
        :param comment: revision notes; an empty string is allowed
        :return: on success, the updated insight_id (same as the original id); on failure, an empty string
        """
        logger.debug(f'got new report request insight_id {insight_id}')
        insight = pb.read('insights', filter=f'id="{insight_id}"')
        if not insight:
@ -48,8 +41,6 @@ class BackendService:
            return self.build_out(-2, f'{insight_id} has no valid articles')

        content = insight[0]['content']
        # Originally the content of all related articles was translated into Chinese for the analysis, because some domain terms are maintained in the Volcano account glossary and the LLM does not know them.
        # However, translating to Chinese raised the chance of tripping DashScope's sensitive-word filter, so this was dropped for now...
        if insight_id in self.memory:
            memory = self.memory[insight_id]
        else:
@ -78,11 +69,7 @@ class BackendService:

    def translate(self, article_ids: list[str]) -> dict:
        """
        :param article_ids: list of article ids to translate
        :return: flag 11 on success. A negative flag is an error, but part of the work may still have completed, so the call can be retried later.
        The msg field in the response records any errors.
        This function walks the id list; if an article id has no translation_result yet, it triggers translation and updates that article record.
        After this function returns flag 11, the translation_result for each article id can be fetched from pb again.
        just for chinese users
        """
        logger.debug(f'got new translate task {article_ids}')
        flag = 11
@ -155,10 +142,6 @@ class BackendService:
        return self.build_out(flag, msg)

    def more_search(self, insight_id: str) -> dict:
        """
        :param insight_id: id of the insight in pb
        :return: on success, the updated insight_id (same as the original id); on failure, an empty string
        """
        logger.debug(f'got search request for insight: {insight_id}')
        insight = pb.read('insights', filter=f'id="{insight_id}"')
        if not insight:
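A sketch of driving `BackendService` directly with the signatures shown above (ids and topics are placeholders; per the docstrings, flag 11 signals success and negative flags are errors):

```python
from __init__ import BackendService   # imported this way by backend.py below

bs = BackendService()

out = bs.report(insight_id="INSIGHT_ID_HERE", topics=["报告标题", "背景", "建议"], comment="")
print(out)

out = bs.translate(article_ids=["ARTICLE_ID_1", "ARTICLE_ID_2"])
print(out)   # after flag 11, translation_result can be re-read from pb for each article id
```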
65
dashboard/general_utils.py
Normal file
65
dashboard/general_utils.py
Normal file
@ -0,0 +1,65 @@
from urllib.parse import urlparse
import os
import re


def isURL(string):
    result = urlparse(string)
    return result.scheme != '' and result.netloc != ''


def isChinesePunctuation(char):
    # Unicode ranges that cover Chinese punctuation
    chinese_punctuations = set(range(0x3000, 0x303F)) | set(range(0xFF00, 0xFFEF))
    # check whether the character falls in those ranges
    return ord(char) in chinese_punctuations


def is_chinese(string):
    """
    Volcano Engine could support much broader language detection; see https://www.volcengine.com/docs/4640/65066 as a future option.
    Judge whether a string is mostly Chinese.
    :param string: {str} the string to check
    :return: {bool} True if it is mostly Chinese, otherwise False
    """
    pattern = re.compile(r'[^\u4e00-\u9fa5]')
    non_chinese_count = len(pattern.findall(string))
    # It is easy to misjudge strictly according to the number of bytes less than half. English words account for a large number of bytes, and there are punctuation marks, etc.
    return (non_chinese_count/len(string)) < 0.68


def extract_and_convert_dates(input_string):
    # Define regular expressions that match different date formats
    patterns = [
        r'(\d{4})-(\d{2})-(\d{2})',  # YYYY-MM-DD
        r'(\d{4})/(\d{2})/(\d{2})',  # YYYY/MM/DD
        r'(\d{4})\.(\d{2})\.(\d{2})',  # YYYY.MM.DD
        r'(\d{4})\\(\d{2})\\(\d{2})',  # YYYY\MM\DD
        r'(\d{4})(\d{2})(\d{2})'  # YYYYMMDD
    ]

    matches = []
    for pattern in patterns:
        matches = re.findall(pattern, input_string)
        if matches:
            break
    if matches:
        return ''.join(matches[0])
    return None


def get_logger_level() -> str:
    level_map = {
        'silly': 'CRITICAL',
        'verbose': 'DEBUG',
        'info': 'INFO',
        'warn': 'WARNING',
        'error': 'ERROR',
    }
    level: str = os.environ.get('WS_LOG', 'info').lower()
    if level not in level_map:
        raise ValueError(
            'WiseFlow LOG should support the values of `silly`, '
            '`verbose`, `info`, `warn`, `error`'
        )
    return level_map.get(level, 'info')
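Usage sketch for `extract_and_convert_dates()`, which normalizes the formats listed above to `YYYYMMDD`:

```python
from general_utils import extract_and_convert_dates   # dashboard-local copy

for raw in ("2024-05-16", "2024/05/16", "2024.05.16", "20240516", "no date here"):
    print(raw, "->", extract_and_convert_dates(raw))
# the first four all normalize to '20240516'; the last returns None
```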
@ -1,7 +1,7 @@
import random
import re
import os
from llms.dashscope_wrapper import dashscope_llm
from backend.llms.dashscope_wrapper import dashscope_llm
from docx import Document
from docx.oxml.ns import qn
from docx.shared import Pt, RGBColor
@ -1,31 +1,21 @@
from scrapers.simple_crawler import simple_crawler
from .simple_crawler import simple_crawler
from .mp_crawler import mp_crawler
from typing import Union
from pathlib import Path
import requests
import re
import json
from urllib.parse import quote, urlparse
from urllib.parse import quote
from bs4 import BeautifulSoup
import time


# For use inside China, Sogou search works well enough and also covers WeChat and Baike search
# For overseas deployments, consider the duckduckgo or google_search SDKs instead
# Better not to self-host a search engine; there are open-source options like https://github.com/StractOrg/stract/tree/main, but that is a whole second project to maintain
def search_insight(keyword: str, logger, exist_urls: list[Union[str, Path]], knowledge: bool = False) -> (int, list):
    """
    Search the web.
    :param keyword: topic to search for
    :param exist_urls: URLs that already exist; results that match them are skipped
    :param knowledge: whether to search for conceptual knowledge
    :param logger: logger
    :return: a flag and a list of article dicts; a negative flag is an error, 0 means no results, 11 means success
    """

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.44",
    }
    # 如果knowledge参数为真,则意味着搜索概念知识,这时只会搜索sogou百科
    # 默认是搜索新闻资讯,同时搜索sogou网页和资讯
    # If the knowledge parameter is true, it means searching for conceptual knowledge, then only sogou encyclopedia will be searched
    # The default is to search for news information, and search for sogou pages and information at the same time
    if knowledge:
        url = f"https://www.sogou.com/sogou?query={keyword}&insite=baike.sogou.com"
    else:
@ -74,24 +64,24 @@ def search_insight(keyword: str, logger, exist_urls: list[Union[str, Path]], knowledge: bool = False) -> (int, list):
    if not relist:
        return -7, []

    # only simple_crawler was used here, because search needs to be fast
    results = []
    for url in relist:
        if url in exist_urls:
            continue
        exist_urls.append(url)
        flag, value = simple_crawler(url, logger)
        if url.startswith('https://mp.weixin.qq.com') or url.startswith('http://mp.weixin.qq.com'):
            flag, article = mp_crawler(url, logger)
            if flag == -7:
                logger.info(f"fetch {url} failed, try to wait 1min and try again")
                time.sleep(60)
                flag, article = mp_crawler(url, logger)
        else:
            flag, article = simple_crawler(url, logger)

        if flag != 11:
            continue
        from_site = urlparse(url).netloc
        if from_site.startswith('www.'):
            from_site = from_site.replace('www.', '')
        from_site = from_site.split('.')[0]
        if value['abstract']:
            value['abstract'] = f"({from_site} 报道){value['abstract']}"
        value['content'] = f"({from_site} 报道){value['content']}"
        value['images'] = json.dumps(value['images'])
        results.append(value)

        results.append(article)

    if results:
        return 11, results
@ -102,7 +92,7 @@ def redirect_url(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
    }
    r = requests.get(url, headers=headers, allow_redirects=False)  # redirects are not followed
    r = requests.get(url, headers=headers, allow_redirects=False)
    if r.status_code == 302:
        real_url = r.headers.get('Location')
    else:
@ -1,4 +1,3 @@
# FastAPI app for the backend service
from fastapi import FastAPI
from pydantic import BaseModel
from __init__ import BackendService
@ -17,13 +16,13 @@ class TranslateRequest(BaseModel):

class ReportRequest(BaseModel):
    insight_id: str
    toc: list[str] = [""]  # 第一个元素为大标题,其余为段落标题。第一个元素必须存在,可以是空字符,llm会自动拟标题。
    toc: list[str] = [""]  # The first element is a headline, and the rest are paragraph headings. The first element must exist, can be a null character, and llm will automatically make headings.
    comment: str = ""


app = FastAPI(
    title="首席情报官 Backend Server",
    description="From DSW Team.",
    title="wiseflow Backend Server",
    description="From WiseFlow Team.",
    version="0.2",
    openapi_url="/openapi.json"
)
@ -36,13 +35,12 @@ app.add_middleware(
    allow_headers=["*"],
)

# if several backend services are needed, define multiple instances here
bs = BackendService()


@app.get("/")
def read_root():
    msg = "Hello, 欢迎使用首席情报官 Backend."
    msg = "Hello, This is WiseFlow Backend."
    return {"msg": msg}

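For reference, a minimal driver for the updated `search_insight()` above (placeholder keyword; flag semantics follow its docstring: negative is an error, 0 is no result, 11 is success):

```python
from loguru import logger
from get_search import search_insight

flag, articles = search_insight("数字社区", logger, exist_urls=[], knowledge=False)
if flag == 11:
    for a in articles:
        print(a.get("title"), a.get("url"))
elif flag == 0:
    print("search ran but found nothing new")
else:
    print(f"search failed, flag={flag}")
```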
109
dashboard/mp_crawler.py
Normal file
109
dashboard/mp_crawler.py
Normal file
@ -0,0 +1,109 @@
import httpx
from bs4 import BeautifulSoup
from datetime import datetime
import re


header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}


def mp_crawler(url: str, logger) -> (int, dict):
    if not url.startswith('https://mp.weixin.qq.com') and not url.startswith('http://mp.weixin.qq.com'):
        logger.warning(f'{url} is not a mp url, you should not use this function')
        return -5, {}

    url = url.replace("http://", "https://", 1)

    try:
        with httpx.Client() as client:
            response = client.get(url, headers=header, timeout=30)
    except Exception as e:
        logger.warning(f"cannot get content from {url}\n{e}")
        return -7, {}

    soup = BeautifulSoup(response.text, 'html.parser')

    # Get the original release date first
    pattern = r"var createTime = '(\d{4}-\d{2}-\d{2}) \d{2}:\d{2}'"
    match = re.search(pattern, response.text)

    if match:
        date_only = match.group(1)
        publish_time = date_only.replace('-', '')
    else:
        publish_time = datetime.strftime(datetime.today(), "%Y%m%d")

    # Get description content from <meta> tag
    try:
        meta_description = soup.find('meta', attrs={'name': 'description'})
        summary = meta_description['content'].strip() if meta_description else ''
        card_info = soup.find('div', id='img-content')
        # Parse the required content from the <div> tag
        rich_media_title = soup.find('h1', id='activity-name').text.strip() \
            if soup.find('h1', id='activity-name') \
            else soup.find('h1', class_='rich_media_title').text.strip()
        profile_nickname = card_info.find('strong', class_='profile_nickname').text.strip() \
            if card_info \
            else soup.find('div', class_='wx_follow_nickname').text.strip()
    except Exception as e:
        logger.warning(f"not mp format: {url}\n{e}")
        return -7, {}

    if not rich_media_title or not profile_nickname:
        logger.warning(f"failed to analysis {url}, no title or profile_nickname")
        # For mp.weixin.qq.com types, mp_crawler won't work, and most likely neither will the other two
        return -7, {}

    # Parse text and image links within the content interval
    # Todo: This scheme is compatible with picture-sharing MP articles, but the pictures of the content cannot be obtained,
    # because the structure of this part is completely different, and a separate analysis scheme needs to be written
    # (but the proportion of this type of article is not high).
    texts = []
    images = set()
    content_area = soup.find('div', id='js_content')
    if content_area:
        # extract text from the top-level sections
        for section in content_area.find_all(['section', 'p'], recursive=False):
            text = section.get_text(separator=' ', strip=True)
            if text and text not in texts:
                texts.append(text)

        for img in content_area.find_all('img', class_='rich_pages wxw-img'):
            img_src = img.get('data-src') or img.get('src')
            if img_src:
                images.add(img_src)
        cleaned_texts = [t for t in texts if t.strip()]
        content = '\n'.join(cleaned_texts)
    else:
        logger.warning(f"failed to analysis contents {url}")
        return 0, {}
    if content:
        content = f"({profile_nickname} 文章){content}"
    else:
        # If the content does not have it, but the summary has it, it means that it is an mp of the picture sharing type.
        # At this time, you can use the summary as the content.
        content = f"({profile_nickname} 文章){summary}"

    # Get links to images in meta property="og:image" and meta property="twitter:image"
    og_image = soup.find('meta', property='og:image')
    twitter_image = soup.find('meta', property='twitter:image')
    if og_image:
        images.add(og_image['content'])
    if twitter_image:
        images.add(twitter_image['content'])

    if rich_media_title == summary or not summary:
        abstract = ''
    else:
        abstract = f"({profile_nickname} 文章){rich_media_title}——{summary}"

    return 11, {
        'title': rich_media_title,
        'author': profile_nickname,
        'publish_time': publish_time,
        'abstract': abstract,
        'content': content,
        'images': list(images),
        'url': url,
    }
60
dashboard/simple_crawler.py
Normal file
60
dashboard/simple_crawler.py
Normal file
@ -0,0 +1,60 @@
from gne import GeneralNewsExtractor
import httpx
from bs4 import BeautifulSoup
from datetime import datetime
from pathlib import Path
from utils.general_utils import extract_and_convert_dates
import chardet


extractor = GeneralNewsExtractor()
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}


def simple_crawler(url: str | Path, logger) -> (int, dict):
    """
    Return article information dict and flag, negative number is error, 0 is no result, 11 is success
    """
    try:
        with httpx.Client() as client:
            response = client.get(url, headers=header, timeout=30)
        rawdata = response.content
        encoding = chardet.detect(rawdata)['encoding']
        text = rawdata.decode(encoding)
        result = extractor.extract(text)
    except Exception as e:
        logger.warning(f"cannot get content from {url}\n{e}")
        return -7, {}

    if not result:
        logger.error(f"gne cannot extract {url}")
        return 0, {}

    if len(result['title']) < 4 or len(result['content']) < 24:
        logger.info(f"{result} not valid")
        return 0, {}

    if result['title'].startswith('服务器错误') or result['title'].startswith('您访问的页面') or result['title'].startswith('403') \
            or result['content'].startswith('This website uses cookies') or result['title'].startswith('出错了'):
        logger.warning(f"can not get {url} from the Internet")
        return -7, {}

    date_str = extract_and_convert_dates(result['publish_time'])
    if date_str:
        result['publish_time'] = date_str
    else:
        result['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")

    soup = BeautifulSoup(text, "html.parser")
    try:
        meta_description = soup.find("meta", {"name": "description"})
        if meta_description:
            result['abstract'] = meta_description["content"].strip()
        else:
            result['abstract'] = ''
    except Exception:
        result['abstract'] = ''

    result['url'] = str(url)
    return 11, result
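And a matching sketch for the GNE-based `simple_crawler` (placeholder URL):

```python
from loguru import logger
from simple_crawler import simple_crawler

flag, article = simple_crawler("https://example.com/news/some-story", logger)   # placeholder URL
if flag == 11:
    print(article["title"], article["publish_time"], article.get("abstract", "")[:60])
else:
    print(f"no usable result, flag={flag}")
```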
@ -1,10 +1,11 @@
# 使用火山引擎进行翻译的接口封装
# 通过环境变量设置VOLC_KEY,格式为AK|SK
# AK-SK 需要手机号注册并实名认证,具体见这里https://console.volcengine.com/iam/keymanage/ (自助接入)
# 费用:每月免费额度200万字符(1个汉字、1个外语字母、1个数字、1个符号或空格都计为一个字符),超出后49元/每百万字符
# 图片翻译:每月免费100张,超出后0.04元/张
# 文本翻译并发限制,每个batch最多16个,总文本长度不超过5000字符,max QPS为10
# 术语库管理:https://console.volcengine.com/translate
# Interface encapsulation for translation using Volcano Engine
# Set VOLC_KEY by environment variables in the format AK | SK
# AK-SK requires mobile phone number registration and real-name authentication, see here https://console.volcengine.com/iam/keymanage/ (self-service access)
# Cost: Monthly free limit 2 million characters (1 Chinese character, 1 foreign language letter, 1 number, 1 symbol or space are counted as one character),
# exceeding 49 yuan/per million characters
# Picture translation: 100 pieces per month for free, 0.04 yuan/piece after exceeding
# Text translation concurrency limit, up to 16 per batch, the total text length does not exceed 5000 characters, max QPS is 10
# Terminology database management: https://console.volcengine.com/translate


import json
@ -18,7 +19,7 @@ from volcengine.base.Service import Service

VOLC_KEY = os.environ.get('VOLC_KEY', None)
if not VOLC_KEY:
    raise Exception('请设置环境变量 VOLC_KEY,格式为AK|SK')
    raise Exception('Please set environment variables VOLC_KEY format as AK | SK')

k_access_key, k_secret_key = VOLC_KEY.split('|')