web dashboard

This commit is contained in:
bigbrother666 2024-06-13 21:08:58 +08:00
parent 6805be7d14
commit fd633ae54d
116 changed files with 2597 additions and 996 deletions

4
.gitignore vendored
View File

@ -4,3 +4,7 @@
.DS_Store
.idea/
__pycache__
.env
.venv/
core/pb/pb_data/
core/WStest/

Binary image file not shown (was 64 KiB).

4
client/.gitignore vendored
View File

@ -1,4 +0,0 @@
.env
.venv/
pb/pb_data/
backend/WStest/

View File

@ -1,19 +0,0 @@
FROM python:3.10-slim
RUN apt-get update && \
apt-get install -yq tzdata build-essential
RUN ln -fs /usr/share/zoneinfo/Asia/Shanghai /etc/localtime
WORKDIR /app
COPY backend/requirements.txt requirements.txt
RUN pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
COPY backend .
EXPOSE 7777
CMD tail -f /dev/null
# ENTRYPOINT ["bash", "docker_entrypoint.sh"]

View File

@ -1,35 +0,0 @@
FROM node:20-slim as builder
WORKDIR /app
COPY web ./
RUN npm install -g pnpm
RUN pnpm install
RUN pnpm build
FROM alpine:latest
ARG PB_VERSION=0.21.1
RUN apk add --no-cache unzip ca-certificates tzdata && \
ln -fs /usr/share/zoneinfo/Asia/Shanghai /etc/localtime
# download and unzip PocketBase
ADD https://github.com/pocketbase/pocketbase/releases/download/v${PB_VERSION}/pocketbase_${PB_VERSION}_linux_amd64.zip /tmp/pb.zip
RUN unzip /tmp/pb.zip -d /pb/
RUN mkdir -p /pb
COPY ./pb/pb_migrations /pb/pb_migrations
COPY ./pb/pb_hooks /pb/pb_hooks
COPY --from=builder /app/dist /pb/pb_public
WORKDIR /pb
EXPOSE 8090
CMD tail -f /dev/null
# CMD ["/pb/pocketbase", "serve", "--http=0.0.0.0:8090"]

View File

@ -1,86 +0,0 @@
# WiseFlow Client Backend
# for developers
## Deployment
1. It is recommended to create a fresh environment with **Python 3.10**.
2. Install requirements.txt.
## Starting the database separately (first download PocketBase for your platform, or build and start the web docker image instead)
pocketbase [download page](https://pocketbase.io/docs/)
Place the binary in the backend/pb directory.
```bash
chmod +x pocketbase
./pocketbase serve
```
Then set the pb service address as the environment variable PB_API_BASE.
If you prefer docker, see the docker files under the client folder.
Keep the pb_migrations folder under the pb directory in sync with the repo; the database automatically creates the collections this project needs, and a mismatch may cause later steps to fail.
pb_data is the directory where the database data is stored; if you change the admin password, remember to update .env.
## Script files
- tasks.sh # starts the scheduled tasks (optional when you are only debugging the backend locally)
- backend.sh # starts the backend service (defaults to localhost:7777; see http://localhost:7777/docs/ for the API details)
Note: the backend service always returns a dict: `{"flag": int, "result": [{"type": "text", "content": "xxx"}]}`
Agreed flag codes:
| flag code | meaning |
|--------|-----------------|
| -11 | LLM error/exception |
| -7 | network request failed |
| -6 | translation API failed |
| -5 | malformed input parameters |
| -4 | embedding model error |
| -3 | (reserved) |
| -2 | pb database API failed |
| -1 | unknown error |
| 0 | normal return |
| 1 | (reserved) |
| 2 | (reserved) |
| 3 | (reserved) |
| 11 | the user's flow finished normally |
| 21 | a new file was generated |
Notes: 1. An HTTP 200 on the request only means the submission succeeded, not that the backend finished processing; **only flag 11 means the flow completed normally** and every computation succeeded.
2. flag 0 usually means that all computations ran but produced nothing: no new file and no new records written to the database.
3. The translation endpoint works in batches, so part of a batch can succeed (those results are written to the database and linked) while other items fail; even without flag 11 it is therefore worth re-reading the data from pb. A client-side sketch of interpreting these codes follows below.
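As a quick client-side illustration, interpreting these flag codes might look like the sketch below (the endpoint path and payload fields are placeholders, not the real routes; check http://localhost:7777/docs/ for the actual API):
```python
import requests

# hypothetical endpoint and payload -- consult http://localhost:7777/docs/ for the real schema
resp = requests.post("http://localhost:7777/some_endpoint", json={"article_ids": ["xxx"]})
resp.raise_for_status()      # HTTP 200 only means the submission was accepted
body = resp.json()           # {"flag": int, "result": [{"type": "text", "content": "xxx"}]}
if body["flag"] == 11:
    print("flow finished normally:", body["result"])
elif body["flag"] == 0:
    print("everything ran, but no new files or records were produced")
else:
    print("error, flag =", body["flag"])
```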
## Directory structure
```
backend
├── llms                        # LLM wrappers
├── scrapers                    # crawler library
│   ├── __init__.py             # add site-specific scrapers beside this file and register them in the scraper_map dict defined here
│   ├── general_scraper.py      # general-purpose web page parser
│   ├── simple_crawler.py       # fast single-page parser based on gne
├── __init__.py                 # backend main module
├── background_task.py          # background task entry; edit this file to define more background tasks
├── main.py                     # backend service entry (FastAPI)
├── tasks.sh                    # startup script for the background tasks
├── backend.sh                  # startup script for the backend service
├── embedding.py                # embedding model service
├── pb_api.py                   # pb database API wrapper
├── general_utils.py            # utility library
├── get_insight.py              # insight mining and refinement module
├── get_logger.py               # logger configuration
├── get_report.py               # report generation module
├── get_search.py               # search implementation based on Sogou
├── work_process.py             # main background flow (scraping and refining)
├── tranlsation_volcengine.py   # translation module based on the Volcengine API
```

View File

@ -1,41 +0,0 @@
"""
通过编辑这个脚本可以自定义需要的后台任务
"""
import schedule
import time
from get_insight import pb, logger
from work_process import ServiceProcesser
sp = ServiceProcesser()
counter = 0
# wake up once per hour; if the pb sites table has sources, pick those whose per_hours cycle is due and crawl them; otherwise run the site-specific scrapers once every 24 hours
def task():
global counter
sites = pb.read('sites', filter='activated=True')
urls = []
for site in sites:
if not site['per_hours'] or not site['url']:
continue
if counter % site['per_hours'] == 0:
urls.append(site['url'])
logger.info(f'\033[0;32m task execute loop {counter}\033[0m')
logger.info(urls)
if urls:
sp(sites=urls)
else:
if counter % 24 == 0:
sp()
else:
print('\033[0;33mno work for this loop\033[0m')
counter += 1
schedule.every().hour.at(":38").do(task)
task()
while True:
schedule.run_pending()
time.sleep(60)

View File

@ -1,5 +0,0 @@
#!/bin/bash
set -o allexport
set +o allexport
exec uvicorn main:app --reload --host 0.0.0.0 --port 7777 &
exec python background_task.py

View File

@ -1,17 +0,0 @@
from BCEmbedding.tools.langchain import BCERerank
from langchain_community.embeddings import HuggingFaceEmbeddings
import os
embedding_model_name = os.environ.get('EMBEDDING_MODEL_PATH', "")
rerank_model_name = os.environ.get('RERANKER_MODEL_PATH', "")
if not embedding_model_name or not rerank_model_name:
raise Exception("请设置 EMBEDDING_MODEL_PATH 和 RERANKER_MODEL_PATH")
device = os.environ.get('DEVICE', 'cpu')
embedding_model_kwargs = {'device': device}
embedding_encode_kwargs = {'batch_size': 32, 'normalize_embeddings': True, 'show_progress_bar': False}
reranker_args = {'model': rerank_model_name, 'top_n': 5, 'device': device}
embed_model = HuggingFaceEmbeddings(model_name=embedding_model_name, model_kwargs=embedding_model_kwargs, encode_kwargs=embedding_encode_kwargs)
reranker = BCERerank(**reranker_args)
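A minimal usage sketch of the two objects exported above (the sample texts and query are invented; `compress_documents` is the LangChain document-compressor interface that BCERerank implements, which is how get_insight.py plugs it into ContextualCompressionRetriever):
```python
from langchain_core.documents import Document
from embeddings import embed_model, reranker

docs = [Document(page_content=t) for t in ("Hackers breached an energy firm", "New phone released")]
query = "cyber attack on critical infrastructure"
vec = embed_model.embed_query(query)                  # dense vector for the query text
top = reranker.compress_documents(docs, query=query)  # rerank the candidate documents
for d in top:
    print(d.metadata.get("relevance_score"), d.page_content)
```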

View File

@ -1,286 +0,0 @@
from embeddings import embed_model, reranker
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain.retrievers import ContextualCompressionRetriever
from llms.dashscope_wrapper import dashscope_llm
from general_utils import isChinesePunctuation, is_chinese
from tranlsation_volcengine import text_translate
import time
import re
import os
from general_utils import get_logger_level
from loguru import logger
from pb_api import PbTalker
project_dir = os.environ.get("PROJECT_DIR", "")
os.makedirs(project_dir, exist_ok=True)
logger_file = os.path.join(project_dir, 'scanning_task.log')
dsw_log = get_logger_level()
logger.add(
logger_file,
level=dsw_log,
backtrace=True,
diagnose=True,
rotation="50 MB"
)
pb = PbTalker(logger)
max_tokens = 4000
relation_theshold = 0.525
role_config = pb.read(collection_name='roleplays', filter=f'activated=True')
_role_config_id = ''
if role_config:
character = role_config[0]['character']
focus = role_config[0]['focus']
focus_type = role_config[0]['focus_type']
good_sample1 = role_config[0]['good_sample1']
good_sample2 = role_config[0]['good_sample2']
bad_sample = role_config[0]['bad_sample']
_role_config_id = role_config[0]['id']
else:
character, good_sample1, focus, focus_type, good_sample2, bad_sample = '', '', '', '', '', ''
if not character:
character = input('\033[0;32m 请为首席情报官指定角色设定eg. 来自中国的网络安全情报专家):\033[0m\n')
_role_config_id = pb.add(collection_name='roleplays', body={'character': character, 'activated': True})
if not _role_config_id:
raise Exception('pls check pb data, 无法获取角色设定')
if not (focus and focus_type and good_sample1 and good_sample2 and bad_sample):
focus = input('\033[0;32m 请为首席情报官指定关注点eg. 中国关注的网络安全新闻):\033[0m\n')
focus_type = input('\033[0;32m 请为首席情报官指定关注点类型eg. 网络安全新闻):\033[0m\n')
good_sample1 = input('\033[0;32m 请给出一个你期望的情报描述示例eg. 黑客组织Rhysida声称已入侵中国国有能源公司: \033[0m\n')
good_sample2 = input('\033[0;32m 请再给出一个理想示例eg. 差不多一百万份包含未成年人数据(包括家庭地址和照片)的文件对互联网上的任何人都开放,对孩子构成威胁): \033[0m\n')
bad_sample = input('\033[0;32m 请给出一个你不期望的情报描述示例eg. 黑客组织活动最近频发): \033[0m\n')
_ = pb.update(collection_name='roleplays', id=_role_config_id, body={'focus': focus, 'focus_type': focus_type, 'good_sample1': good_sample1, 'good_sample2': good_sample2, 'bad_sample': bad_sample})
# In practice, stressing that the llm should mine leads "worth our country's attention" works poorly: it is easily misled by the news content and mistakes other countries for ours (probably because the news itself contains phrases like "our country").
# A step-by-step / inner-monologue style would require two output formats at once, which raises the difficulty and qwen-max does not adapt well; it could perhaps be split into two steps (first output the list of leads, then look up the corresponding news ids),
# but in practice the cost/benefit of doing so is low and it introduces new uncertainty.
_first_stage_prompt = f'''你是一名{character}你将被给到一个新闻列表新闻文章用XML标签分隔。请对此进行分析挖掘出特别值得{focus}线索。你给出的线索应该足够具体,而不是同类型新闻的归类描述,好的例子如:
"""{good_sample1}"""
不好的例子如
"""{bad_sample}"""
请从头到尾仔细阅读每一条新闻的内容不要遗漏然后列出值得关注的线索每条线索都用一句话进行描述最终按一条一行的格式输出并整体用三引号包裹如下所示
"""
{good_sample1}
{good_sample2}
"""
不管新闻列表是何种语言请仅用中文输出分析结果'''
_rewrite_insight_prompt = f'''你是一名{character}你将被给到一个新闻列表新闻文章用XML标签分隔。请对此进行分析从中挖掘出一条最值得关注的{focus_type}线索。你给出的线索应该足够具体,而不是同类型新闻的归类描述,好的例子如:
"""{good_sample1}"""
不好的例子如
"""{bad_sample}"""
请保证只输出一条最值得关注的线索线索请用一句话描述并用三引号包裹输出如下所示
"""{good_sample1}"""
不管新闻列表是何种语言请仅用中文输出分析结果'''
def _parse_insight(article_text: str, cache: dict) -> (bool, dict):
input_length = len(cache)
result = dashscope_llm([{'role': 'system', 'content': _first_stage_prompt}, {'role': 'user', 'content': article_text}],
'qwen1.5-72b-chat', logger=logger)
if result:
pattern = re.compile(r'\"\"\"(.*?)\"\"\"', re.DOTALL)
result = pattern.findall(result)
else:
logger.warning('1st-stage llm generate failed: no result')
if result:
try:
results = result[0].split('\n')
results = [_.strip() for _ in results if _.strip()]
to_del = []
to_add = []
for element in results:
if "、" in element:
to_del.append(element)
to_add.extend(element.split('、'))
for element in to_del:
results.remove(element)
results.extend(to_add)
results = list(set(results))
for text in results:
logger.debug(f'parse result: {text}')
# special-case handling for qwen-72b-chat
# potential_insight = re.sub(r'编号[^]*', '', text)
potential_insight = text.strip()
if len(potential_insight) < 2:
logger.debug(f'parse failed: not enough potential_insight: {potential_insight}')
continue
if isChinesePunctuation(potential_insight[-1]):
potential_insight = potential_insight[:-1]
if potential_insight in cache:
continue
else:
cache[potential_insight] = []
except Exception as e:
logger.debug(f'parse failed: {e}')
output_length = len(cache)
if input_length == output_length:
return True, cache
return False, cache
def _rewrite_insight(context: str) -> (bool, str):
result = dashscope_llm([{'role': 'system', 'content': _rewrite_insight_prompt}, {'role': 'user', 'content': context}],
'qwen1.5-72b-chat', logger=logger)
if result:
pattern = re.compile(r'\"\"\"(.*?)\"\"\"', re.DOTALL)
result = pattern.findall(result)
else:
logger.warning(f'insight rewrite process llm generate failed: no result')
if not result:
return True, ''
try:
results = result[0].split('\n')
text = results[0].strip()
logger.debug(f'parse result: {text}')
if len(text) < 2:
logger.debug(f'parse failed: not enough potential_insight: {text}')
return True, ''
if isChinesePunctuation(text[-1]):
text = text[:-1]
except Exception as e:
logger.debug(f'parse failed: {e}')
return True, ''
return False, text
def get_insight(articles: dict, titles: dict) -> list:
context = ''
cache = {}
for value in articles.values():
if value['abstract']:
text = value['abstract']
else:
if value['title']:
text = value['title']
else:
if value['content']:
text = value['content']
else:
continue
# long context is not used here because Alibaba DashScope often flags the input for sensitive words without saying which ones, so the whole batch has to be abandoned; with long context the risk is too large
# besides, with long context the llm tends to miss the middle part
context += f"<article>{text}</article>\n"
if len(context) < max_tokens:
continue
flag, cache = _parse_insight(context, cache)
if flag:
logger.warning(f'following articles may not be completely analyzed: \n{context}')
context = ''
# reportedly frequent calls can degrade performance, so rest 1s after each call. Now that qwen-72b and max are called alternately this is no longer strictly necessary.
time.sleep(1)
if context:
flag, cache = _parse_insight(context, cache)
if flag:
logger.warning(f'following articles may not be completely analyzed: \n{context}')
if not cache:
logger.warning('no insights found')
return []
# second stage: match insights against article_titles
title_list = [Document(page_content=key, metadata={}) for key, value in titles.items()]
retriever = FAISS.from_documents(title_list, embed_model,
distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT).as_retriever(search_type="similarity",
search_kwargs={"score_threshold": relation_theshold, "k": 10})
compression = ContextualCompressionRetriever(base_compressor=reranker, base_retriever=retriever)
for key in cache.keys():
logger.debug(f'searching related articles for insight: {key}')
rerank_results = compression.get_relevant_documents(key)
for i in range(len(rerank_results)):
if rerank_results[i].metadata['relevance_score'] < relation_theshold:
break
cache[key].append(titles[rerank_results[i].page_content])
if titles[rerank_results[i].page_content] not in articles:
articles[titles[rerank_results[i].page_content]] = {'title': rerank_results[i].page_content}
logger.info(f'{key} - {cache[key]}')
# third stage: merge insights whose associated article lists overlap by 75% or more, then for merged groups with more than one article use the llm again to regenerate the insight
# (in practice the article titles recalled for a first-pass insight may each be relevant, yet taken together point to an insight from a different angle)
def calculate_overlap(list1, list2):
# length of the intersection of the two lists
intersection_length = len(set(list1).intersection(set(list2)))
# overlap ratio relative to the shorter list
overlap_rate = intersection_length / min(len(list1), len(list2))
return overlap_rate >= 0.75
merged_dict = {}
for key, value in cache.items():
if not value:
continue
merged = False
for existing_key, existing_value in merged_dict.items():
if calculate_overlap(value, existing_value):
merged_dict[existing_key].extend(value)
merged = True
break
if not merged:
merged_dict[key] = value
cache = {}
for key, value in merged_dict.items():
value = list(set(value))
if len(value) > 1:
context = ''
for _id in value:
context += f"<article>{articles[_id]['title']}</article>\n"
if len(context) >= max_tokens:
break
if not context:
continue
flag, new_insight = _rewrite_insight(context)
if flag:
logger.warning(f'insight {key} may contain wrong')
cache[key] = value
else:
if cache:
title_list = [Document(page_content=key, metadata={}) for key, value in cache.items()]
retriever = FAISS.from_documents(title_list, embed_model,
distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT).as_retriever(
search_type="similarity",
search_kwargs={"score_threshold": 0.85, "k": 1})
compression = ContextualCompressionRetriever(base_compressor=reranker, base_retriever=retriever)
rerank_results = compression.get_relevant_documents(new_insight)
if rerank_results and rerank_results[0].metadata['relevance_score'] > 0.85:
logger.debug(f"{new_insight} is too similar to {rerank_results[0].page_content}, merging")
cache[rerank_results[0].page_content].extend(value)
cache[rerank_results[0].page_content] = list(set(cache[rerank_results[0].page_content]))
else:
cache[new_insight] = value
else:
cache[new_insight] = value
else:
cache[key] = value
# sort: insights with more associated articles come first
# sorted_cache = sorted(cache.items(), key=lambda x: len(x[1]), reverse=True)
logger.info('re-ranking result:')
new_cache = []
for key, value in cache.items():
if not is_chinese(key):
translate_text = text_translate([key], target_language='zh', logger=logger)
if translate_text:
key = translate_text[0]
logger.info(f'{key} - {value}')
new_cache.append({'content': key, 'articles': value})
return new_cache
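A usage sketch showing the shapes get_insight expects and returns (ids and texts are fabricated; in the real flow, ServiceProcesser in work_process.py builds these dictionaries from PocketBase records):
```python
if __name__ == '__main__':
    articles = {
        'rec1': {'title': 'Hackers breach energy firm', 'abstract': 'Group X claims an intrusion', 'content': '...'},
        'rec2': {'title': 'New surveillance bill proposed', 'abstract': '', 'content': 'Lawmakers propose ...'},
    }
    titles = {item['title']: _id for _id, item in articles.items()}
    for insight in get_insight(articles, titles):
        # each item looks like {'content': '<one-sentence insight>', 'articles': ['rec1', ...]}
        print(insight['content'], insight['articles'])
```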

View File

@ -1,93 +0,0 @@
# wrapper around the Aliyun DashScope API
# non-streaming interface
# for compatibility, input and output both use the message format, consistent with the OpenAI SDK
import time
from http import HTTPStatus
import dashscope
import random
import os
DASHSCOPE_KEY = os.getenv("LLM_API_KEY")
if not DASHSCOPE_KEY:
raise ValueError("请指定LLM_API_KEY的环境变量")
dashscope.api_key = DASHSCOPE_KEY
def dashscope_llm(messages: list, model: str, logger=None, **kwargs) -> str:
if logger:
logger.debug(f'messages:\n {messages}')
logger.debug(f'model: {model}')
logger.debug(f'kwargs:\n {kwargs}')
response = dashscope.Generation.call(
messages=messages,
model=model,
result_format='message', # set the result to be "message" format.
**kwargs
)
for i in range(2):
if response.status_code == HTTPStatus.OK:
break
if response.message == "Input data may contain inappropriate content.":
break
if logger:
logger.warning(f"request failed. code: {response.code}, message:{response.message}\nretrying...")
else:
print(f"request failed. code: {response.code}, message:{response.message}\nretrying...")
time.sleep(1 + i*30)
kwargs['seed'] = random.randint(1, 10000)
response = dashscope.Generation.call(
messages=messages,
model=model,
result_format='message', # set the result to be "message" format.
**kwargs
)
if response.status_code != HTTPStatus.OK:
if logger:
logger.warning(f"request failed. code: {response.code}, message:{response.message}\nabort after multiple retries...")
else:
print(f"request failed. code: {response.code}, message:{response.message}\naborted after multiple retries...")
return ''
if logger:
logger.debug(f'result:\n {response.output.choices[0]}')
logger.debug(f'usage:\n {response.usage}')
return response.output.choices[0]['message']['content']
if __name__ == '__main__':
from pprint import pprint
# logging.basicConfig(level=logging.DEBUG)
system_content = ''
user_content = '''你是一名优秀的翻译,请帮我把如下新闻标题逐条(一行为一条)翻译为中文,你的输出也必须为一条一行的格式。
The New York Times reported on 2021-01-01 that the COVID-19 cases in China are increasing.
Cyber ops linked to Israel-Hamas conflict largely improvised, researchers say
Russian hackers disrupted Ukrainian electrical grid last year
Reform bill would overhaul controversial surveillance law
GitHub disables pro-Russian hacktivist DDoS pages
Notorious Russian hacking group appears to resurface with fresh cyberattacks on Ukraine
Russian hackers attempted to breach petroleum refining company in NATO country, researchers say
As agencies move towards multi-cloud networks, proactive security is key
Keeping a competitive edge in the cybersecurity game
Mud, sweat and data: The hard work of democratizing data at scale
SEC sues SolarWinds and CISO for fraud
Cyber workforce demand is outpacing supply, survey finds
Four dozen countries declare they won
White House executive order on AI seeks to address security risks
malware resembling NSA code
CISA budget cuts would be
Hackers that breached Las Vegas casinos rely on violent threats, research shows'''
data = [{'role': 'user', 'content': user_content}]
start_time = time.time()
pprint(dashscope_llm(data, 'qwen-72b-chat'))
print(f'time cost: {time.time() - start_time}')

View File

@ -1,79 +0,0 @@
# API wrapper based on lagent's lmdepoly_wrapper (LMDeployClient)
# non-streaming interface
# for compatibility, input and output both use the message format, consistent with the OpenAI SDK
from lagent.llms.meta_template import INTERNLM2_META as META
from lagent.llms.lmdepoly_wrapper import LMDeployClient
from requests import ConnectionError
import os
def lmdeploy_llm(messages: list[dict],
model: str = "qwen-7b",
seed: int = 1234,
max_tokens: int = 2000,
temperature: float = 1,
stop: list = None,
enable_search: bool = False,
logger=None) -> str:
if logger:
logger.debug(f'messages:\n {messages}')
logger.debug(f'params:\n model: {model}, max_tokens: {max_tokens}, temperature: {temperature}, stop: {stop},'
f'enable_search: {enable_search}, seed: {seed}')
top_p = 0.7
url = os.environ.get('LLM_API_BASE', "http://127.0.0.1:6003")
api_client = LMDeployClient(model_name=model,
url=url,
meta_template=META,
max_new_tokens=max_tokens,
top_p=top_p,
top_k=100,
temperature=temperature,
repetition_penalty=1.0,
stop_words=['<|im_end|>'])
response = ""
for i in range(3):
try:
response = api_client.chat(messages)
break
except ConnectionError:
if logger:
logger.warning(f'ConnectionError, url:{url}')
else:
print(f"ConnectionError, url:{url}")
return ""
return response
if __name__ == '__main__':
import time
from pprint import pprint
# logging.basicConfig(level=logging.DEBUG)
system_content = ''
user_content = '''你是一名优秀的翻译,请帮我把如下新闻标题逐条(一行为一条)翻译为中文,你的输出也必须为一条一行的格式。
The New York Times reported on 2021-01-01 that the COVID-19 cases in China are increasing.
Cyber ops linked to Israel-Hamas conflict largely improvised, researchers say
Russian hackers disrupted Ukrainian electrical grid last year
Reform bill would overhaul controversial surveillance law
GitHub disables pro-Russian hacktivist DDoS pages
Notorious Russian hacking group appears to resurface with fresh cyberattacks on Ukraine
Russian hackers attempted to breach petroleum refining company in NATO country, researchers say
As agencies move towards multi-cloud networks, proactive security is key
Keeping a competitive edge in the cybersecurity game
Mud, sweat and data: The hard work of democratizing data at scale
SEC sues SolarWinds and CISO for fraud
Cyber workforce demand is outpacing supply, survey finds
Four dozen countries declare they won
White House executive order on AI seeks to address security risks
malware resembling NSA code
CISA budget cuts would be
Hackers that breached Las Vegas casinos rely on violent threats, research shows'''
data = [{'role': 'user', 'content': user_content}]
start_time = time.time()
pprint(lmdeploy_llm(data, 'qwen-7b'))
print(f'time cost: {time.time() - start_time}')

View File

@ -1,15 +0,0 @@
#!/bin/sh
docker run -d --runtime nvidia --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HUGGING_FACE_HUB_TOKEN=<secret>" \
--env "LMDEPLOY_USE_MODELSCOPE=True" \
--env "TOKENIZERS_PARALLELISM=False" \
--name qwen1.5-7b-service \
-p 6003:6003 \
--restart=always \
--ipc=host \
openmmlab/lmdeploy:v0.2.5 \
pip install modelscope & \
lmdeploy serve api_server Qwen/Qwen1.5-7B-Chat \
--server-name 0.0.0.0 --server-port 6003 --tp 1 --rope-scaling-factor 1 --backend pytorch

View File

@ -1,19 +0,0 @@
fastapi
pydantic
uvicorn
dashscope #optional (install when using Alibaba DashScope)
openai #optional (install when using an LLM service compatible with the OpenAI SDK)
volcengine #optional (install when using Volcengine translation)
python-docx
BCEmbedding==0.1.3
langchain==0.1.0
langchain-community==0.0.9
langchain-core==0.1.7
langsmith==0.0.77
# faiss-gpu for gpu environment
faiss-cpu # for cpu-only environment
pocketbase==0.10.0
gne
chardet
schedule
loguru

View File

@ -1,4 +0,0 @@
scraper_map = {
}

View File

@ -1,4 +0,0 @@
set -o allexport
source ../.env
set +o allexport
python background_task.py

View File

@ -1,158 +0,0 @@
import os
import json
import requests
from datetime import datetime, timedelta, date
from scrapers import scraper_map
from scrapers.general_scraper import general_scraper
from urllib.parse import urlparse
from get_insight import get_insight, pb, logger
from general_utils import is_chinese
from tranlsation_volcengine import text_translate
import concurrent.futures
# mainly used on the first crawl to avoid fetching articles that are too old; database articles older than this are also no longer used to match insights
expiration_date = datetime.now() - timedelta(days=90)
expiration_date = expiration_date.date()
expiration_str = expiration_date.strftime("%Y%m%d")
class ServiceProcesser:
def __init__(self, record_snapshot: bool = False):
self.project_dir = os.environ.get("PROJECT_DIR", "")
# 1. base initialization
self.cache_url = os.path.join(self.project_dir, 'scanning_task')
os.makedirs(self.cache_url, exist_ok=True)
# 2. load the llm
# self.llm = LocalLlmWrapper() # if you use the local-llm
if record_snapshot:
snap_short_server = os.environ.get('SNAPSHOTS_SERVER', '')
if not snap_short_server:
raise Exception('SNAPSHOTS_SERVER is not set.')
self.snap_short_server = f"http://{snap_short_server}"
else:
self.snap_short_server = None
logger.info('scanning task init success.')
def __call__(self, expiration: date = expiration_date, sites: list[str] = None):
# start with an empty cache
logger.info(f'wake, prepare to work, now is {datetime.now()}')
cache = {}
logger.debug(f'clear cache -- {cache}')
# read the urls of all existing articles from the pb database
# publish_time is stored as an int here; a bit crude, but all things considered the easiest format to work with
existing_articles = pb.read(collection_name='articles', fields=['id', 'title', 'url'], filter=f'publish_time>{expiration_str}')
all_title = {}
existings = []
for article in existing_articles:
all_title[article['title']] = article['id']
existings.append(article['url'])
# build the list of sources to scan; default to iterating over scraper_map if none are specified. A specified source that is not in scraper_map is handled by the general scraper.
sources = sites if sites else list(scraper_map.keys())
new_articles = []
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
futures = []
for site in sources:
if site in scraper_map:
futures.append(executor.submit(scraper_map[site], expiration, existings))
else:
futures.append(executor.submit(general_scraper, site, expiration, existings, logger))
concurrent.futures.wait(futures)
for future in futures:
try:
new_articles.extend(future.result())
except Exception as e:
logger.error(f'error when scraping-- {e}')
for value in new_articles:
if not value:
continue
from_site = urlparse(value['url']).netloc
from_site = from_site.replace('www.', '')
from_site = from_site.split('.')[0]
if value['abstract']:
value['abstract'] = f"({from_site} 报道){value['abstract']}"
value['content'] = f"({from_site} 报道){value['content']}"
value['images'] = json.dumps(value['images'])
article_id = pb.add(collection_name='articles', body=value)
if article_id:
cache[article_id] = value
all_title[value['title']] = article_id
else:
logger.warning(f'add article {value} failed, writing to cache_file')
with open(os.path.join(self.cache_url, 'cache_articles.json'), 'a', encoding='utf-8') as f:
json.dump(value, f, ensure_ascii=False, indent=4)
if not cache:
logger.warning(f'no new articles. now is {datetime.now()}')
return
# insight stage
new_insights = get_insight(cache, all_title)
if new_insights:
for insight in new_insights:
if not insight['content']:
continue
insight_id = pb.add(collection_name='insights', body=insight)
if not insight_id:
logger.warning(f'write insight {insight} to pb failed, writing to cache_file')
with open(os.path.join(self.cache_url, 'cache_insights.json'), 'a', encoding='utf-8') as f:
json.dump(insight, f, ensure_ascii=False, indent=4)
for article_id in insight['articles']:
raw_article = pb.read(collection_name='articles', fields=['abstract', 'title', 'translation_result'], filter=f'id="{article_id}"')
if not raw_article or not raw_article[0]:
logger.warning(f'get article {article_id} failed, skipping')
continue
if raw_article[0]['translation_result']:
continue
if is_chinese(raw_article[0]['title']):
continue
translate_text = text_translate([raw_article[0]['title'], raw_article[0]['abstract']], target_language='zh', logger=logger)
if translate_text:
related_id = pb.add(collection_name='article_translation', body={'title': translate_text[0], 'abstract': translate_text[1], 'raw': article_id})
if not related_id:
logger.warning(f'write article_translation {article_id} failed')
else:
_ = pb.update(collection_name='articles', id=article_id, body={'translation_result': related_id})
if not _:
logger.warning(f'update article {article_id} failed')
else:
logger.warning(f'translate article {article_id} failed')
else:
# fallback: try using the article titles themselves as insights
if len(cache) < 25:
logger.info('generate_insights-warning: no insights and no more than 25 articles so use article title as insights')
for key, value in cache.items():
if value['title']:
if is_chinese(value['title']):
text_for_insight = value['title']
else:
text_for_insight = text_translate([value['title']], logger=logger)
if text_for_insight:
insight_id = pb.add(collection_name='insights', body={'content': text_for_insight[0], 'articles': [key]})
if not insight_id:
logger.warning(f'write insight {text_for_insight[0]} to pb failed, writing to cache_file')
with open(os.path.join(self.cache_url, 'cache_insights.json'), 'a',
encoding='utf-8') as f:
json.dump({'content': text_for_insight[0], 'articles': [key]}, f, ensure_ascii=False, indent=4)
else:
logger.warning('generate_insights-error: can not generate insights, pls re-try')
logger.info(f'work done, now is {datetime.now()}')
if self.snap_short_server:
logger.info(f'now starting article snapshot with {self.snap_short_server}')
for key, value in cache.items():
if value['url']:
try:
snapshot = requests.get(f"{self.snap_short_server}/zip", {'url': value['url']}, timeout=60)
file = open(snapshot.text, 'rb')
_ = pb.upload('articles', key, 'snapshot', key, file)
file.close()
except Exception as e:
logger.warning(f'error when snapshot {value["url"]}, {e}')
logger.info(f'now snapshot done, now is {datetime.now()}')

View File

@ -1,31 +0,0 @@
services:
web:
build:
dockerfile: Dockerfile.web
image: wiseflow/web
ports:
- 8090:8090
# env_file:
# - .env
volumes:
- ./pb/pb_data:/pb/pb_data
# - ./${PROJECT_DIR}:/pb/${PROJECT_DIR}
entrypoint: /pb/pocketbase serve --http=0.0.0.0:8090
api:
build:
dockerfile: Dockerfile.api
image: wiseflow/api
tty: true
stdin_open: true
entrypoint: bash docker_entrypoint.sh
env_file:
- .env
ports:
- 7777:7777
volumes:
- ./${PROJECT_DIR}:/app/${PROJECT_DIR}
- ${EMBEDDING_MODEL_PATH}:${EMBEDDING_MODEL_PATH}
- ${RERANKER_MODEL_PATH}:${RERANKER_MODEL_PATH}
depends_on:
- web

View File

@ -1,14 +0,0 @@
export LLM_API_KEY=""
export LLM_API_BASE="" ## used with a local model service, or when calling a non-OpenAI service through openai_wrapper
export VOLC_KEY="AK|SK"
#**for embedding model**
export EMBEDDING_MODEL_PATH="" ## fill in the full absolute path
export RERANKER_MODEL_PATH="" ## fill in the full absolute path
export DEVICE="cpu" ## CUDA users set "cuda:0"
#**for processor**
export PROJECT_DIR="work_dir"
export PB_API_AUTH="test@example.com|123467890"
export PB_API_BASE="web:8090" ## see https://stackoverflow.com/questions/70151702/how-to-network-2-separate-docker-containers-to-communicate-with-eachother
export WS_LOG="verbose" ## set this for detailed logs of every step the system takes; not needed for normal use

View File

@ -1 +0,0 @@
v0.2.1

47
core/dm.py Normal file
View File

@ -0,0 +1,47 @@
import asyncio
import websockets
import concurrent.futures
import json
from insights import pipeline
async def get_public_msg():
uri = "ws://127.0.0.1:8066/ws/publicMsg"
reconnect_attempts = 0
max_reconnect_attempts = 3  # adjust the maximum number of reconnect attempts as needed
while True:
try:
async with websockets.connect(uri, max_size=10 * 1024 * 1024) as websocket:
loop = asyncio.get_running_loop()
with concurrent.futures.ThreadPoolExecutor() as pool:
while True:
response = await websocket.recv()
datas = json.loads(response)
for data in datas["data"]:
if data["IsSender"] != "0":
print('self-send message, pass')
print(data)
continue
input_data = {
"user_id": data["StrTalker"],
"type": "publicMsg",
"content": data["Content"],
"addition": data["MsgSvrID"]
}
await loop.run_in_executor(pool, pipeline, input_data)
except websockets.exceptions.ConnectionClosedError as e:
print(f"Connection closed with exception: {e}")
reconnect_attempts += 1
if reconnect_attempts <= max_reconnect_attempts:
print(f"Reconnecting attempt {reconnect_attempts}...")
await asyncio.sleep(5)  # wait a while before retrying
else:
print("Max reconnect attempts reached. Exiting.")
break
except Exception as e:
print(f"An unexpected error occurred: {e}")
break
# run the get_public_msg coroutine on the asyncio event loop
asyncio.run(get_public_msg())
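For local testing, the websocket listener can be bypassed and pipeline fed a hand-crafted payload directly; the sketch below fabricates a minimal publicMsg-style message (every field value is made up):
```python
# local_test.py -- illustrative only
from insights import pipeline

fake_input = {
    "user_id": "gh_abcdef@publicMsg",
    "type": "publicMsg",
    "content": "<item><url><![CDATA[http://mp.weixin.qq.com/s?__biz=XXX&chksm=YYY]]></url>"
               "<summary><![CDATA[demo summary]]></summary></item>",
    "addition": "1234567890",
}
pipeline(fake_input)  # runs fetch -> extract -> store synchronously, without the websocket listener
```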

164
core/insights/__init__.py Normal file
View File

@ -0,0 +1,164 @@
from scrapers import *
from utils.general_utils import extract_urls, compare_phrase_with_list
from insights.get_info import get_info, pb, project_dir, logger
from insights.rewrite import info_rewrite
import os
import json
from datetime import datetime, timedelta
from urllib.parse import urlparse
import re
import time
# regex is used instead of an XML parser because the xml extracted from official-account messages contains malformed characters
item_pattern = re.compile(r'<item>(.*?)</item>', re.DOTALL)
url_pattern = re.compile(r'<url><!\[CDATA\[(.*?)]]></url>')
summary_pattern = re.compile(r'<summary><!\[CDATA\[(.*?)]]></summary>', re.DOTALL)
expiration_days = 3
existing_urls = [url['url'] for url in pb.read(collection_name='articles', fields=['url']) if url['url']]
def pipeline(_input: dict):
cache = {}
source = _input['user_id'].split('@')[-1]
logger.debug(f"received new task, user: {source}, MsgSvrID: {_input['addition']}")
if _input['type'] == 'publicMsg':
items = item_pattern.findall(_input["content"])
# iterate over every <item> block and extract <url> and <summary>
for item in items:
url_match = url_pattern.search(item)
url = url_match.group(1) if url_match else None
if not url:
logger.warning(f"can not find url in \n{item}")
continue
# normalize the url: switch http to https and drop everything from 'chksm=' onward
url = url.replace('http://', 'https://')
cut_off_point = url.find('chksm=')
if cut_off_point != -1:
url = url[:cut_off_point-1]
if url in cache:
logger.debug(f"{url} already find in item")
continue
summary_match = summary_pattern.search(item)
summary = summary_match.group(1) if summary_match else None
cache[url] = summary
urls = list(cache.keys())
elif _input['type'] == 'text':
urls = extract_urls(_input['content'])
if not urls:
logger.debug(f"can not find any url in\n{_input['content']}\npass...")
return
elif _input['type'] == 'url':
urls = []
pass
else:
return
global existing_urls
for url in urls:
# 0. skip urls that have already been crawled
if url in existing_urls:
logger.debug(f"{url} has been crawled, skip")
continue
logger.debug(f"fetching {url}")
# 1. pick a suitable crawler to fetch the article
if url.startswith('https://mp.weixin.qq.com') or url.startswith('http://mp.weixin.qq.com'):
flag, article = mp_crawler(url, logger)
if flag == -7:
# for the mp crawler, -7 usually means WeChat rate limiting; waiting one minute is enough
logger.info(f"fetch {url} failed, try to wait 1min and try again")
time.sleep(60)
flag, article = mp_crawler(url, logger)
else:
parsed_url = urlparse(url)
domain = parsed_url.netloc
if domain in scraper_map:
flag, article = scraper_map[domain](url, logger)
else:
flag, article = simple_crawler(url, logger)
if flag == -7:
# -7 means a network problem; other crawlers would not help either
logger.info(f"cannot fetch {url}")
continue
if flag != 11:
logger.info(f"{url} failed with mp_crawler and simple_crawler")
flag, article = llm_crawler(url, logger)
if flag != 11:
logger.info(f"{url} failed with llm_crawler")
continue
# 2. discard articles published earlier than today minus expiration_days
expiration_date = datetime.now() - timedelta(days=expiration_days)
expiration_date = expiration_date.strftime('%Y-%m-%d')
article_date = int(article['publish_time'])
if article_date < int(expiration_date.replace('-', '')):
logger.info(f"publish date is {article_date}, too old, skip")
continue
article['source'] = source
if cache.get(url):  # only publicMsg items carry a cached summary; .get avoids a KeyError for other input types
article['abstract'] = cache[url]
# 3. extract information from the content
insights = get_info(f"标题:{article['title']}\n\n内容:{article['content']}")
# articles whose info extraction failed are not stored (otherwise they would land in existing_urls and never be processed again); articles processed successfully but yielding no insight are stored so they are not analyzed again.
# 4. store the article
try:
article_id = pb.add(collection_name='articles', body=article)
except Exception as e:
logger.error(f'add article failed, writing to cache_file - {e}')
with open(os.path.join(project_dir, 'cache_articles.json'), 'a', encoding='utf-8') as f:
json.dump(article, f, ensure_ascii=False, indent=4)
continue
existing_urls.append(url)
if not insights:
continue
# compare, dedupe and merge insights; tag the article and store the insights
article_tags = set()
# read the insights from the past expiration_days from the database to avoid duplicates
old_insights = pb.read(collection_name='insights', filter=f"updated>'{expiration_date}'", fields=['id', 'tag', 'content', 'articles'])
for insight in insights:
article_tags.add(insight['tag'])
insight['articles'] = [article_id]
# build a reverse-lookup dict (content -> record) from the old_insights that share the same tag
old_insight_dict = {i['content']: i for i in old_insights if i['tag'] == insight['tag']}
# what we compare is whether the extracted phrases describe the same event; embedding-based similarity is not necessarily suitable here and is too heavyweight
# so a simplified scheme is used instead: tokenize both phrases with jieba and check whether the word overlap exceeds the threshold (compare_phrase_with_list)
similar_insights = compare_phrase_with_list(insight['content'], list(old_insight_dict.keys()), 0.65)
if similar_insights:
to_rewrite = similar_insights + [insight['content']]
new_info_content = info_rewrite(to_rewrite, logger)
if not new_info_content:
continue
insight['content'] = new_info_content
# merge the associated articles and delete the old insights
for old_insight in similar_insights:
insight['articles'].extend(old_insight_dict[old_insight]['articles'])
pb.delete(collection_name='insights', id=old_insight_dict[old_insight]['id'])
old_insights.remove(old_insight_dict[old_insight])
try:
insight['id'] = pb.add(collection_name='insights', body=insight)
# old_insights.append(insight)
except Exception as e:
logger.error(f'add insight failed, writing to cache_file - {e}')
with open(os.path.join(project_dir, 'cache_insights.json'), 'a', encoding='utf-8') as f:
json.dump(insight, f, ensure_ascii=False, indent=4)
try:
pb.update(collection_name='articles', id=article_id, body={'tag': list(article_tags)})
except Exception as e:
logger.error(f'update article failed - article_id: {article_id}\n{e}')
article['tag'] = list(article_tags)
with open(os.path.join(project_dir, 'cache_articles.json'), 'a', encoding='utf-8') as f:
json.dump(article, f, ensure_ascii=False, indent=4)
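The dedup step above relies on utils.general_utils.compare_phrase_with_list, which is not shown in this diff; based on the comments (jieba token overlap against a threshold), a possible implementation could look roughly like this sketch:
```python
import jieba

def compare_phrase_with_list(phrase: str, candidates: list[str], threshold: float) -> list[str]:
    """Return the candidates whose jieba-token overlap with `phrase` reaches the threshold (sketch)."""
    tokens = set(jieba.cut(phrase))
    if not tokens:
        return []
    similar = []
    for cand in candidates:
        cand_tokens = set(jieba.cut(cand))
        if not cand_tokens:
            continue
        overlap = len(tokens & cand_tokens) / min(len(tokens), len(cand_tokens))
        if overlap >= threshold:
            similar.append(cand)
    return similar
```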

94
core/insights/get_info.py Normal file
View File

@ -0,0 +1,94 @@
# from llms.dashscope_wrapper import dashscope_llm
from llms.openai_wrapper import openai_llm
# from llms.siliconflow_wrapper import sfa_llm
import re
from utils.general_utils import get_logger_level
from loguru import logger
from utils.pb_api import PbTalker
import os
project_dir = os.environ.get("PROJECT_DIR", "")
if project_dir:
os.makedirs(project_dir, exist_ok=True)
logger_file = os.path.join(project_dir, 'insights.log')
dsw_log = get_logger_level()
logger.add(
logger_file,
level=dsw_log,
backtrace=True,
diagnose=True,
rotation="50 MB"
)
pb = PbTalker(logger)
model = "glm-4-flash"
focus_data = pb.read(collection_name='tags', filter=f'activated=True')
focus_list = [item["name"] for item in focus_data if item["name"]]
focus_dict = {item["name"]: item["id"] for item in focus_data if item["name"]}
system_prompt = f'''请仔细阅读用户输入的新闻内容,并根据所提供的类型列表进行分析。类型列表如下:
{focus_list}
如果新闻中包含上述任何类型的信息请使用以下格式标记信息的类型并提供仅包含时间地点人物和事件的一句话信息摘要
<tag>类型名称</tag>仅包含时间地点人物和事件的一句话信息摘要
如果新闻中包含多个信息请逐一分析并按一条一行的格式输出如果新闻不涉及任何类型的信息则直接输出
务必注意1严格忠于新闻原文不得提供原文中不包含的信息2对于同一事件仅选择一个最贴合的tag不要重复输出3仅用一句话做信息摘要且仅包含时间地点人物和事件4严格遵循给定的格式输出'''
# pattern = re.compile(r'\"\"\"(.*?)\"\"\"', re.DOTALL)
def get_info(article_content: str) -> list[dict]:
# logger.debug(f'receive new article_content:\n{article_content}')
result = openai_llm([{'role': 'system', 'content': system_prompt}, {'role': 'user', 'content': article_content}],
model=model, logger=logger)
# results = pattern.findall(result)
texts = result.split('<tag>')
texts = [_.strip() for _ in texts if '</tag>' in _.strip()]
if not texts:
logger.info(f'can not find info, llm result:\n{result}')
return []
cache = []
for text in texts:
try:
strings = text.split('</tag>')
tag = strings[0]
tag = tag.strip()
if tag not in focus_list:
logger.info(f'tag not in focus_list: {tag}, aborting')
continue
info = ''.join(strings[1:])
info = info.strip()
except Exception as e:
logger.info(f'parse error: {e}')
tag = ''
info = ''
if not info or not tag:
logger.info(f'parse failed-{text}')
continue
if len(info) < 7:
logger.info(f'info too short, possible invalid: {info}')
continue
if info.startswith('无相关信息') or info.startswith('该新闻未提及') or info.startswith('未提及'):
logger.info(f'no relevant info: {text}')
continue
while info.endswith('"'):
info = info[:-1]
info = info.strip()
# prepend the source information
sources = re.findall(r'内容:\((.*?) 文章\)', article_content)
if sources and sources[0]:
info = f"【{sources[0]} 公众号】 {info}"
cache.append({'content': info, 'tag': focus_dict[tag]})
return cache
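For reference, a standalone illustration of the `<tag>...</tag>` convention that the prompt requests and get_info parses (the sample model response is made up):
```python
sample = ("<tag>网络安全</tag>6月某黑客组织声称入侵了一家国有能源公司\n"
          "<tag>数据泄露</tag>约一百万份未成年人数据可被公开访问")
entries = [t for t in sample.split('<tag>') if '</tag>' in t]
parsed = []
for entry in entries:
    tag, info = entry.split('</tag>', 1)
    parsed.append({'tag': tag.strip(), 'content': info.strip()})
print(parsed)  # get_info() additionally maps tag names to record ids via focus_dict and drops invalid items
```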

24
core/insights/rewrite.py Normal file
View File

@ -0,0 +1,24 @@
# from llms.openai_wrapper import openai_llm
from llms.dashscope_wrapper import dashscope_llm
# from llms.siliconflow_wrapper import sfa_llm
rewrite_prompt = '''请综合给到的内容,提炼总结为一个新闻摘要。
给到的内容会用XML标签分隔
请仅输出总结出的摘要不要输出其他的信息'''
model = "qwen2-7b-instruct"
def info_rewrite(contents: list[str], logger=None) -> str:
context = f"<content>{'</content><content>'.join(contents)}</content>"
try:
result = dashscope_llm([{'role': 'system', 'content': rewrite_prompt}, {'role': 'user', 'content': context}],
model=model, temperature=0.1, logger=logger)
return result.strip()
except Exception as e:
if logger:
logger.warning(f'rewrite process llm generate failed: {e}')
else:
print(f'rewrite process llm generate failed: {e}')
return ''
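A usage sketch (the two content strings are invented; a configured DashScope key is assumed):
```python
if __name__ == '__main__':
    merged = info_rewrite([
        '黑客组织A声称入侵了某能源公司',
        '某能源公司证实其内部网络遭到攻击',
    ])
    print(merged or 'rewrite failed')
```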

View File

@ -7,13 +7,13 @@ import os
from openai import OpenAI
token = os.environ.get('LLM_API_KEY', "")
if not token:
raise ValueError('请设置环境变量LLM_API_KEY')
base_url = os.environ.get('LLM_API_BASE', "")
token = os.environ.get('LLM_API_KEY', "")
client = OpenAI(api_key=token, base_url=base_url)
if token:
client = OpenAI(api_key=token, base_url=base_url)
else:
client = OpenAI(base_url=base_url)
def openai_llm(messages: list, model: str, logger=None, **kwargs) -> str:

View File

@ -0,0 +1,70 @@
"""
siliconflow api wrapper
https://siliconflow.readme.io/reference/chat-completions-1
"""
import os
import requests
token = os.environ.get('LLM_API_KEY', "")
if not token:
raise ValueError('请设置环境变量LLM_API_KEY')
url = "https://api.siliconflow.cn/v1/chat/completions"
def sfa_llm(messages: list, model: str, logger=None, **kwargs) -> str:
if logger:
logger.debug(f'messages:\n {messages}')
logger.debug(f'model: {model}')
logger.debug(f'kwargs:\n {kwargs}')
payload = {
"model": model,
"messages": messages
}
payload.update(kwargs)
headers = {
"accept": "application/json",
"content-type": "application/json",
"authorization": f"Bearer {token}"
}
for i in range(2):
try:
response = requests.post(url, json=payload, headers=headers)
if response.status_code == 200:
try:
body = response.json()
usage = body.get('usage', 'Field "usage" not found')
choices = body.get('choices', 'Field "choices" not found')
if logger:
logger.debug(choices)
logger.debug(usage)
return choices[0]['message']['content']
except ValueError:
# the response body is not valid JSON
if logger:
logger.warning("Response body is not in JSON format.")
else:
print("Response body is not in JSON format.")
except requests.exceptions.RequestException as e:
if logger:
logger.warning(f"A request error occurred: {e}")
else:
print(f"A request error occurred: {e}")
if logger:
logger.info("retrying...")
else:
print("retrying...")
if logger:
logger.error("After multiple retries, still failed to get a response from the API.")
else:
print("After multiple retries, still failed to get a response from the API.")
return ''
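A usage sketch (the model name is only an example; check SiliconFlow's model catalogue for what your key can access):
```python
if __name__ == '__main__':
    messages = [{'role': 'user', 'content': 'Summarize: cyber workforce demand is outpacing supply.'}]
    print(sfa_llm(messages, model='Qwen/Qwen2-7B-Instruct', temperature=0.3))
```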

1016
core/pb/CHANGELOG.md Normal file

File diff suppressed because it is too large

17
core/pb/LICENSE.md Normal file
View File

@ -0,0 +1,17 @@
The MIT License (MIT)
Copyright (c) 2022 - present, Gani Georgiev
Permission is hereby granted, free of charge, to any person obtaining a copy of this software
and associated documentation files (the "Software"), to deal in the Software without restriction,
including without limitation the rights to use, copy, modify, merge, publish, distribute,
sublicense, and/or sell copies of the Software, and to permit persons to whom the Software
is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or
substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

View File

@ -0,0 +1,44 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("lft7642skuqmry7")
// update
collection.schema.addField(new SchemaField({
"system": false,
"id": "iorna912",
"name": "content",
"type": "text",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
}))
return dao.saveCollection(collection)
}, (db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("lft7642skuqmry7")
// update
collection.schema.addField(new SchemaField({
"system": false,
"id": "iorna912",
"name": "content",
"type": "text",
"required": true,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
}))
return dao.saveCollection(collection)
})

View File

@ -0,0 +1,31 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf")
// add
collection.schema.addField(new SchemaField({
"system": false,
"id": "d13734ez",
"name": "tag",
"type": "text",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
}))
return dao.saveCollection(collection)
}, (db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf")
// remove
collection.schema.removeField("d13734ez")
return dao.saveCollection(collection)
})

View File

@ -0,0 +1,31 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("lft7642skuqmry7")
// add
collection.schema.addField(new SchemaField({
"system": false,
"id": "pwy2iz0b",
"name": "source",
"type": "text",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
}))
return dao.saveCollection(collection)
}, (db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("lft7642skuqmry7")
// remove
collection.schema.removeField("pwy2iz0b")
return dao.saveCollection(collection)
})

View File

@ -0,0 +1,51 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const collection = new Collection({
"id": "nvf6k0yoiclmytu",
"created": "2024-05-16 01:36:01.108Z",
"updated": "2024-05-16 01:36:01.108Z",
"name": "tags",
"type": "base",
"system": false,
"schema": [
{
"system": false,
"id": "0th8uax4",
"name": "name",
"type": "text",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
},
{
"system": false,
"id": "l6mm7m90",
"name": "activated",
"type": "bool",
"required": false,
"presentable": false,
"unique": false,
"options": {}
}
],
"indexes": [],
"listRule": null,
"viewRule": null,
"createRule": null,
"updateRule": null,
"deleteRule": null,
"options": {}
});
return Dao(db).saveCollection(collection);
}, (db) => {
const dao = new Dao(db);
const collection = dao.findCollectionByNameOrId("nvf6k0yoiclmytu");
return dao.deleteCollection(collection);
})

View File

@ -0,0 +1,52 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf")
// remove
collection.schema.removeField("d13734ez")
// add
collection.schema.addField(new SchemaField({
"system": false,
"id": "j65p3jji",
"name": "tag",
"type": "relation",
"required": false,
"presentable": false,
"unique": false,
"options": {
"collectionId": "nvf6k0yoiclmytu",
"cascadeDelete": false,
"minSelect": null,
"maxSelect": null,
"displayFields": null
}
}))
return dao.saveCollection(collection)
}, (db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf")
// add
collection.schema.addField(new SchemaField({
"system": false,
"id": "d13734ez",
"name": "tag",
"type": "text",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
}))
// remove
collection.schema.removeField("j65p3jji")
return dao.saveCollection(collection)
})

View File

@ -0,0 +1,16 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf")
collection.listRule = "@request.auth.id != \"\" && @request.auth.tag:each ?~ tag:each"
return dao.saveCollection(collection)
}, (db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf")
collection.listRule = null
return dao.saveCollection(collection)
})

View File

@ -0,0 +1,16 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf")
collection.viewRule = "@request.auth.id != \"\" && @request.auth.tag:each ?~ tag:each"
return dao.saveCollection(collection)
}, (db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf")
collection.viewRule = null
return dao.saveCollection(collection)
})

View File

@ -0,0 +1,33 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("_pb_users_auth_")
// add
collection.schema.addField(new SchemaField({
"system": false,
"id": "8d9woe75",
"name": "tag",
"type": "relation",
"required": false,
"presentable": false,
"unique": false,
"options": {
"collectionId": "nvf6k0yoiclmytu",
"cascadeDelete": false,
"minSelect": null,
"maxSelect": null,
"displayFields": null
}
}))
return dao.saveCollection(collection)
}, (db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("_pb_users_auth_")
// remove
collection.schema.removeField("8d9woe75")
return dao.saveCollection(collection)
})

View File

@ -0,0 +1,33 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("lft7642skuqmry7")
// add
collection.schema.addField(new SchemaField({
"system": false,
"id": "famdh2fv",
"name": "tag",
"type": "relation",
"required": false,
"presentable": false,
"unique": false,
"options": {
"collectionId": "nvf6k0yoiclmytu",
"cascadeDelete": false,
"minSelect": null,
"maxSelect": null,
"displayFields": null
}
}))
return dao.saveCollection(collection)
}, (db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("lft7642skuqmry7")
// remove
collection.schema.removeField("famdh2fv")
return dao.saveCollection(collection)
})

View File

@ -0,0 +1,18 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("lft7642skuqmry7")
collection.listRule = "@request.auth.id != \"\" && @request.auth.tag:each ?~ tag:each"
collection.viewRule = "@request.auth.id != \"\" && @request.auth.tag:each ?~ tag:each"
return dao.saveCollection(collection)
}, (db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("lft7642skuqmry7")
collection.listRule = null
collection.viewRule = null
return dao.saveCollection(collection)
})

View File

@ -0,0 +1,33 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("bc3g5s66bcq1qjp")
// add
collection.schema.addField(new SchemaField({
"system": false,
"id": "lbxw5pra",
"name": "tag",
"type": "relation",
"required": false,
"presentable": false,
"unique": false,
"options": {
"collectionId": "nvf6k0yoiclmytu",
"cascadeDelete": false,
"minSelect": null,
"maxSelect": null,
"displayFields": null
}
}))
return dao.saveCollection(collection)
}, (db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("bc3g5s66bcq1qjp")
// remove
collection.schema.removeField("lbxw5pra")
return dao.saveCollection(collection)
})

View File

@ -0,0 +1,18 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("bc3g5s66bcq1qjp")
collection.listRule = "@request.auth.id != \"\" && @request.auth.tag:each ?~ tag:each"
collection.viewRule = "@request.auth.id != \"\" && @request.auth.tag:each ?~ tag:each"
return dao.saveCollection(collection)
}, (db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("bc3g5s66bcq1qjp")
collection.listRule = null
collection.viewRule = null
return dao.saveCollection(collection)
})

View File

@ -0,0 +1,44 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("nvf6k0yoiclmytu")
// update
collection.schema.addField(new SchemaField({
"system": false,
"id": "0th8uax4",
"name": "name",
"type": "text",
"required": true,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
}))
return dao.saveCollection(collection)
}, (db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("nvf6k0yoiclmytu")
// update
collection.schema.addField(new SchemaField({
"system": false,
"id": "0th8uax4",
"name": "name",
"type": "text",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
}))
return dao.saveCollection(collection)
})

View File

@ -0,0 +1,48 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf")
// update
collection.schema.addField(new SchemaField({
"system": false,
"id": "j65p3jji",
"name": "tag",
"type": "relation",
"required": false,
"presentable": false,
"unique": false,
"options": {
"collectionId": "nvf6k0yoiclmytu",
"cascadeDelete": false,
"minSelect": null,
"maxSelect": 1,
"displayFields": null
}
}))
return dao.saveCollection(collection)
}, (db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf")
// update
collection.schema.addField(new SchemaField({
"system": false,
"id": "j65p3jji",
"name": "tag",
"type": "relation",
"required": false,
"presentable": false,
"unique": false,
"options": {
"collectionId": "nvf6k0yoiclmytu",
"cascadeDelete": false,
"minSelect": null,
"maxSelect": null,
"displayFields": null
}
}))
return dao.saveCollection(collection)
})

View File

@ -0,0 +1,18 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("nvf6k0yoiclmytu")
collection.listRule = "@request.auth.id != \"\""
collection.viewRule = "@request.auth.id != \"\""
return dao.saveCollection(collection)
}, (db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("nvf6k0yoiclmytu")
collection.listRule = null
collection.viewRule = null
return dao.saveCollection(collection)
})

View File

@ -0,0 +1,6 @@
from .mp_crawler import mp_crawler
from .simple_crawler import simple_crawler
from .general_scraper import llm_crawler
scraper_map = {}
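The pipeline in core/insights looks scrapers up by domain (urlparse(url).netloc) and calls them as (url, logger) -> (flag, article_dict), the same signature as the bundled crawlers; a hypothetical registration (module and function names invented) would look like:
```python
# from .my_site_scraper import my_site_scraper   # hypothetical site-specific scraper
#
# scraper_map = {
#     "news.example.com": my_site_scraper,        # key: domain, value: callable(url, logger) -> (flag, dict)
# }
```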

View File

@ -6,12 +6,16 @@ import httpx
from bs4 import BeautifulSoup
from bs4.element import Comment
from llms.dashscope_wrapper import dashscope_llm
# from llms.openai_wrapper import openai_llm
# from llms.siliconflow_wrapper import sfa_llm
from datetime import datetime, date
from requests.compat import urljoin
import chardet
from general_utils import extract_and_convert_dates
from utils.general_utils import extract_and_convert_dates
model = "qwen-long"
# model = "deepseek-chat"
header = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}
@ -62,7 +66,7 @@ def parse_html_content(out: str) -> dict:
# qwen1.5-72b is too error-prone when asked to emit JSON: web pages vary too much (stray English double quotes, for example), so the subsequent json.loads easily fails
sys_info = '''你是一个html网页解析器你将接收一段用户从网页html文件中提取的文本请解析出其标题、摘要、内容和发布日期发布日期格式为YYYY-MM-DD。
sys_info = '''你是一个html解析器你将接收一段html代码请解析出其标题、摘要、内容和发布日期发布日期格式为YYYY-MM-DD。
结果请按照以下格式返回整体用三引号包裹
"""
标题||摘要||内容||发布日期XXXX-XX-XX
@ -105,7 +109,8 @@ def llm_crawler(url: str | Path, logger) -> (int, dict):
{"role": "system", "content": sys_info},
{"role": "user", "content": html_text}
]
llm_output = dashscope_llm(messages, "qwen1.5-72b-chat", logger=logger)
llm_output = dashscope_llm(messages, model=model, logger=logger)
# llm_output = openai_llm(messages, model=model, logger=logger)
try:
info = parse_html_content(llm_output)
except Exception:

109
core/scrapers/mp_crawler.py Normal file
View File

@ -0,0 +1,109 @@
import httpx
from bs4 import BeautifulSoup
from datetime import datetime
import re
header = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}
def mp_crawler(url: str, logger) -> (int, dict):
if not url.startswith('https://mp.weixin.qq.com') and not url.startswith('http://mp.weixin.qq.com'):
logger.warning(f'{url} is not a mp url, you should not use this function')
return -5, {}
url = url.replace("http://", "https://", 1)
try:
with httpx.Client() as client:
response = client.get(url, headers=header, timeout=30)
except Exception as e:
logger.warning(f"cannot get content from {url}\n{e}")
return -7, {}
soup = BeautifulSoup(response.text, 'html.parser')
# Get the original release date first
pattern = r"var createTime = '(\d{4}-\d{2}-\d{2}) \d{2}:\d{2}'"
match = re.search(pattern, response.text)
if match:
date_only = match.group(1)
publish_time = date_only.replace('-', '')
else:
publish_time = datetime.strftime(datetime.today(), "%Y%m%d")
# Get description content from < meta > tag
try:
meta_description = soup.find('meta', attrs={'name': 'description'})
summary = meta_description['content'].strip() if meta_description else ''
card_info = soup.find('div', id='img-content')
# Parse the required content from the < div > tag
rich_media_title = soup.find('h1', id='activity-name').text.strip() \
if soup.find('h1', id='activity-name') \
else soup.find('h1', class_='rich_media_title').text.strip()
profile_nickname = card_info.find('strong', class_='profile_nickname').text.strip() \
if card_info \
else soup.find('div', class_='wx_follow_nickname').text.strip()
except Exception as e:
logger.warning(f"not mp format: {url}\n{e}")
return -7, {}
if not rich_media_title or not profile_nickname:
logger.warning(f"failed to analyze {url}, no title or profile_nickname")
# For mp.weixin.qq.com types, mp_crawler won't work, and most likely neither will the other two
return -7, {}
# Parse text and image links within the content interval
# Todo This scheme is compatible with picture sharing MP articles, but the pictures of the content cannot be obtained,
# because the structure of this part is completely different, and a separate analysis scheme needs to be written
# (but the proportion of this type of article is not high).
texts = []
images = set()
content_area = soup.find('div', id='js_content')
if content_area:
# Extract the text
for section in content_area.find_all(['section', 'p'], recursive=False):  # iterate over top-level sections
text = section.get_text(separator=' ', strip=True)
if text and text not in texts:
texts.append(text)
for img in content_area.find_all('img', class_='rich_pages wxw-img'):
img_src = img.get('data-src') or img.get('src')
if img_src:
images.add(img_src)
cleaned_texts = [t for t in texts if t.strip()]
content = '\n'.join(cleaned_texts)
else:
logger.warning(f"failed to analyze contents {url}")
return 0, {}
if content:
content = f"({profile_nickname} 文章){content}"
else:
# If the content does not have it, but the summary has it, it means that it is an mp of the picture sharing type.
# At this time, you can use the summary as the content.
content = f"({profile_nickname} 文章){summary}"
# Get links to images in meta property = "og: image" and meta property = "twitter: image"
og_image = soup.find('meta', property='og:image')
twitter_image = soup.find('meta', property='twitter:image')
if og_image:
images.add(og_image['content'])
if twitter_image:
images.add(twitter_image['content'])
if rich_media_title == summary or not summary:
abstract = ''
else:
abstract = f"({profile_nickname} 文章){rich_media_title}——{summary}"
return 11, {
'title': rich_media_title,
'author': profile_nickname,
'publish_time': publish_time,
'abstract': abstract,
'content': content,
'images': list(images),
'url': url,
}
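A minimal usage sketch for mp_crawler above (the URL is a placeholder and the import path is assumed); the flag follows the shared convention: -5 invalid input, -7 fetch/parse failure, 0 no usable content, 11 success:

```python
from loguru import logger
from scrapers.mp_crawler import mp_crawler  # import path assumed

flag, article = mp_crawler("https://mp.weixin.qq.com/s/PLACEHOLDER", logger)
if flag == 11:
    print(article['title'], article['publish_time'])
elif flag == -7:
    print("fetch or parse failed, worth retrying later")
else:
    print(f"no usable result, flag={flag}")
```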

View File

@ -3,7 +3,7 @@ import httpx
from bs4 import BeautifulSoup
from datetime import datetime
from pathlib import Path
from general_utils import extract_and_convert_dates
from utils.general_utils import extract_and_convert_dates
import chardet

152
core/tasks.py Normal file
View File

@ -0,0 +1,152 @@
"""
Edit this script to customize the background tasks you need.
"""
import schedule
import time
from topnews import pipeline
from loguru import logger
from utils.pb_api import PbTalker
import os
from utils.general_utils import get_logger_level
from datetime import datetime, timedelta
import pytz
import requests
project_dir = os.environ.get("PROJECT_DIR", "")
if project_dir:
os.makedirs(project_dir, exist_ok=True)
logger_file = os.path.join(project_dir, 'tasks.log')
dsw_log = get_logger_level()
logger.add(
logger_file,
level=dsw_log,
backtrace=True,
diagnose=True,
rotation="50 MB"
)
pb = PbTalker(logger)
utc_now = datetime.now(pytz.utc)
# Subtract one day to get the previous day's UTC time
utc_yesterday = utc_now - timedelta(days=1)
utc_last = utc_yesterday.strftime("%Y-%m-%d %H:%M:%S")
def task():
"""
global counter
sites = pb.read('sites', filter='activated=True')
urls = []
for site in sites:
if not site['per_hours'] or not site['url']:
continue
if counter % site['per_hours'] == 0:
urls.append(site['url'])
logger.info(f'\033[0;32m task execute loop {counter}\033[0m')
logger.info(urls)
if urls:
sp(sites=urls)
else:
if counter % 24 == 0:
sp()
else:
print('\033[0;33mno work for this loop\033[0m')
counter += 1
"""
global utc_last
logger.debug(f'last_collect_time: {utc_last}')
datas = pb.read(collection_name='insights', filter=f'updated>="{utc_last}"', fields=['id', 'content', 'tag', 'articles'])
logger.debug(f"got {len(datas)} items")
utc_last = datetime.now(pytz.utc).strftime("%Y-%m-%d %H:%M:%S")
logger.debug(f'now_utc_time: {utc_last}')
tags = pb.read(collection_name='tags', filter=f'activated=True')
tags_dict = {item["id"]: item["name"] for item in tags if item["name"]}
top_news = {}
for id, name in tags_dict.items():
logger.debug(f'tag: {name}')
data = [item for item in datas if item['tag'] == id]
topnew = pipeline(data, logger)
if not topnew:
logger.debug(f'no top news for {name}')
continue
top_news[id] = {}
for content, articles in topnew.items():
content_urls = [pb.read('articles', filter=f'id="{a}"', fields=['url'])[0]['url'] for a in articles]
# Remove overlapping content
# If overlapping content is found, drop it from whichever tag currently holds the longer list
to_skip = False
for k, v in top_news.items():
to_del_key = None
for c, u in v.items():
if not set(content_urls).isdisjoint(set(u)):
if len(topnew) > len(v):
to_skip = True
else:
to_del_key = c
break
if to_del_key:
del top_news[k][to_del_key]
if to_skip:
break
if not to_skip:
top_news[id][content] = content_urls
if not top_news[id]:
del top_news[id]
if not top_news:
logger.info("no top news today")
return
# Serialize into a string
top_news_text = {"#党建引领基层治理": [],
"#数字社区": [],
"#优秀活动案例": []}
for id, v in top_news.items():
# top_news[id] = {content: '\n\n'.join(urls) for content, urls in v.items()}
top_news[id] = {content: urls[0] for content, urls in v.items()}
if id == 's3kqj9ek8nvtthr':
top_news_text["#数字社区"].append("\n".join(f"{content}\n{urls}" for content, urls in top_news[id].items()))
elif id == 'qpcgotbqyz3a617':
top_news_text["#优秀活动案例"].append("\n".join(f"{content}\n{urls}" for content, urls in top_news[id].items()))
else:
top_news_text["#党建引领基层治理"].append("\n".join(f"{content}\n{urls}" for content, urls in top_news[id].items()))
top_news_text = {k: "\n".join(v) for k, v in top_news_text.items()}
top_news_text = "\n\n".join(f"{k}\n{v}" for k, v in top_news_text.items())
logger.info(top_news_text)
data = {
"wxid": "R:10860349446619856",
"content": top_news_text
}
try:
response = requests.post("http://localhost:8088/api/sendtxtmsg", json=data)
if response.status_code == 200:
logger.info("send message to wechat success")
time.sleep(1)
data = {
"wxid": "R:10860349446619856",
"content": "[太阳] 今日份的临小助内参来啦!",
"atlist": ["@all"]
}
try:
response = requests.post("http://localhost:8088/api/sendtxtmsg", json=data)
if response.status_code == 200:
logger.info("send notify to wechat success")
except Exception as e:
logger.error(f"send notify to wechat failed: {e}")
except Exception as e:
logger.error(f"send message to wechat failed: {e}")
schedule.every().day.at("07:38").do(task)
task()
while True:
schedule.run_pending()
time.sleep(60)
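Since the script above is meant to be edited for custom background jobs, an extra task can be hung off the same loop; a hypothetical sketch (the job body and interval are illustrative, not part of this commit):

```python
# Hypothetical extra job: re-read the activated sites every 4 hours and log the count.
def sites_check():
    sites = pb.read('sites', filter='activated=True')
    logger.info(f"currently {len(sites)} activated sites")

# register it before the blocking `while True` loop so run_pending() picks it up
schedule.every(4).hours.do(sites_check)
```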

0
core/utils/__init__.py Normal file
View File

View File

@ -6,6 +6,7 @@ from urllib.parse import urlparse
import time
import os
import re
import jieba
def isURL(string):
@ -13,6 +14,15 @@ def isURL(string):
return result.scheme != '' and result.netloc != ''
def extract_urls(text):
url_pattern = re.compile(r'https?://[-A-Za-z0-9+&@#/%?=~_|!:.;]+[-A-Za-z0-9+&@#/%=~_|]')
urls = re.findall(url_pattern, text)
# Filter out matches that only hit 'www.' with nothing after it, and try adding a default http scheme to each URL so it can be parsed
cleaned_urls = [url for url in urls if isURL(url)]
return cleaned_urls
def isChinesePunctuation(char):
# Unicode ranges covering Chinese punctuation
chinese_punctuations = set(range(0x3000, 0x303F)) | set(range(0xFF00, 0xFFEF))
@ -162,6 +172,29 @@ def get_logger_level() -> str:
return level_map.get(level, 'info')
def compare_phrase_with_list(target_phrase, phrase_list, threshold):
"""
Compare the similarity between a target phrase and every phrase in a list.
:param target_phrase: target phrase (str)
:param phrase_list: list of phrases (list of str)
:param threshold: similarity threshold (float)
:return: phrases that meet the similarity threshold (list of str)
"""
# Check whether the target phrase is empty
if not target_phrase:
return []  # empty target phrase, return an empty list directly
# Preprocessing: tokenize the target phrase and every phrase in the list
target_tokens = set(jieba.lcut(target_phrase))
tokenized_phrases = {phrase: set(jieba.lcut(phrase)) for phrase in phrase_list}
# Compare and filter
similar_phrases = [phrase for phrase, tokens in tokenized_phrases.items()
if len(target_tokens & tokens) / min(len(target_tokens), len(tokens)) > threshold]
return similar_phrases
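For reference, a quick usage sketch for compare_phrase_with_list above (phrases and threshold are illustrative); note the overlap ratio is taken over the smaller token set, so short phrases clear the threshold relatively easily:

```python
# Illustrative call: jieba tokenizes both sides, then token overlap is compared.
hits = compare_phrase_with_list("数字社区建设", ["数字社区治理情况", "党建引领基层治理"], 0.5)
# With typical jieba segmentation only the first candidate shares enough tokens to pass 0.5.
print(hits)
```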
"""
# from InternLM/huixiangdou
# another awesome work

View File

@ -13,28 +13,28 @@ class PbTalker:
self.client = PocketBase(url)
auth = os.environ.get('PB_API_AUTH', '')
if not auth or "|" not in auth:
self.logger.warning(f"invalid email|password found, will handle with not auth, make sure you have set the collection rule by anyone")
self.logger.warning("invalid email|password found, will proceed without auth; make sure the collection rules allow access by anyone")
else:
email, password = auth.split('|')
_ = self.client.admins.auth_with_password(email, password)
if _:
self.logger.info(f"pocketbase ready authenticated as admin - {email}")
else:
raise Exception(f"pocketbase auth failed")
try:
admin_data = self.client.admins.auth_with_password(email, password)
if admin_data:
self.logger.info(f"pocketbase ready authenticated as admin - {email}")
except Exception:
user_data = self.client.collection("users").auth_with_password(email, password)
if user_data:
self.logger.info(f"pocketbase ready authenticated as user - {email}")
else:
raise Exception("pocketbase auth failed")
def read(self, collection_name: str, fields: list[str] = None, filter: str = '', skiptotal: bool = True) -> list:
results = []
for i in range(1, 10):
try:
if fields:
res = self.client.collection(collection_name).get_list(i, 500,
{"filter": filter,
"fields": ','.join(fields),
"skiptotal": skiptotal})
else:
res = self.client.collection(collection_name).get_list(i, 500,
{"filter": filter,
"skiptotal": skiptotal})
res = self.client.collection(collection_name).get_list(i, 500,
{"filter": filter,
"fields": ','.join(fields) if fields else '',
"skiptotal": skiptotal})
except Exception as e:
self.logger.error(f"pocketbase get list failed: {e}")
@ -79,3 +79,11 @@ class PbTalker:
self.logger.error(f"pocketbase update failed: {e}")
return ''
return res.id
def view(self, collection_name: str, item_id: str, fields: list[str] = None) -> dict:
try:
res = self.client.collection(collection_name).get_one(item_id,{"fields": ','.join(fields) if fields else ''})
return vars(res)
except Exception as e:
self.logger.error(f"pocketbase view item failed: {e}")
return {}
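For orientation, a minimal sketch of how the PbTalker surface added here is used elsewhere in this commit, assuming the PocketBase address and PB_API_AUTH (email|password) are configured in the environment; the filter string and field names are illustrative:

```python
from loguru import logger
from utils.pb_api import PbTalker

pb = PbTalker(logger)
# read() pages through the collection 500 records at a time and returns a list of dicts
insights = pb.read('insights', fields=['id', 'content', 'tag'], filter='updated>="2024-06-12 00:00:00"')
# view() fetches a single record by id and returns {} on failure
detail = pb.view('insights', insights[0]['id'], fields=['content']) if insights else {}
```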

71
dashboard/README.md Normal file
View File

@ -0,0 +1,71 @@
**Included Web Dashboard Example**: This is optional. If you only use the data processing functions or have your own downstream task program, you can ignore everything in this folder!
## Main Features
1. Daily Insights Display
2. Daily Article Display
3. Follow-up Search for Specific Hot Topics (using the Sogou engine)
4. Generating Word Reports for Specific Hot Topics
**Note: The code here cannot be used directly. It is adapted to an older version of the backend. You need to study the latest backend code in the `core` folder and make changes, especially in parts related to database integration!**
-----------------------------------------------------------------
附带的web Dashboard 示例,并非必须,如果你只是使用数据处理功能,或者你有自己的下游任务程序,可以忽略这个文件夹内的一切!
## 主要功能
1. 每日insights展示
2. 每日文章展示
3. 指定热点追加搜索使用sogou引擎
4. 指定热点生成word报告
**注意这里的代码并不能直接使用它适配的是旧版本的后端程序你需要研究core文件夹下的最新后端代码进行更改尤其是跟数据库对接的部分**
-----------------------------------------------------------------
**付属のWebダッシュボードのサンプル**:これは必須ではありません。データ処理機能のみを使用する場合、または独自の下流タスクプログラムを持っている場合は、このフォルダ内のすべてを無視できます!
## 主な機能
1. 毎日のインサイト表示
2. 毎日の記事表示
3. 特定のホットトピックの追加検索Sogouエンジンを使用
4. 特定のホットトピックのWordレポートの生成
**注意:ここにあるコードは直接使用できません。古いバージョンのバックエンドに適合しています。`core`フォルダ内の最新のバックエンドコードを調べ、特にデータベースとの連携部分について変更を行う必要があります!**
-----------------------------------------------------------------
**Exemple de tableau de bord Web inclus** : Ceci est facultatif. Si vous n'utilisez que les fonctions de traitement des données ou si vous avez votre propre programme de tâches en aval, vous pouvez ignorer tout ce qui se trouve dans ce dossier !
## Fonctions principales
1. Affichage des insights quotidiens
2. Affichage des articles quotidiens
3. Recherche supplémentaire pour des sujets populaires spécifiques (en utilisant le moteur Sogou)
4. Génération de rapports Word pour des sujets populaires spécifiques
**Remarque : Le code ici ne peut pas être utilisé directement. Il est adapté à une version plus ancienne du backend. Vous devez étudier le code backend le plus récent dans le dossier `core` et apporter des modifications, en particulier dans les parties relatives à l'intégration de la base de données !**
-----------------------------------------------------------------
**Beispiel eines enthaltenen Web-Dashboards**: Dies ist optional. Wenn Sie nur die Datenverarbeitungsfunktionen verwenden oder Ihr eigenes Downstream-Aufgabenprogramm haben, können Sie alles in diesem Ordner ignorieren!
## Hauptfunktionen
1. Tägliche Einblicke anzeigen
2. Tägliche Artikel anzeigen
3. Angehängte Suche nach spezifischen Hot Topics (unter Verwendung der Sogou-Suchmaschine)
4. Erstellen von Word-Berichten für spezifische Hot Topics
**Hinweis: Der Code hier kann nicht direkt verwendet werden. Er ist an eine ältere Version des Backends angepasst. Sie müssen den neuesten Backend-Code im `core`-Ordner studieren und Änderungen vornehmen, insbesondere in den Teilen, die die Datenbankintegration betreffen!**

View File

@ -2,7 +2,6 @@ import os
import time
import json
import uuid
from pb_api import PbTalker
from get_report import get_report, logger, pb
from get_search import search_insight
from tranlsation_volcengine import text_translate
@ -22,12 +21,6 @@ class BackendService:
logger.info('backend service init success.')
def report(self, insight_id: str, topics: list[str], comment: str) -> dict:
"""
:param insight_id: id of the insight in pb
:param topics: topic and outline for the report (required); the first element is the title and the rest are section headings. An empty list lets the AI decide freely
:param comment: revision comments (optional)
:return: the updated insight_id on success (actually the same as the original id), an empty string on failure
"""
logger.debug(f'got new report request insight_id {insight_id}')
insight = pb.read('insights', filter=f'id="{insight_id}"')
if not insight:
@ -48,8 +41,6 @@ class BackendService:
return self.build_out(-2, f'{insight_id} has no valid articles')
content = insight[0]['content']
# The content of every related article used to be translated into Chinese here (the analysis runs in Chinese), because some domain terms are maintained in the Volcano Engine account glossary and the LLM does not know them
# After translating into Chinese, the chance of triggering DashScope's sensitive-word detection went up, so this is shelved for now...
if insight_id in self.memory:
memory = self.memory[insight_id]
else:
@ -78,11 +69,7 @@ class BackendService:
def translate(self, article_ids: list[str]) -> dict:
"""
:param article_ids: list of article ids to translate
:return: flag 11 on success; a negative flag means an error, but some items may still have completed, so you can call again later
The msg field in the response records any errors
This function iterates over the given ids; if an article record has no translation_result yet, it triggers translation and updates that record
After this function runs, once flag 11 is received you can read the translation_result for each article id from pb again
Just for Chinese users
"""
logger.debug(f'got new translate task {article_ids}')
flag = 11
@ -155,10 +142,6 @@ class BackendService:
return self.build_out(flag, msg)
def more_search(self, insight_id: str) -> dict:
"""
:param insight_id: id of the insight in pb
:return: the updated insight_id on success (actually the same as the original id), an empty string on failure
"""
logger.debug(f'got search request for insight {insight_id}')
insight = pb.read('insights', filter=f'id="{insight_id}"')
if not insight:

View File

@ -0,0 +1,65 @@
from urllib.parse import urlparse
import os
import re
def isURL(string):
result = urlparse(string)
return result.scheme != '' and result.netloc != ''
def isChinesePunctuation(char):
# Unicode ranges covering Chinese punctuation
chinese_punctuations = set(range(0x3000, 0x303F)) | set(range(0xFF00, 0xFFEF))
# Check whether the character falls within these ranges
return ord(char) in chinese_punctuations
def is_chinese(string):
"""
Volcano Engine could actually support much broader language detection; consider https://www.volcengine.com/docs/4640/65066 in the future
Determine whether a string is mostly Chinese
:param string: {str} the string to check
:return: {bool} True if it is mostly Chinese, otherwise False
"""
pattern = re.compile(r'[^\u4e00-\u9fa5]')
non_chinese_count = len(pattern.findall(string))
# It is easy to misjudge strictly according to the number of bytes less than half. English words account for a large number of bytes, and there are punctuation marks, etc
return (non_chinese_count/len(string)) < 0.68
def extract_and_convert_dates(input_string):
# Define regular expressions that match different date formats
patterns = [
r'(\d{4})-(\d{2})-(\d{2})', # YYYY-MM-DD
r'(\d{4})/(\d{2})/(\d{2})', # YYYY/MM/DD
r'(\d{4})\.(\d{2})\.(\d{2})', # YYYY.MM.DD
r'(\d{4})\\(\d{2})\\(\d{2})', # YYYY\MM\DD
r'(\d{4})(\d{2})(\d{2})' # YYYYMMDD
]
matches = []
for pattern in patterns:
matches = re.findall(pattern, input_string)
if matches:
break
if matches:
return ''.join(matches[0])
return None
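A couple of illustrative calls for extract_and_convert_dates above; every recognized layout normalizes to the compact YYYYMMDD form, and the first matching pattern wins:

```python
# Illustrative: only the date portion is kept, the rest of the string is ignored.
assert extract_and_convert_dates("发布于 2024-06-13 21:08") == "20240613"
assert extract_and_convert_dates("2024/06/13 update") == "20240613"
assert extract_and_convert_dates("no date here") is None
```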
def get_logger_level() -> str:
level_map = {
'silly': 'CRITICAL',
'verbose': 'DEBUG',
'info': 'INFO',
'warn': 'WARNING',
'error': 'ERROR',
}
level: str = os.environ.get('WS_LOG', 'info').lower()
if level not in level_map:
raise ValueError(
'WiseFlow LOG should support the values of `silly`, '
'`verbose`, `info`, `warn`, `error`'
)
return level_map.get(level, 'info')

View File

@ -1,7 +1,7 @@
import random
import re
import os
from llms.dashscope_wrapper import dashscope_llm
from backend.llms.dashscope_wrapper import dashscope_llm
from docx import Document
from docx.oxml.ns import qn
from docx.shared import Pt, RGBColor

View File

@ -1,31 +1,21 @@
from scrapers.simple_crawler import simple_crawler
from .simple_crawler import simple_crawler
from .mp_crawler import mp_crawler
from typing import Union
from pathlib import Path
import requests
import re
import json
from urllib.parse import quote, urlparse
from urllib.parse import quote
from bs4 import BeautifulSoup
import time
# For use cases inside China, Sogou search should be good enough; it also supports WeChat (weixin) and Baike encyclopedia search
# For overseas use cases, consider duckduckgo or the google_search SDK
# Try not to host a search engine yourself; there are open-source options like https://github.com/StractOrg/stract/tree/main, but that is a whole separate project to run
def search_insight(keyword: str, logger, exist_urls: list[Union[str, Path]], knowledge: bool = False) -> (int, list):
"""
Search the web
:param keyword: topic to search for
:param exist_urls: URLs already present in earlier results; any that appear again are skipped
:param knowledge: whether to search for conceptual knowledge
:param logger: logger
:return: flag and a list of article info dicts (list[dict]); a negative flag is an error, 0 means no result, 11 means success
"""
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.44",
}
# 如果knowledge参数为真则意味着搜索概念知识这时只会搜索sogou百科
# 默认是搜索新闻资讯同时搜索sogou网页和资讯
# If the knowledge parameter is true, it means searching for conceptual knowledge, then only sogou encyclopedia will be searched
# The default is to search for news information, and search for sogou pages and information at the same time
if knowledge:
url = f"https://www.sogou.com/sogou?query={keyword}&insite=baike.sogou.com"
else:
@ -74,24 +64,24 @@ def search_insight(keyword: str, logger, exist_urls: list[Union[str, Path]], kno
if not relist:
return -7, []
# Only simple_crawler is used here, because search needs to be fast
results = []
for url in relist:
if url in exist_urls:
continue
exist_urls.append(url)
flag, value = simple_crawler(url, logger)
if url.startswith('https://mp.weixin.qq.com') or url.startswith('http://mp.weixin.qq.com'):
flag, article = mp_crawler(url, logger)
if flag == -7:
logger.info(f"fetch {url} failed, try to wait 1min and try again")
time.sleep(60)
flag, article = mp_crawler(url, logger)
else:
flag, article = simple_crawler(url, logger)
if flag != 11:
continue
from_site = urlparse(url).netloc
if from_site.startswith('www.'):
from_site = from_site.replace('www.', '')
from_site = from_site.split('.')[0]
if value['abstract']:
value['abstract'] = f"({from_site} 报道){value['abstract']}"
value['content'] = f"({from_site} 报道){value['content']}"
value['images'] = json.dumps(value['images'])
results.append(value)
results.append(article)
if results:
return 11, results
@ -102,7 +92,7 @@ def redirect_url(url):
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
}
r = requests.get(url, headers=headers, allow_redirects=False) # 不允许重定向
r = requests.get(url, headers=headers, allow_redirects=False)
if r.status_code == 302:
real_url = r.headers.get('Location')
else:

View File

@ -1,4 +1,3 @@
# This is the FastAPI application for the backend service
from fastapi import FastAPI
from pydantic import BaseModel
from __init__ import BackendService
@ -17,13 +16,13 @@ class TranslateRequest(BaseModel):
class ReportRequest(BaseModel):
insight_id: str
toc: list[str] = [""] # 第一个元素为大标题其余为段落标题。第一个元素必须存在可以是空字符llm会自动拟标题。
toc: list[str] = [""]  # The first element is the headline and the rest are section headings. The first element must exist but may be an empty string, in which case the LLM will generate the headline automatically.
comment: str = ""
app = FastAPI(
title="首席情报官 Backend Server",
description="From DSW Team.",
title="wiseflow Backend Server",
description="From WiseFlow Team.",
version="0.2",
openapi_url="/openapi.json"
)
@ -36,13 +35,12 @@ app.add_middleware(
allow_headers=["*"],
)
# If there are multiple backend services, their instances can be defined here
bs = BackendService()
@app.get("/")
def read_root():
msg = "Hello, 欢迎使用首席情报官 Backend."
msg = "Hello, this is the WiseFlow Backend."
return {"msg": msg}
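The route handlers for report/translate/search sit outside this hunk, but the BackendService methods documented above can be exercised directly against the bs instance; a hedged sketch (the ids are placeholders, and the response is assumed to carry the shared flag/result structure):

```python
# Hedged sketch, not an endpoint definition.
out = bs.report("INSIGHT_ID_PLACEHOLDER", ["", "背景", "分析"], "")  # empty first element lets the LLM title it
if out.get("flag") == 11:
    print("report generated; re-read the insight record from pb")

out = bs.translate(["ARTICLE_ID_1", "ARTICLE_ID_2"])
if out.get("flag") == 11:
    print("translation_result fields are now populated in pb")
```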

109
dashboard/mp_crawler.py Normal file
View File

@ -0,0 +1,109 @@
import httpx
from bs4 import BeautifulSoup
from datetime import datetime
import re
header = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}
def mp_crawler(url: str, logger) -> (int, dict):
if not url.startswith('https://mp.weixin.qq.com') and not url.startswith('http://mp.weixin.qq.com'):
logger.warning(f'{url} is not a mp url, you should not use this function')
return -5, {}
url = url.replace("http://", "https://", 1)
try:
with httpx.Client() as client:
response = client.get(url, headers=header, timeout=30)
except Exception as e:
logger.warning(f"cannot get content from {url}\n{e}")
return -7, {}
soup = BeautifulSoup(response.text, 'html.parser')
# Get the original release date first
pattern = r"var createTime = '(\d{4}-\d{2}-\d{2}) \d{2}:\d{2}'"
match = re.search(pattern, response.text)
if match:
date_only = match.group(1)
publish_time = date_only.replace('-', '')
else:
publish_time = datetime.strftime(datetime.today(), "%Y%m%d")
# Get description content from < meta > tag
try:
meta_description = soup.find('meta', attrs={'name': 'description'})
summary = meta_description['content'].strip() if meta_description else ''
card_info = soup.find('div', id='img-content')
# Parse the required content from the < div > tag
rich_media_title = soup.find('h1', id='activity-name').text.strip() \
if soup.find('h1', id='activity-name') \
else soup.find('h1', class_='rich_media_title').text.strip()
profile_nickname = card_info.find('strong', class_='profile_nickname').text.strip() \
if card_info \
else soup.find('div', class_='wx_follow_nickname').text.strip()
except Exception as e:
logger.warning(f"not mp format: {url}\n{e}")
return -7, {}
if not rich_media_title or not profile_nickname:
logger.warning(f"failed to analyze {url}, no title or profile_nickname")
# For mp.weixin.qq.com types, mp_crawler won't work, and most likely neither will the other two
return -7, {}
# Parse text and image links within the content interval
# Todo This scheme is compatible with picture sharing MP articles, but the pictures of the content cannot be obtained,
# because the structure of this part is completely different, and a separate analysis scheme needs to be written
# (but the proportion of this type of article is not high).
texts = []
images = set()
content_area = soup.find('div', id='js_content')
if content_area:
# Extract the text
for section in content_area.find_all(['section', 'p'], recursive=False):  # iterate over top-level sections
text = section.get_text(separator=' ', strip=True)
if text and text not in texts:
texts.append(text)
for img in content_area.find_all('img', class_='rich_pages wxw-img'):
img_src = img.get('data-src') or img.get('src')
if img_src:
images.add(img_src)
cleaned_texts = [t for t in texts if t.strip()]
content = '\n'.join(cleaned_texts)
else:
logger.warning(f"failed to analyze contents {url}")
return 0, {}
if content:
content = f"({profile_nickname} 文章){content}"
else:
# If the content does not have it, but the summary has it, it means that it is an mp of the picture sharing type.
# At this time, you can use the summary as the content.
content = f"({profile_nickname} 文章){summary}"
# Get links to images in meta property = "og: image" and meta property = "twitter: image"
og_image = soup.find('meta', property='og:image')
twitter_image = soup.find('meta', property='twitter:image')
if og_image:
images.add(og_image['content'])
if twitter_image:
images.add(twitter_image['content'])
if rich_media_title == summary or not summary:
abstract = ''
else:
abstract = f"({profile_nickname} 文章){rich_media_title}——{summary}"
return 11, {
'title': rich_media_title,
'author': profile_nickname,
'publish_time': publish_time,
'abstract': abstract,
'content': content,
'images': list(images),
'url': url,
}

View File

@ -0,0 +1,60 @@
from gne import GeneralNewsExtractor
import httpx
from bs4 import BeautifulSoup
from datetime import datetime
from pathlib import Path
from utils.general_utils import extract_and_convert_dates
import chardet
extractor = GeneralNewsExtractor()
header = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}
def simple_crawler(url: str | Path, logger) -> (int, dict):
"""
Returns (flag, article info dict); a negative flag is an error, 0 means no result, 11 means success
"""
try:
with httpx.Client() as client:
response = client.get(url, headers=header, timeout=30)
rawdata = response.content
encoding = chardet.detect(rawdata)['encoding']
text = rawdata.decode(encoding)
result = extractor.extract(text)
except Exception as e:
logger.warning(f"cannot get content from {url}\n{e}")
return -7, {}
if not result:
logger.error(f"gne cannot extract {url}")
return 0, {}
if len(result['title']) < 4 or len(result['content']) < 24:
logger.info(f"{result} not valid")
return 0, {}
if result['title'].startswith('服务器错误') or result['title'].startswith('您访问的页面') or result['title'].startswith('403')\
or result['content'].startswith('This website uses cookies') or result['title'].startswith('出错了'):
logger.warning(f"can not get {url} from the Internet")
return -7, {}
date_str = extract_and_convert_dates(result['publish_time'])
if date_str:
result['publish_time'] = date_str
else:
result['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")
soup = BeautifulSoup(text, "html.parser")
try:
meta_description = soup.find("meta", {"name": "description"})
if meta_description:
result['abstract'] = meta_description["content"].strip()
else:
result['abstract'] = ''
except Exception:
result['abstract'] = ''
result['url'] = str(url)
return 11, result
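Both crawlers in this commit return the same (flag, dict) pair, so call sites such as get_search dispatch on the URL host; a condensed sketch of that pattern (the retry delay mirrors the search code, the import paths are assumed):

```python
import time

from mp_crawler import mp_crawler          # assumed import paths for the two
from simple_crawler import simple_crawler  # crawlers defined in this commit


def fetch_article(url: str, logger) -> (int, dict):
    # WeChat MP pages need the dedicated parser; everything else goes through gne.
    if url.startswith('https://mp.weixin.qq.com') or url.startswith('http://mp.weixin.qq.com'):
        flag, article = mp_crawler(url, logger)
        if flag == -7:  # transient fetch failure: wait a minute and retry once
            time.sleep(60)
            flag, article = mp_crawler(url, logger)
    else:
        flag, article = simple_crawler(url, logger)
    return flag, article
```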

View File

@ -1,10 +1,11 @@
# 使用火山引擎进行翻译的接口封装
# 通过环境变量设置VOLC_KEY格式为AK|SK
# AK-SK 需要手机号注册并实名认证具体见这里https://console.volcengine.com/iam/keymanage/ (自助接入)
# 费用每月免费额度200万字符1个汉字、1个外语字母、1个数字、1个符号或空格都计为一个字符超出后49元/每百万字符
# 图片翻译每月免费100张超出后0.04元/张
# 文本翻译并发限制每个batch最多16个总文本长度不超过5000字符max QPS为10
# 术语库管理https://console.volcengine.com/translate
# Interface encapsulation for translation using Volcano Engine
# Set VOLC_KEY by environment variables in the format AK | SK
# AK-SK requires mobile phone number registration and real-name authentication, see here https://console.volcengine.com/iam/keymanage/(self-service access)
# Cost: Monthly free limit 2 million characters (1 Chinese character, 1 foreign language letter, 1 number, 1 symbol or space are counted as one character),
# exceeding 49 yuan/per million characters
# Picture translation: 100 pieces per month for free, 0.04 yuan/piece after exceeding
# Text translation concurrency limit, up to 16 per batch, the total text length does not exceed 5000 characters, max QPS is 10
# Terminology database management: https://console.volcengine.com/translate
import json
@ -18,7 +19,7 @@ from volcengine.base.Service import Service
VOLC_KEY = os.environ.get('VOLC_KEY', None)
if not VOLC_KEY:
raise Exception('请设置环境变量 VOLC_KEY格式为AK|SK')
raise Exception('Please set environment variables VOLC_KEY format as AK | SK')
k_access_key, k_secret_key = VOLC_KEY.split('|')
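The batch limits noted above (at most 16 texts and 5,000 characters total per request) mean callers must chunk their input before invoking the translate service; a hedged helper sketch, not part of this module:

```python
def batch_texts(texts: list[str], max_items: int = 16, max_chars: int = 5000) -> list[list[str]]:
    """Group texts into batches that respect the per-request limits documented above."""
    batches, current, current_len = [], [], 0
    for text in texts:
        # start a new batch once either limit would be exceeded;
        # a single over-long text still gets its own batch (the API will reject it)
        if current and (len(current) >= max_items or current_len + len(text) > max_chars):
            batches.append(current)
            current, current_len = [], 0
        current.append(text)
        current_len += len(text)
    if current:
        batches.append(current)
    return batches
```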

View File

Before

Width:  |  Height:  |  Size: 1.5 KiB

After

Width:  |  Height:  |  Size: 1.5 KiB

View File

Before

Width:  |  Height:  |  Size: 4.0 KiB

After

Width:  |  Height:  |  Size: 4.0 KiB

Some files were not shown because too many files have changed in this diff.