first commit for V0.3.22

This commit is contained in:
bigbrother666sh 2024-12-05 12:11:28 +08:00
parent 2e01ba5ba7
commit 61251547a0
51 changed files with 266 additions and 2215 deletions

View File

@ -1,21 +1,19 @@
FROM python:3.10-slim
RUN apt-get update && \
apt-get install -yq tzdata build-essential unzip && \
apt-get clean
apt-get install -y tzdata build-essential unzip
COPY core/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir -r /tmp/requirements.txt
WORKDIR /app
COPY core/requirements.txt requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
COPY core .
# download and unzip PocketBase
ADD https://github.com/pocketbase/pocketbase/releases/download/v0.22.13/pocketbase_0.22.13_linux_amd64.zip /tmp/pb.zip
ADD https://github.com/pocketbase/pocketbase/releases/download/v0.23.4/pocketbase_0.23.4_linux_amd64.zip /tmp/pb.zip
# for arm device
# ADD https://github.com/pocketbase/pocketbase/releases/download/v0.22.13/pocketbase_0.22.13_linux_arm64.zip /tmp/pb.zip
# ADD https://github.com/pocketbase/pocketbase/releases/download/v0.23.4/pocketbase_0.23.4_linux_arm64.zip /tmp/pb.zip
RUN unzip /tmp/pb.zip -d /app/pb/
RUN apt-get clean && rm -rf /var/lib/apt/lists/*
EXPOSE 8090
EXPOSE 8077

View File

@ -42,11 +42,11 @@ https://github.com/TeamWiseFlow/wiseflow/assets/96130569/bd4b2091-c02d-4457-9ec6
However, we have also noticed that some followers misunderstand wiseflow's positioning. To avoid confusion, we have put together the following table, which clearly shows how wiseflow compares with crawler tools, AI search, and knowledge-base (RAG) projects:
| | **Chief Intelligence Officer (Wiseflow)** |
| | Comparison with **Chief Intelligence Officer (Wiseflow)** |
|-------------|-----------------|
| **Crawler tools** | wiseflow integrates many excellent open-source crawler tools and adds LLM-based automated information filtering, screening, and classification, so you can simply think of wiseflow as crawler tools + AI |
| **AI search** | The main scenario for AI search is **instant Q&A on specific questions**, e.g. "Who is the founder of company XX?" or "Where can I buy product xx of brand xx?"; the main scenario for wiseflow is **continuous collection of information on a given topic**, such as tracking information related to company XX or continuously tracking the market behavior of brand XX... In these scenarios the user can only provide a focus point (a company, a brand) but cannot pose a concrete search question, and it is not a one-off retrieval: it requires continuous tracking, or automated tracking of related information. You can simply think of wiseflow as an "agent" that keeps running AI search automatically, i.e. an "AI intelligence officer" |
| **Knowledge-base (RAG) projects** | Knowledge-base (RAG) projects are generally downstream tasks built on existing information and usually target private knowledge (such as in-house operation manuals, product manuals, or government documents); wiseflow currently does not integrate downstream tasks and targets public information on the internet |
| **Crawler tools** | First of all, wiseflow is a project built on top of crawler tools. As of the current version, wiseflow integrates the excellent open-source crawler project Crawlee, which itself builds on well-known libraries such as beautifulsoup, playwright, and httpx... However, traditional crawler tools are aimed at developers: the developer has to explore the target site's structure manually and work out the xpath of the elements to extract. This not only shuts out ordinary users, it also offers no generality, since every site (including upgraded versions of existing sites) requires the analysis and exploration to be redone. Before LLMs this problem had no solution; the direction wiseflow pursues is to use LLMs to automate the analysis and exploration of target sites, thereby delivering a "general-purpose crawler that ordinary users can use". From this angle, you can simply understand wiseflow as an "AI agent that can operate crawler tools on its own" |
| **AI search** | The main scenario for AI search is **instant Q&A on specific questions**, e.g. "Who is the founder of company XX?" or "Where can I buy product xx of brand xx?"; the user wants **a single answer**. The main scenario for wiseflow is **continuous collection of information on a given topic**, such as tracking information related to company XX or continuously tracking the market behavior of brand XX... In these scenarios the user can provide a focus point (a company, a brand) and even sources (site URLs, etc.), but cannot pose a concrete search question; what the user wants is **a stream of related information** |
| **Knowledge-base (RAG) projects** | Knowledge-base (RAG) projects are generally downstream tasks built on existing information and usually target private knowledge (such as in-house operation manuals, product manuals, or government documents); wiseflow currently does not integrate downstream tasks and targets public information on the internet. Seen from the "agent" angle, the two are agents built for different purposes: RAG projects are "(internal) knowledge assistant agents", whereas wiseflow is an "(external) information collection agent" |
## 🔄 V0.3.1 Update

View File

@ -5,12 +5,9 @@ services:
image: wiseflow:latest
tty: true
stdin_open: true
entrypoint: bash docker_entrypoint.sh
env_file:
- .env
entrypoint: ["bash", "/app/run_all.sh"]
ports:
- 8090:8090
- 8077:8077
volumes:
- ./${PROJECT_DIR}/pb_data:/app/pb/pb_data
- ./${PROJECT_DIR}:/app/${PROJECT_DIR}
- ./core:/app

0
core/agents/__init__.py Normal file
View File

195
core/agents/get_info.py Normal file
View File

@ -0,0 +1,195 @@
from core.llms.openai_wrapper import openai_llm as llm
# from core.llms.siliconflow_wrapper import sfa_llm
import re
from core.utils.general_utils import is_chinese, extract_and_convert_dates, extract_urls
from loguru import logger
from core.utils.pb_api import PbTalker
import os
from datetime import datetime, date
from urllib.parse import urlparse
import json_repair
class GeneralInfoExtractor:
def __init__(self, pb: PbTalker, _logger: logger) -> None:
self.pb = pb
self.logger = _logger
self.model = os.environ.get("PRIMARY_MODEL", "Qwen/Qwen2.5-7B-Instruct") # better to use "Qwen/Qwen2.5-14B-Instruct"
self.secondary_model = os.environ.get("SECONDARY_MODEL", "THUDM/glm-4-9b-chat")
# collect the focus points the user has set in the pb database and determine the system prompt language based on them
focus_data = pb.read(collection_name='focus_points', filter='activated=True')
if not focus_data:
    self.logger.info('no activated focus point found, will ask user to create one')
    focus = input('It seems you have not set any focus point. WiseFlow needs a specific focus point to guide the info-extraction job, '
                  'so please input one now. Briefly describe what info you care about: ')
    explanation = input('Please provide more explanation for the focus point (if not necessary, just press Enter): ')
    focus_data.append({"focuspoint": focus, "explanation": explanation,
                       "id": pb.add('focus_points', {"focuspoint": focus, "explanation": explanation})})
self.focus_list = [item["focuspoint"] for item in focus_data]
self.focus_dict = {item["focuspoint"]: item["id"] for item in focus_data}
focus_statement = ''
for item in focus_data:
    tag = item["focuspoint"]
    expl = item.get("explanation", '')
    focus_statement = f"{focus_statement}#{tag}\n"
    if expl:
        focus_statement = f"{focus_statement}解释:{expl}\n"
if is_chinese(focus_statement):
self.get_info_prompt = f'''作为信息提取助手,你的任务是从给定的网页文本中提取与以下用户兴趣点相关的内容。兴趣点列表及其解释如下:
{focus_statement}\n
在进行信息提取时请遵循以下原则
- 理解每个兴趣点的含义确保提取的内容与之相关
- 如果兴趣点有进一步的解释确保提取的内容符合这些解释的范围
- 忠于原文你的任务是从网页文本中识别和提取与各个兴趣点相关的信息并不是总结和提炼
- 不管给定的原文是何种语言请保证使用中文输出你的提取结果
另外请注意给定的网页文本是通过爬虫程序从html代码中提取出来的所以请忽略里面不必要的空格换行符等'''
self.get_info_suffix = '''如果上述网页文本中包含兴趣点相关的内容请按照以下json格式输出提取的信息文本中可能包含多条有用信息请不要遗漏
[{"focus": 兴趣点名称, "content": 提取的内容}]
示例
[{"focus": "旅游景点", "content": "北京故宫地址北京市东城区景山前街4号开放时间8:30-17:00"}, {"focus": "美食推荐", "content": "来王府井小吃街必吃北京烤鸭、炸酱面"}]
如果网页文本中不包含任何与兴趣点相关的信息请仅输出[]'''
self.get_more_link_prompt = f"作为一位高效的信息筛选助手你的任务是根据给定的兴趣点从给定的文本及其对应的URL中挑选出最值得关注的URL。兴趣点及其解释如下\n\n{focus_statement}"
self.get_more_link_suffix = "请逐条分析上述 文本url 对。首先输出你的分析依据,然后给出是否挑选它的结论,如果决定挑选该条,在结论后复制输出该条的 url否则的话直接进入下一条的分析。请一条一条的分析不要漏掉任何一条。"
else:
self.get_info_prompt = f'''As an information extraction assistant, your task is to extract content related to the following user focus points from the given web page text. The list of focus points and their explanations is as follows:
{focus_statement}\n
When extracting information, please follow the principles below:
- Understand the meaning of each focus point and ensure that the extracted content is relevant to it.
- If a focus point has further explanations, ensure that the extracted content conforms to the scope of these explanations.
- Stay true to the original text; your task is to identify and extract information related to each focus point from the web page text, not to summarize or refine it.
Please note that the given web page text is extracted from HTML code via a crawler, so please ignore any unnecessary spaces, line breaks, etc.'''
self.get_info_suffix = '''If the above webpage text contains content related to points of interest, please output the extracted information in the following JSON format (the text may contain multiple useful pieces of information, do not miss any):
[{"focus": "Point of Interest Name", "content": "Extracted Content"}]
Example:
[{"focus": "Tourist Attraction", "content": "The Forbidden City, Beijing, Address: No. 4 Jingshan Front Street, Dongcheng District, Opening Hours: 8:30-17:00"}, {"focus": "Food Recommendation", "content": "Must-try at Wangfujing Snack Street: Beijing Roast Duck, Noodles with Soybean Paste"}]
If the webpage text does not contain any information related to points of interest, please output only: []'''
self.get_more_link_prompt = f"As an efficient information filtering assistant, your task is to select the most noteworthy URLs from a set of texts and their corresponding URLs based on the given focus points. The focus points and their explanations are as follows:\n\n{focus_statement}"
self.get_more_link_suffix = "Please analyze the above text: URL pairs. First, output your analysis basis, and then give the conclusion on whether to select it. If you decide to select this item, then copy and output the URL of this item following the conclusion; otherwise, proceed directly to the analysis of the next item. Analyze one by one, do not miss any one."
async def get_author_and_publish_date(self, text: str) -> tuple[str, str]:
system_prompt = "As an information extraction assistant, your task is to accurately extract the source (or author) and publication date from the given webpage text. It is important to adhere to extracting the information directly from the original text. If the original text does not contain a particular piece of information, please replace it with NA"
suffix = '''Please output the extracted information in the following JSON format:
{"source": source or article author (use "NA" if this information cannot be extracted), "publish_date": extracted publication date (keep only the year, month, and day; use "NA" if this information cannot be extracted)}'''
content = f'<text>\n{text}\n</text>\n\n{suffix}'
llm_output = await llm([{'role': 'system', 'content': system_prompt}, {'role': 'user', 'content': content}],
model=self.secondary_model, max_tokens=50, temperature=0.1, response_format={"type": "json_object"})
self.logger.debug(f'get_author_and_publish_date llm output:\n{llm_output}')
if not llm_output:
return '', ''
result = json_repair.repair_json(llm_output, return_objects=True)
self.logger.debug(f"decoded_object: {result}")
if not isinstance(result, dict):
self.logger.debug("failed to parse from llm output")
return '', ''
if 'source' not in result or 'publish_date' not in result:
self.logger.debug("failed to parse from llm output")
return '', ''
return result['source'], result['publish_date']
async def get_more_related_urls(self, link_dict: dict) -> set[str]:
if not link_dict:
return set()
content = ''
for key, value in link_dict.items():
content = f"{content}{key}: {value}\n"
result = await llm([{'role': 'system', 'content': self.get_more_link_prompt}, {'role': 'user', 'content': f'{content}\n{self.get_more_link_suffix}'}],
model=self.secondary_model, temperature=0.1)
self.logger.debug(f'get_more_related_urls llm output:\n{result}')
urls = extract_urls(result)
raw_urls = set(link_dict.values())
# drop urls the model hallucinated, i.e. urls that are not present in the original link_dict
hallucinated = {url for url in urls if url not in raw_urls}
for url in hallucinated:
    self.logger.debug(f"{url} not in link_dict, it's a model hallucination")
return set(urls) - hallucinated
async def get_info(self, text: str, domain: str) -> list[dict]:
# logger.debug(f'receive new article_content:\n{article_content}')
content = f'<text>\n{text}\n</text>\n\n{self.get_info_suffix}'
result = await llm([{'role': 'system', 'content': self.get_info_prompt}, {'role': 'user', 'content': content}],
model=self.model, temperature=0.1, response_format={"type": "json_object"})
self.logger.debug(f'get_info llm output:\n{result}')
if not result:
    return []
# parse the model output the same way get_author_and_publish_date does
extracted = json_repair.repair_json(result, return_objects=True)
if not isinstance(extracted, list):
    self.logger.debug("failed to parse from llm output")
    return []
# keep only well-formed items whose focus point the user actually configured,
# and prefix each piece of content with its source domain
return [{'tag': self.focus_dict[item['focus']], 'content': f"[from {domain}] {item['content']}"}
        for item in extracted
        if isinstance(item, dict) and item.get('focus') in self.focus_dict and item.get('content')]
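For orientation, a minimal driver for the class above might look like the following sketch. It assumes the environment variables referenced above (LLM_API_BASE / LLM_API_KEY, PRIMARY_MODEL, SECONDARY_MODEL) are set, that a PocketBase instance with a `focus_points` collection is reachable through `PbTalker`, and that the sample text and domain are placeholders.
```python
# illustrative driver, not part of the commit
import asyncio

from loguru import logger

from core.utils.pb_api import PbTalker
from core.agents.get_info import GeneralInfoExtractor


async def demo():
    pb = PbTalker(logger)  # assumes PocketBase is running and credentials are configured
    extractor = GeneralInfoExtractor(pb, logger)
    sample_text = "2024-12-05 Example Corp announced a new product line at its Beijing launch event."
    author, publish_date = await extractor.get_author_and_publish_date(sample_text)
    infos = await extractor.get_info(sample_text, domain="www.example.com")
    print(author, publish_date, infos)


if __name__ == "__main__":
    asyncio.run(demo())
```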

0
core/agents/insights.py Normal file
View File

5
core/agents/seeact.py Normal file
View File

@ -0,0 +1,5 @@
# future plan
# inspired by https://github.com/OSU-NLP-Group/SeeAct
# use a visual-llm to extract the main content and determine next action
# input a playwright page object

View File

@ -1,45 +0,0 @@
from fastapi import FastAPI, BackgroundTasks
from pydantic import BaseModel
from typing import Literal, Optional
from fastapi.middleware.cors import CORSMiddleware
from insights import message_manager
class Request(BaseModel):
"""
Input model
input = {'user_id': str, 'type': str, 'content': str, 'addition': Optional[str]}
Type is one of "text", "publicMsg", "site" and "url"
"""
user_id: str
type: Literal["text", "publicMsg", "file", "image", "video", "location", "chathistory", "site", "attachment", "url"]
content: str
addition: Optional[str] = None
app = FastAPI(
title="WiseFlow Union Backend",
description="From Wiseflow Team.",
version="0.3.1",
openapi_url="/openapi.json"
)
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
@app.get("/")
def read_root():
msg = "Hello, this is Wise Union Backend, version 0.3.1"
return {"msg": msg}
@app.post("/feed")
async def call_to_feed(background_tasks: BackgroundTasks, request: Request):
background_tasks.add_task(message_manager, _input=request.model_dump())
return {"msg": "received well"}

View File

@ -1,4 +0,0 @@
#!/bin/bash
exec pb/pocketbase serve --http=0.0.0.0:8090 &
exec python tasks.py &
exec uvicorn backend:app --reload --host 0.0.0.0 --port 8077

View File

@ -1,162 +0,0 @@
# -*- coding: utf-8 -*-
from scrapers.general_crawler import general_crawler
from utils.general_utils import extract_urls, compare_phrase_with_list
from .get_info import get_info, pb, project_dir, logger, info_rewrite
import os
import json
from datetime import datetime, timedelta
import re
import asyncio
from typing import Dict
# The XML parsing scheme is not used because there are abnormal characters in the XML code extracted from the weixin public_msg
item_pattern = re.compile(r'<item>(.*?)</item>', re.DOTALL)
url_pattern = re.compile(r'<url><!\[CDATA\[(.*?)]]></url>')
summary_pattern = re.compile(r'<summary><!\[CDATA\[(.*?)]]></summary>', re.DOTALL)
extensions = ('.pdf', '.docx', '.xlsx', '.doc', '.ppt', '.pptx', '.xls', '.txt', '.jpg', '.jpeg', '.png', '.gif', '.bmp',
'.tiff', '.mp4', '.avi', '.wmv', '.mkv', '.flv', '.wav', '.mp3', '.avi', '.mov', '.wmv', '.mpeg', '.mpg',
'.3gp', '.ogg', '.webm', '.m4a', '.aac', '.flac', '.wma', '.amr', '.ogg', '.m4v', '.m3u8', '.m3u', '.ts',
'.mts')
expiration_days = 3
existing_urls = {url['url'] for url in pb.read(collection_name='articles', fields=['url']) if url['url']}
async def pipeline(url: str, cache: Dict[str, str] = {}):
working_list = {url}
while working_list:
url = working_list.pop()
existing_urls.add(url)
if any(url.endswith(ext) for ext in extensions):
logger.info(f"{url} is a file, skip")
continue
logger.debug(f"start processing {url}")
# get article process
flag, result = await general_crawler(url, logger)
if flag == 1:
logger.info('get new url list, add to work list')
new_urls = result - existing_urls
working_list.update(new_urls)
continue
elif flag <= 0:
logger.error("got article failed, pipeline abort")
continue
expiration = datetime.now() - timedelta(days=expiration_days)
expiration_date = expiration.strftime('%Y-%m-%d')
article_date = int(result['publish_time'])
if article_date < int(expiration_date.replace('-', '')):
logger.info(f"publish date is {article_date}, too old, skip")
continue
for k, v in cache.items():
if v:
result[k] = v
# get info process
logger.debug(f"article: {result['title']}")
article_id = pb.add(collection_name='articles', body=result)
if not article_id:
logger.error('add article failed, writing to cache_file')
with open(os.path.join(project_dir, 'cache_articles.json'), 'a', encoding='utf-8') as f:
json.dump(result, f, ensure_ascii=False, indent=4)
continue
insights = get_info(f"title: {result['title']}\n\ncontent: {result['content']}")
if not insights:
continue
# post process
article_tags = set()
old_insights = pb.read(collection_name='insights', filter=f"updated>'{expiration_date}'",
fields=['id', 'tag', 'content', 'articles'])
for insight in insights:
article_tags.add(insight['tag'])
insight['articles'] = [article_id]
old_insight_dict = {i['content']: i for i in old_insights if i['tag'] == insight['tag']}
# what we want to know is whether the extracted information phrases are talking about the same thing;
# computing similarity with a vector model may be unsuitable and too heavy for that,
# so a simplified solution is used here: jieba is used to check whether the overlap between the two phrases exceeds a threshold (0.65 below).
similar_insights = compare_phrase_with_list(insight['content'], list(old_insight_dict.keys()), 0.65)
if similar_insights:
to_rewrite = similar_insights + [insight['content']]
new_info_content = info_rewrite(to_rewrite)
if not new_info_content:
continue
insight['content'] = new_info_content
# Merge related articles and delete old insights
for old_insight in similar_insights:
insight['articles'].extend(old_insight_dict[old_insight]['articles'])
if not pb.delete(collection_name='insights', id=old_insight_dict[old_insight]['id']):
logger.error('delete insight failed')
old_insights.remove(old_insight_dict[old_insight])
insight['id'] = pb.add(collection_name='insights', body=insight)
if not insight['id']:
logger.error('add insight failed, writing to cache_file')
with open(os.path.join(project_dir, 'cache_insights.json'), 'a', encoding='utf-8') as f:
json.dump(insight, f, ensure_ascii=False, indent=4)
_ = pb.update(collection_name='articles', id=article_id, body={'tag': list(article_tags)})
if not _:
logger.error(f'update article failed - article_id: {article_id}')
result['tag'] = list(article_tags)
with open(os.path.join(project_dir, 'cache_articles.json'), 'a', encoding='utf-8') as f:
json.dump(result, f, ensure_ascii=False, indent=4)
async def message_manager(_input: dict):
source = _input['user_id']
logger.debug(f"received new task, user: {source}, Addition info: {_input['addition']}")
if _input['type'] == 'publicMsg':
items = item_pattern.findall(_input["content"])
# Iterate through all <item> blocks, extracting <url> and <summary>
for item in items:
url_match = url_pattern.search(item)
url = url_match.group(1) if url_match else None
if not url:
logger.warning(f"can not find url in \n{item}")
continue
# URL processing, http is replaced by https, and the part after chksm is removed.
url = url.replace('http://', 'https://')
cut_off_point = url.find('chksm=')
if cut_off_point != -1:
url = url[:cut_off_point-1]
if url in existing_urls:
logger.debug(f"{url} has been crawled, skip")
continue
summary_match = summary_pattern.search(item)
summary = summary_match.group(1) if summary_match else None
cache = {'source': source, 'abstract': summary}
await pipeline(url, cache)
elif _input['type'] == 'text':
urls = extract_urls(_input['content'])
if not urls:
logger.debug(f"can not find any url in\n{_input['content']}\npass...")
# todo get info from text process
return
await asyncio.gather(*[pipeline(url) for url in urls if url not in existing_urls])
elif _input['type'] == 'url':
# this is reserved for the WeChat shared mp_article_card
item = re.search(r'<url>(.*?)&amp;chksm=', _input["content"], re.DOTALL)
if not item:
logger.debug("shareUrlOpen not find")
item = re.search(r'<shareUrlOriginal>(.*?)&amp;chksm=', _input["content"], re.DOTALL)
if not item:
logger.debug("shareUrlOriginal not find")
item = re.search(r'<shareUrlOpen>(.*?)&amp;chksm=', _input["content"], re.DOTALL)
if not item:
logger.warning(f"cannot find url in \n{_input['content']}")
return
extract_url = item.group(1).replace('amp;', '')
summary_match = re.search(r'<des>(.*?)</des>', _input["content"], re.DOTALL)
summary = summary_match.group(1) if summary_match else None
cache = {'source': source, 'abstract': summary}
await pipeline(extract_url, cache)
else:
return

View File

@ -1,151 +0,0 @@
from llms.openai_wrapper import openai_llm
# from llms.siliconflow_wrapper import sfa_llm
import re
from utils.general_utils import get_logger_level, is_chinese
from loguru import logger
from utils.pb_api import PbTalker
import os
get_info_model = os.environ.get("GET_INFO_MODEL", "gpt-4o-mini-2024-07-18")
rewrite_model = os.environ.get("REWRITE_MODEL", "gpt-4o-mini-2024-07-18")
project_dir = os.environ.get("PROJECT_DIR", "")
if project_dir:
os.makedirs(project_dir, exist_ok=True)
logger_file = os.path.join(project_dir, 'wiseflow.log')
dsw_log = get_logger_level()
logger.add(
logger_file,
level=dsw_log,
backtrace=True,
diagnose=True,
rotation="50 MB"
)
pb = PbTalker(logger)
focus_data = pb.read(collection_name='tags', filter=f'activated=True')
if not focus_data:
logger.error('no activated tag found, please set at least one')
exit(1)
focus_list = [item["name"] for item in focus_data if item["name"]]
focus_dict = {item["name"]: item["id"] for item in focus_data if item["name"]}
lang_term = ''.join([f'{item["name"]}{item["explaination"]}' for item in focus_data if item["name"]])
focus_statement = '\n'.join([f'<tag>{item["name"]}</tag>{item["explaination"]}' for item in focus_data if item["name"] and item["explaination"]])
if is_chinese(lang_term):
if focus_statement:
system_prompt = f'''请仔细阅读用户输入的新闻内容,并根据所提供的类型标签列表进行分析。类型标签列表如下:
{focus_list}
各标签的含义如下
{focus_statement}
如果新闻中包含上述任何类型的信息请使用以下格式标记信息的类型标签并提供仅包含时间地点人物和事件的一句话信息摘要
<tag>类型名称</tag>仅包含时间地点人物和事件的一句话信息摘要
务必注意1严格忠于新闻原文不得提供原文中不包含的信息2对于同一事件仅选择一个最贴合的标签不要重复输出3如果新闻中包含多个信息请逐一分析并按一条一行的格式输出如果新闻不涉及任何类型的信息则直接输出'''
else:
system_prompt = f'''请仔细阅读用户输入的新闻内容,并根据所提供的类型标签列表进行分析。类型标签列表如下:
{focus_list}
如果新闻中包含上述任何类型的信息请使用以下格式标记信息的类型标签并提供仅包含时间地点人物和事件的一句话信息摘要
<tag>类型名称</tag>仅包含时间地点人物和事件的一句话信息摘要
务必注意1严格忠于新闻原文不得提供原文中不包含的信息2对于同一事件仅选择一个最贴合的标签不要重复输出3如果新闻中包含多个信息请逐一分析并按一条一行的格式输出如果新闻不涉及任何类型的信息则直接输出'''
rewrite_prompt = '''请综合给到的内容提炼总结为一个新闻摘要。给到的内容会用XML标签分隔。请仅输出总结出的摘要不要输出其他的信息。'''
else:
if focus_statement:
system_prompt = f'''Please carefully read the news content provided by the user and analyze it according to the list of type labels given below:
{focus_list}
The meanings of each label are as follows:
{focus_statement}
If the news contains any information of the aforementioned types, please mark the type label of the information using the following format and provide a one-sentence summary containing only the time, location, people involved, and event:
<tag>TypeLabel</tag>A one-sentence summary containing only the time, location, people involved, and event
Please be sure to: 1. Strictly adhere to the original text and do not provide information not contained in the original; 2. For the same event, choose only one most appropriate label and do not repeat the output; 3. If the news contains multiple pieces of information, analyze them one by one and output them in a one-line-per-item format. If the news does not involve any of the types of information, simply output: None.'''
else:
system_prompt = f'''Please carefully read the news content provided by the user and analyze it according to the list of type labels given below:
{focus_list}
If the news contains any information of the aforementioned types, please mark the type label of the information using the following format and provide a one-sentence summary containing only the time, location, people involved, and event:
<tag>TypeLabel</tag>A one-sentence summary containing only the time, location, people involved, and event
Please be sure to: 1. Strictly adhere to the original text and do not provide information not contained in the original; 2. For the same event, choose only one most appropriate label and do not repeat the output; 3. If the news contains multiple pieces of information, analyze them one by one and output them in a one-line-per-item format. If the news does not involve any of the types of information, simply output: None.'''
rewrite_prompt = "Please synthesize the content provided, which will be segmented by XML tags, into a news summary. Output only the summarized abstract without including any additional information."
def get_info(article_content: str) -> list[dict]:
# logger.debug(f'receive new article_content:\n{article_content}')
result = openai_llm([{'role': 'system', 'content': system_prompt}, {'role': 'user', 'content': article_content}],
model=get_info_model, logger=logger, temperature=0.1)
# results = pattern.findall(result)
texts = result.split('<tag>')
texts = [_.strip() for _ in texts if '</tag>' in _.strip()]
if not texts:
logger.debug(f'can not find info, llm result:\n{result}')
return []
cache = []
for text in texts:
try:
strings = text.split('</tag>')
tag = strings[0]
tag = tag.strip()
if tag not in focus_list:
logger.info(f'tag not in focus_list: {tag}, aborting')
continue
info = strings[1]
info = info.split('\n\n')
info = info[0].strip()
except Exception as e:
logger.info(f'parse error: {e}')
tag = ''
info = ''
if not info or not tag:
logger.info(f'parse failed-{text}')
continue
if len(info) < 7:
logger.info(f'info too short, possible invalid: {info}')
continue
if info.startswith('无相关信息') or info.startswith('该新闻未提及') or info.startswith('未提及'):
logger.info(f'no relevant info: {text}')
continue
while info.endswith('"'):
info = info[:-1]
info = info.strip()
# prepend the source info
sources = re.findall(r'\[from (.*?)]', article_content)
if sources and sources[0]:
info = f"[from {sources[0]}] {info}"
cache.append({'content': info, 'tag': focus_dict[tag]})
return cache
def info_rewrite(contents: list[str]) -> str:
context = f"<content>{'</content><content>'.join(contents)}</content>"
try:
result = openai_llm([{'role': 'system', 'content': rewrite_prompt}, {'role': 'user', 'content': context}],
model=rewrite_model, temperature=0.1, logger=logger)
return result.strip()
except Exception as e:
if logger:
logger.warning(f'rewrite process llm generate failed: {e}')
else:
print(f'rewrite process llm generate failed: {e}')
return ''

View File

@ -1,7 +1,7 @@
import os
from openai import OpenAI
from openai import RateLimitError
import time
import asyncio
base_url = os.environ.get('LLM_API_BASE', "")
@ -10,34 +10,36 @@ token = os.environ.get('LLM_API_KEY', "")
if not base_url and not token:
raise ValueError("LLM_API_BASE or LLM_API_KEY must be set")
elif base_url and not token:
client = OpenAI(base_url=base_url)
client = OpenAI(base_url=base_url, api_key="not_use")
elif not base_url and token:
client = OpenAI(api_key=token)
else:
client = OpenAI(api_key=token, base_url=base_url)
llm_lock = asyncio.Lock()
def openai_llm(messages: list, model: str, logger=None, **kwargs) -> str:
async def openai_llm(messages: list, model: str, logger=None, **kwargs) -> str:
if logger:
logger.debug(f'messages:\n {messages}')
logger.debug(f'model: {model}')
logger.debug(f'kwargs:\n {kwargs}')
try:
response = client.chat.completions.create(messages=messages, model=model, **kwargs)
except RateLimitError as e:
logger.warning(f'{e}\nRetrying in 60 second...')
time.sleep(60)
response = client.chat.completions.create(messages=messages, model=model, **kwargs)
if response and response.choices:
return response.choices[0].message.content
else:
logger.error(f'after many try, llm error: {response}')
return ""
except Exception as e:
if logger:
logger.error(f'openai_llm error: {e}')
return ''
async with llm_lock:
try:
response = client.chat.completions.create(messages=messages, model=model, **kwargs)
except RateLimitError as e:
logger.warning(f'{e}\nRetrying in 60 second...')
await asyncio.sleep(60)
response = client.chat.completions.create(messages=messages, model=model, **kwargs)
if response and response.choices:
return response.choices[0].message.content
else:
logger.error(f'after many try, llm error: {response}')
return ""
except Exception as e:
if logger:
logger.error(f'openai_llm error: {e}')
return ''
if logger:
logger.debug(f'result:\n {response.choices[0]}')
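Since every call now goes through the module-level `llm_lock`, concurrent coroutines are serialized before reaching the provider. A small usage sketch follows; the model names are the defaults referenced elsewhere in this commit, and it assumes `LLM_API_BASE` / `LLM_API_KEY` are configured.
```python
import asyncio

from core.llms.openai_wrapper import openai_llm


async def main():
    messages = [{'role': 'user', 'content': 'say hi'}]
    # the asyncio.Lock inside openai_llm serializes these two calls,
    # so the provider only ever sees one in-flight request from this process
    results = await asyncio.gather(
        openai_llm(messages, model="Qwen/Qwen2.5-7B-Instruct", temperature=0.1),
        openai_llm(messages, model="THUDM/glm-4-9b-chat", temperature=0.1),
    )
    print(results)


if __name__ == "__main__":
    asyncio.run(main())
```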

View File

@ -1,74 +0,0 @@
routerAdd(
"POST",
"/save",
(c) => {
const data = $apis.requestInfo(c).data
// console.log(data)
let dir = $os.getenv("PROJECT_DIR")
if (dir) {
dir = dir + "/"
}
// console.log(dir)
const collection = $app.dao().findCollectionByNameOrId("documents")
const record = new Record(collection)
const form = new RecordUpsertForm($app, record)
// or form.loadRequest(request, "")
form.loadData({
workflow: data.workflow,
insight: data.insight,
task: data.task,
})
// console.log(dir + data.file)
const f1 = $filesystem.fileFromPath(dir + data.file)
form.addFiles("files", f1)
form.submit()
return c.json(200, record)
},
$apis.requireRecordAuth()
)
routerAdd(
"GET",
"/insight_dates",
(c) => {
let result = arrayOf(
new DynamicModel({
created: "",
})
)
$app.dao().db().newQuery("SELECT DISTINCT DATE(created) as created FROM insights").all(result)
return c.json(
200,
result.map((r) => r.created)
)
},
$apis.requireAdminAuth()
)
routerAdd(
"GET",
"/article_dates",
(c) => {
let result = arrayOf(
new DynamicModel({
created: "",
})
)
$app.dao().db().newQuery("SELECT DISTINCT DATE(created) as created FROM articles").all(result)
return c.json(
200,
result.map((r) => r.created)
)
},
$apis.requireAdminAuth()
)

View File

@ -1,55 +0,0 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const collection = new Collection({
"id": "bc3g5s66bcq1qjp",
"created": "2024-04-07 00:31:40.644Z",
"updated": "2024-04-07 00:31:40.644Z",
"name": "article_translation",
"type": "base",
"system": false,
"schema": [
{
"system": false,
"id": "t2jqr7cs",
"name": "title",
"type": "text",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
},
{
"system": false,
"id": "dr9kt3dn",
"name": "abstract",
"type": "text",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
}
],
"indexes": [],
"listRule": null,
"viewRule": null,
"createRule": null,
"updateRule": null,
"deleteRule": null,
"options": {}
});
return Dao(db).saveCollection(collection);
}, (db) => {
const dao = new Dao(db);
const collection = dao.findCollectionByNameOrId("bc3g5s66bcq1qjp");
return dao.deleteCollection(collection);
})

View File

@ -1,154 +0,0 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const collection = new Collection({
"id": "lft7642skuqmry7",
"created": "2024-04-07 00:33:32.746Z",
"updated": "2024-04-07 00:33:32.746Z",
"name": "articles",
"type": "base",
"system": false,
"schema": [
{
"system": false,
"id": "yttga2xi",
"name": "title",
"type": "text",
"required": true,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
},
{
"system": false,
"id": "99dnnabt",
"name": "url",
"type": "url",
"required": true,
"presentable": false,
"unique": false,
"options": {
"exceptDomains": [],
"onlyDomains": []
}
},
{
"system": false,
"id": "itplfdwh",
"name": "abstract",
"type": "text",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
},
{
"system": false,
"id": "iorna912",
"name": "content",
"type": "text",
"required": true,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
},
{
"system": false,
"id": "judmyhfm",
"name": "publish_time",
"type": "number",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"noDecimal": false
}
},
{
"system": false,
"id": "um6thjt5",
"name": "author",
"type": "text",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
},
{
"system": false,
"id": "kvzodbm3",
"name": "images",
"type": "json",
"required": false,
"presentable": false,
"unique": false,
"options": {
"maxSize": 2000000
}
},
{
"system": false,
"id": "eviha2ho",
"name": "snapshot",
"type": "file",
"required": false,
"presentable": false,
"unique": false,
"options": {
"mimeTypes": [],
"thumbs": [],
"maxSelect": 1,
"maxSize": 5242880,
"protected": false
}
},
{
"system": false,
"id": "tukuros5",
"name": "translation_result",
"type": "relation",
"required": false,
"presentable": false,
"unique": false,
"options": {
"collectionId": "bc3g5s66bcq1qjp",
"cascadeDelete": false,
"minSelect": null,
"maxSelect": 1,
"displayFields": null
}
}
],
"indexes": [],
"listRule": null,
"viewRule": null,
"createRule": null,
"updateRule": null,
"deleteRule": null,
"options": {}
});
return Dao(db).saveCollection(collection);
}, (db) => {
const dao = new Dao(db);
const collection = dao.findCollectionByNameOrId("lft7642skuqmry7");
return dao.deleteCollection(collection);
})

View File

@ -1,52 +0,0 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("bc3g5s66bcq1qjp")
// add
collection.schema.addField(new SchemaField({
"system": false,
"id": "tmwf6icx",
"name": "raw",
"type": "relation",
"required": false,
"presentable": false,
"unique": false,
"options": {
"collectionId": "lft7642skuqmry7",
"cascadeDelete": false,
"minSelect": null,
"maxSelect": 1,
"displayFields": null
}
}))
// add
collection.schema.addField(new SchemaField({
"system": false,
"id": "hsckiykq",
"name": "content",
"type": "text",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
}))
return dao.saveCollection(collection)
}, (db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("bc3g5s66bcq1qjp")
// remove
collection.schema.removeField("tmwf6icx")
// remove
collection.schema.removeField("hsckiykq")
return dao.saveCollection(collection)
})

View File

@ -1,73 +0,0 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const collection = new Collection({
"id": "h3c6pqhnrfo4oyf",
"created": "2024-04-07 00:40:42.781Z",
"updated": "2024-04-07 00:40:42.781Z",
"name": "insights",
"type": "base",
"system": false,
"schema": [
{
"system": false,
"id": "5hp4ulnc",
"name": "content",
"type": "text",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
},
{
"system": false,
"id": "gsozubhx",
"name": "articles",
"type": "relation",
"required": false,
"presentable": false,
"unique": false,
"options": {
"collectionId": "lft7642skuqmry7",
"cascadeDelete": false,
"minSelect": null,
"maxSelect": null,
"displayFields": null
}
},
{
"system": false,
"id": "iiwkyzr2",
"name": "docx",
"type": "file",
"required": false,
"presentable": false,
"unique": false,
"options": {
"mimeTypes": [],
"thumbs": [],
"maxSelect": 1,
"maxSize": 5242880,
"protected": false
}
}
],
"indexes": [],
"listRule": null,
"viewRule": null,
"createRule": null,
"updateRule": null,
"deleteRule": null,
"options": {}
});
return Dao(db).saveCollection(collection);
}, (db) => {
const dao = new Dao(db);
const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf");
return dao.deleteCollection(collection);
})

View File

@ -1,54 +0,0 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const collection = new Collection({
"id": "sma08jpi5rkoxnh",
"created": "2024-04-17 02:52:04.291Z",
"updated": "2024-04-17 02:52:04.291Z",
"name": "sites",
"type": "base",
"system": false,
"schema": [
{
"system": false,
"id": "6qo4l7og",
"name": "url",
"type": "url",
"required": false,
"presentable": false,
"unique": false,
"options": {
"exceptDomains": null,
"onlyDomains": null
}
},
{
"system": false,
"id": "lgr1quwi",
"name": "per_hours",
"type": "number",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": 1,
"max": 24,
"noDecimal": false
}
}
],
"indexes": [],
"listRule": null,
"viewRule": null,
"createRule": null,
"updateRule": null,
"deleteRule": null,
"options": {}
});
return Dao(db).saveCollection(collection);
}, (db) => {
const dao = new Dao(db);
const collection = dao.findCollectionByNameOrId("sma08jpi5rkoxnh");
return dao.deleteCollection(collection);
})

View File

@ -1,74 +0,0 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("sma08jpi5rkoxnh")
// update
collection.schema.addField(new SchemaField({
"system": false,
"id": "6qo4l7og",
"name": "url",
"type": "url",
"required": true,
"presentable": false,
"unique": false,
"options": {
"exceptDomains": null,
"onlyDomains": null
}
}))
// update
collection.schema.addField(new SchemaField({
"system": false,
"id": "lgr1quwi",
"name": "per_hours",
"type": "number",
"required": true,
"presentable": false,
"unique": false,
"options": {
"min": 1,
"max": 24,
"noDecimal": false
}
}))
return dao.saveCollection(collection)
}, (db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("sma08jpi5rkoxnh")
// update
collection.schema.addField(new SchemaField({
"system": false,
"id": "6qo4l7og",
"name": "url",
"type": "url",
"required": false,
"presentable": false,
"unique": false,
"options": {
"exceptDomains": null,
"onlyDomains": null
}
}))
// update
collection.schema.addField(new SchemaField({
"system": false,
"id": "lgr1quwi",
"name": "per_hours",
"type": "number",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": 1,
"max": 24,
"noDecimal": false
}
}))
return dao.saveCollection(collection)
})

View File

@ -1,27 +0,0 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("sma08jpi5rkoxnh")
// add
collection.schema.addField(new SchemaField({
"system": false,
"id": "8x8n2a47",
"name": "activated",
"type": "bool",
"required": false,
"presentable": false,
"unique": false,
"options": {}
}))
return dao.saveCollection(collection)
}, (db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("sma08jpi5rkoxnh")
// remove
collection.schema.removeField("8x8n2a47")
return dao.saveCollection(collection)
})

View File

@ -1,44 +0,0 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("lft7642skuqmry7")
// update
collection.schema.addField(new SchemaField({
"system": false,
"id": "iorna912",
"name": "content",
"type": "text",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
}))
return dao.saveCollection(collection)
}, (db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("lft7642skuqmry7")
// update
collection.schema.addField(new SchemaField({
"system": false,
"id": "iorna912",
"name": "content",
"type": "text",
"required": true,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
}))
return dao.saveCollection(collection)
})

View File

@ -1,31 +0,0 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf")
// add
collection.schema.addField(new SchemaField({
"system": false,
"id": "d13734ez",
"name": "tag",
"type": "text",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
}))
return dao.saveCollection(collection)
}, (db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf")
// remove
collection.schema.removeField("d13734ez")
return dao.saveCollection(collection)
})

View File

@ -1,31 +0,0 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("lft7642skuqmry7")
// add
collection.schema.addField(new SchemaField({
"system": false,
"id": "pwy2iz0b",
"name": "source",
"type": "text",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
}))
return dao.saveCollection(collection)
}, (db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("lft7642skuqmry7")
// remove
collection.schema.removeField("pwy2iz0b")
return dao.saveCollection(collection)
})

View File

@ -1,51 +0,0 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const collection = new Collection({
"id": "nvf6k0yoiclmytu",
"created": "2024-05-16 01:36:01.108Z",
"updated": "2024-05-16 01:36:01.108Z",
"name": "tags",
"type": "base",
"system": false,
"schema": [
{
"system": false,
"id": "0th8uax4",
"name": "name",
"type": "text",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
},
{
"system": false,
"id": "l6mm7m90",
"name": "activated",
"type": "bool",
"required": false,
"presentable": false,
"unique": false,
"options": {}
}
],
"indexes": [],
"listRule": null,
"viewRule": null,
"createRule": null,
"updateRule": null,
"deleteRule": null,
"options": {}
});
return Dao(db).saveCollection(collection);
}, (db) => {
const dao = new Dao(db);
const collection = dao.findCollectionByNameOrId("nvf6k0yoiclmytu");
return dao.deleteCollection(collection);
})

View File

@ -1,52 +0,0 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf")
// remove
collection.schema.removeField("d13734ez")
// add
collection.schema.addField(new SchemaField({
"system": false,
"id": "j65p3jji",
"name": "tag",
"type": "relation",
"required": false,
"presentable": false,
"unique": false,
"options": {
"collectionId": "nvf6k0yoiclmytu",
"cascadeDelete": false,
"minSelect": null,
"maxSelect": null,
"displayFields": null
}
}))
return dao.saveCollection(collection)
}, (db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf")
// add
collection.schema.addField(new SchemaField({
"system": false,
"id": "d13734ez",
"name": "tag",
"type": "text",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
}))
// remove
collection.schema.removeField("j65p3jji")
return dao.saveCollection(collection)
})

View File

@ -1,16 +0,0 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf")
collection.listRule = "@request.auth.id != \"\" && @request.auth.tag:each ?~ tag:each"
return dao.saveCollection(collection)
}, (db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf")
collection.listRule = null
return dao.saveCollection(collection)
})

View File

@ -1,16 +0,0 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf")
collection.viewRule = "@request.auth.id != \"\" && @request.auth.tag:each ?~ tag:each"
return dao.saveCollection(collection)
}, (db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf")
collection.viewRule = null
return dao.saveCollection(collection)
})

View File

@ -1,33 +0,0 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("_pb_users_auth_")
// add
collection.schema.addField(new SchemaField({
"system": false,
"id": "8d9woe75",
"name": "tag",
"type": "relation",
"required": false,
"presentable": false,
"unique": false,
"options": {
"collectionId": "nvf6k0yoiclmytu",
"cascadeDelete": false,
"minSelect": null,
"maxSelect": null,
"displayFields": null
}
}))
return dao.saveCollection(collection)
}, (db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("_pb_users_auth_")
// remove
collection.schema.removeField("8d9woe75")
return dao.saveCollection(collection)
})

View File

@ -1,33 +0,0 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("lft7642skuqmry7")
// add
collection.schema.addField(new SchemaField({
"system": false,
"id": "famdh2fv",
"name": "tag",
"type": "relation",
"required": false,
"presentable": false,
"unique": false,
"options": {
"collectionId": "nvf6k0yoiclmytu",
"cascadeDelete": false,
"minSelect": null,
"maxSelect": null,
"displayFields": null
}
}))
return dao.saveCollection(collection)
}, (db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("lft7642skuqmry7")
// remove
collection.schema.removeField("famdh2fv")
return dao.saveCollection(collection)
})

View File

@ -1,18 +0,0 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("lft7642skuqmry7")
collection.listRule = "@request.auth.id != \"\" && @request.auth.tag:each ?~ tag:each"
collection.viewRule = "@request.auth.id != \"\" && @request.auth.tag:each ?~ tag:each"
return dao.saveCollection(collection)
}, (db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("lft7642skuqmry7")
collection.listRule = null
collection.viewRule = null
return dao.saveCollection(collection)
})

View File

@ -1,33 +0,0 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("bc3g5s66bcq1qjp")
// add
collection.schema.addField(new SchemaField({
"system": false,
"id": "lbxw5pra",
"name": "tag",
"type": "relation",
"required": false,
"presentable": false,
"unique": false,
"options": {
"collectionId": "nvf6k0yoiclmytu",
"cascadeDelete": false,
"minSelect": null,
"maxSelect": null,
"displayFields": null
}
}))
return dao.saveCollection(collection)
}, (db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("bc3g5s66bcq1qjp")
// remove
collection.schema.removeField("lbxw5pra")
return dao.saveCollection(collection)
})

View File

@ -1,18 +0,0 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("bc3g5s66bcq1qjp")
collection.listRule = "@request.auth.id != \"\" && @request.auth.tag:each ?~ tag:each"
collection.viewRule = "@request.auth.id != \"\" && @request.auth.tag:each ?~ tag:each"
return dao.saveCollection(collection)
}, (db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("bc3g5s66bcq1qjp")
collection.listRule = null
collection.viewRule = null
return dao.saveCollection(collection)
})

View File

@ -1,44 +0,0 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("nvf6k0yoiclmytu")
// update
collection.schema.addField(new SchemaField({
"system": false,
"id": "0th8uax4",
"name": "name",
"type": "text",
"required": true,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
}))
return dao.saveCollection(collection)
}, (db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("nvf6k0yoiclmytu")
// update
collection.schema.addField(new SchemaField({
"system": false,
"id": "0th8uax4",
"name": "name",
"type": "text",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
}))
return dao.saveCollection(collection)
})

View File

@ -1,48 +0,0 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf")
// update
collection.schema.addField(new SchemaField({
"system": false,
"id": "j65p3jji",
"name": "tag",
"type": "relation",
"required": false,
"presentable": false,
"unique": false,
"options": {
"collectionId": "nvf6k0yoiclmytu",
"cascadeDelete": false,
"minSelect": null,
"maxSelect": 1,
"displayFields": null
}
}))
return dao.saveCollection(collection)
}, (db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf")
// update
collection.schema.addField(new SchemaField({
"system": false,
"id": "j65p3jji",
"name": "tag",
"type": "relation",
"required": false,
"presentable": false,
"unique": false,
"options": {
"collectionId": "nvf6k0yoiclmytu",
"cascadeDelete": false,
"minSelect": null,
"maxSelect": null,
"displayFields": null
}
}))
return dao.saveCollection(collection)
})

View File

@ -1,18 +0,0 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("nvf6k0yoiclmytu")
collection.listRule = "@request.auth.id != \"\""
collection.viewRule = "@request.auth.id != \"\""
return dao.saveCollection(collection)
}, (db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("nvf6k0yoiclmytu")
collection.listRule = null
collection.viewRule = null
return dao.saveCollection(collection)
})

View File

@ -1,31 +0,0 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("nvf6k0yoiclmytu")
// add
collection.schema.addField(new SchemaField({
"system": false,
"id": "vkgtujiz",
"name": "explaination",
"type": "text",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
}))
return dao.saveCollection(collection)
}, (db) => {
const dao = new Dao(db)
const collection = dao.findCollectionByNameOrId("nvf6k0yoiclmytu")
// remove
collection.schema.removeField("vkgtujiz")
return dao.saveCollection(collection)
})

View File

@ -1,56 +0,0 @@
We provide a general page parser that can intelligently retrieve article lists from sources. For each article URL, it first attempts to use `gne` for parsing, and if that fails, it will try using `llm`.
This solution allows scanning and extracting information from most general news and portal sources.
**However, we strongly recommend that users develop custom parsers for specific sources tailored to their actual business scenarios for more ideal and efficient scanning.**
We also provide a parser specifically for WeChat public articles (mp.weixin.qq.com).
**If you are willing to contribute your custom source-specific parsers to this repository, we would greatly appreciate it!**
## Custom Source Parser Development Specifications
### Specifications
**Remember: it should be an asynchronous function** (a minimal, hypothetical skeleton is sketched after the list below)
1. **The parser should be able to intelligently distinguish between article list pages and article detail pages.**
2. **The parser's input parameters should only include `url` and `logger`:**
- `url` is the complete address of the source (type `str`).
- `logger` is the logging object (please do not configure a separate logger for your custom source parser).
3. **The parser's output should include `flag` and `result`, formatted as `tuple[int, Union[set, dict]]`:**
- If the `url` is an article list page, `flag` returns `1`, and `result` returns a set of all article page URLs (`set`).
- If the `url` is an article page, `flag` returns `11`, and `result` returns all article details (`dict`), in the following format:
```python
{'url': str, 'title': str, 'author': str, 'publish_time': str, 'content': str, 'abstract': str, 'images': [str]}
```
_Note: `title` and `content` cannot be empty._
**Note: `publish_time` should be in the format `"%Y%m%d"` (date only, no `-`). If the scraper cannot fetch it, use the current date.**
- If parsing fails, `flag` returns `0`, and `result` returns an empty dictionary `{}`.
_`pipeline` will try other parsing solutions (if any) upon receiving `flag` 0._
- If page retrieval fails (e.g., network issues), `flag` returns `-7`, and `result` returns an empty dictionary `{}`.
_`pipeline` will not attempt to parse again in the same process upon receiving `flag` -7._
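To make the contract above concrete, here is a minimal, hypothetical parser skeleton. The function name, the `/article/` URL heuristic, the use of `httpx`, and the regex-based extraction are illustrative assumptions only, not part of the repository; a real parser would use site-specific logic.
```python
import re
from datetime import datetime
from typing import Union

import httpx  # assumed available; any async HTTP client would do


async def example_site_parser(url: str, logger) -> tuple[int, Union[set, dict]]:
    # hypothetical parser for a made-up site, following the flag/result contract above
    try:
        async with httpx.AsyncClient(timeout=30) as client:
            resp = await client.get(url)
            resp.raise_for_status()
    except Exception as e:
        logger.warning(f"fetch failed for {url}: {e}")
        return -7, {}  # page retrieval failed (e.g. network issue)

    html = resp.text
    if '/article/' not in url:
        # assume this is an article list page: collect candidate article links
        article_urls = set(re.findall(r'href="(https?://[^"]+/article/[^"]+)"', html))
        return 1, article_urls

    # assume this is an article detail page
    title_match = re.search(r'<title>(.*?)</title>', html, re.DOTALL)
    content = re.sub(r'<[^>]+>', ' ', html).strip()  # crude text extraction, for the sketch only
    if not title_match or not content:
        return 0, {}  # parsing failed; the pipeline may try other solutions
    return 11, {
        'url': url,
        'title': title_match.group(1).strip(),
        'author': 'NA',
        'publish_time': datetime.now().strftime('%Y%m%d'),  # fall back to today, as the spec allows
        'content': content,
        'abstract': '',
        'images': [],
    }
```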
### Registration
After writing your scraper, place the scraper program in this folder and register the scraper in `scraper_map` under `__init__.py`, similar to:
```python
{'domain': 'crawler def name'}
```
It is recommended to use urllib.parse to get the domain:
```python
from urllib.parse import urlparse
parsed_url = urlparse("site's url")
domain = parsed_url.netloc
```
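Putting the two snippets together, the registration might look like the sketch below; `example_site_parser` is the hypothetical parser from the skeleton above, and the file path in the comment is an assumption for illustration.
```python
# __init__.py of this scrapers folder (illustrative sketch)
from urllib.parse import urlparse

from .example_site_parser import example_site_parser  # hypothetical custom parser

site_url = "https://www.example.com/news"
domain = urlparse(site_url).netloc  # "www.example.com"

scraper_map = {
    domain: example_site_parser,
}
```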

View File

@ -1,56 +0,0 @@
我们提供了一个通用页面解析器,该解析器可以智能获取信源文章列表。对于每个文章 URL会先尝试使用 `gne` 进行解析,如果失败,再尝试使用 `llm` 进行解析。
通过这个方案,可以实现对大多数普通新闻类、门户类信源的扫描和信息提取。
**然而,我们依然强烈建议用户根据实际业务场景编写针对特定信源的专有解析器,以实现更理想且高效的扫描。**
此外我们提供了一个专门针对微信公众号文章mp.weixin.qq.com的解析器。
**如果您愿意将您撰写的特定信源专有解析器贡献至本代码仓库,我们将不胜感激!**
## 专有信源解析器开发规范
### 规范
**记住:这应该是一个异步函数**
1. **解析器应能智能区分文章列表页面和文章详情页面。**
2. **解析器入参只包括 `url``logger` 两项:**
- `url` 是信源完整地址(`str` 类型)
- `logger` 是日志对象(请勿为您的专有信源解析器单独配置 `logger`
3. **解析器出参包括 `flag``result` 两项,格式为 `tuple[int, Union[set, dict]]`**
- 如果 `url` 是文章列表页面,`flag` 返回 `1``result` 返回解析出的全部文章页面 URL 集合(`set`)。
- 如果 `url` 是文章页面,`flag` 返回 `11``result` 返回解析出的全部文章详情(`dict`),格式如下:
```python
{'url': str, 'title': str, 'author': str, 'publish_time': str, 'content': str, 'abstract': str, 'images': [str]}
```
_注意`title` 和 `content` 两项不能为空。_
**注意:`publish_time` 格式为 `"%Y%m%d"`(仅日期,没有 `-`),如果爬虫抓不到可以用当天日期。**
- 如果解析失败,`flag` 返回 `0``result` 返回空字典 `{}`
_`pipeline` 收到 `flag` 0 会尝试其他解析方案如有。_
- 如果页面获取失败(如网络问题),`flag` 返回 `-7``result` 返回空字典 `{}`
_`pipeline` 收到 `flag` -7 同一进程内不会再次尝试解析。_
### 注册
写好爬虫后,将爬虫程序放在该文件夹,并在 `__init__.py` 下的 `scraper_map` 中注册爬虫,类似:
```python
{'domain': 'crawler def name'}
```
建议使用 urllib.parse 获取 domain
```python
from urllib.parse import urlparse
parsed_url = urlparse("site's url")
domain = parsed_url.netloc
```

View File

@ -1,56 +0,0 @@
Wir bieten einen allgemeinen Seitenparser an, der intelligent Artikellisten von Quellen abrufen kann. Für jede Artikel-URL wird zuerst versucht, `gne` zur Analyse zu verwenden. Falls dies fehlschlägt, wird `llm` als Alternative genutzt.
Diese Lösung ermöglicht das Scannen und Extrahieren von Informationen aus den meisten allgemeinen Nachrichtenquellen und Portalen.
**Wir empfehlen jedoch dringend, benutzerdefinierte Parser für spezifische Quellen zu entwickeln, die auf Ihre tatsächlichen Geschäftsszenarien abgestimmt sind, um eine idealere und effizientere Erfassung zu erreichen.**
Wir stellen auch einen speziellen Parser für WeChat-Artikel (mp.weixin.qq.com) bereit.
**Falls Sie bereit sind, Ihre speziell entwickelten Parser für bestimmte Quellen zu diesem Code-Repository beizutragen, wären wir Ihnen sehr dankbar!**
## Entwicklungsspezifikationen für benutzerdefinierte Quellparser
### Spezifikationen
**Denken Sie daran: Es sollte eine asynchrone Funktion sein**
1. **Der Parser sollte in der Lage sein, intelligent zwischen Artikel-Listen-Seiten und Artikel-Detailseiten zu unterscheiden.**
2. **Die Eingabeparameter des Parsers sollten nur `url` und `logger` umfassen:**
- `url` ist die vollständige Adresse der Quelle (Typ `str`).
- `logger` ist das Protokollierungsobjekt (bitte konfigurieren Sie keinen separaten Logger für Ihren benutzerdefinierten Quellparser).
3. **Die Ausgabe des Parsers sollte `flag` und `result` umfassen, im Format `tuple[int, Union[set, dict]]`:**
- Wenn die `url` eine Artikellisten-Seite ist, gibt `flag` `1` zurück, und `result` gibt eine satz aller Artikel-URLs (`set`) zurück.
- Wenn die `url` eine Artikelseite ist, gibt `flag` `11` zurück, und `result` gibt alle Artikeldetails (`dict`) zurück, im folgenden Format:
```python
{'url': str, 'title': str, 'author': str, 'publish_time': str, 'content': str, 'abstract': str, 'images': [str]}
```
_Hinweis: `title` und `content` dürfen nicht leer sein._
**Hinweis: Das `publish_time`-Format muss `"%Y%m%d"` (nur Datum, ohne `-`) sein. Wenn der Scraper es nicht erfassen kann, verwenden Sie das aktuelle Datum.**
- Wenn die Analyse fehlschlägt, gibt `flag` `0` zurück, und `result` gibt ein leeres Wörterbuch `{}` zurück.
_Der `pipeline` versucht andere Analysemethoden (falls vorhanden), wenn `flag` 0 zurückgegeben wird._
- Wenn das Abrufen der Seite fehlschlägt (z. B. aufgrund von Netzwerkproblemen), gibt `flag` `-7` zurück, und `result` gibt ein leeres Wörterbuch `{}` zurück.
_Der `pipeline` wird im gleichen Prozess keine weiteren Versuche zur Analyse unternehmen, wenn `flag` -7 zurückgegeben wird._
### Registrierung
Nach dem Schreiben Ihres Scrapers platzieren Sie das Scraper-Programm in diesem Ordner und registrieren den Scraper in `scraper_map` in `__init__.py`, wie folgt:
```python
{'domain': 'Crawler-Funktionsname'}
```
Es wird empfohlen, urllib.parse zur Ermittlung der domain zu verwenden:
```python
from urllib.parse import urlparse
parsed_url = urlparse("l'URL du site")
domain = parsed_url.netloc
```

View File

@ -1,56 +0,0 @@
We provide a general page parser that can intelligently fetch article lists from information sources. For each article URL, it first tries to parse it with `gne`; if that fails, it falls back to `llm`.
This solution makes it possible to scan and extract information from most general news sources and information portals.
**However, we strongly recommend that users develop custom parsers for specific sources based on their actual business scenarios, in order to achieve more ideal and efficient parsing.**
We also provide a parser specifically designed for WeChat official-account articles (mp.weixin.qq.com).
**If you are willing to contribute parsers you have written for specific sources to this code repository, we would be very grateful!**
## Specifications for Developing Source-Specific Parsers
### Specifications
**Remember: it should be an asynchronous function**
1. **The parser must be able to intelligently distinguish between article list pages and article detail pages.**
2. **The parser's input parameters should only include `url` and `logger`:**
- `url` is the complete address of the source (type `str`).
- `logger` is the logging object (do not configure a separate logger for your specific parser).
3. **The parser's output should include `flag` and `result`, in the format `tuple[int, Union[set, dict]]`:**
- If the URL is an article list page, `flag` returns `1` and `result` returns the set of all article page URLs (`set`).
- If the URL is an article page, `flag` returns `11` and `result` returns all article details (`dict`), in the following format:
```python
{'url': str, 'title': str, 'author': str, 'publish_time': str, 'content': str, 'abstract': str, 'images': [str]}
```
_Note: `title` and `content` must not be empty._
**Note: `publish_time` must be in the format `"%Y%m%d"` (date only, without `-`). If the scraper cannot obtain it, use the current date.**
- If parsing fails, `flag` returns `0` and `result` returns an empty dictionary `{}`.
_The `pipeline` will try other parsing solutions (if available) after receiving `flag` 0._
- If fetching the page fails (for example, a network problem), `flag` returns `-7` and `result` returns an empty dictionary `{}`.
_The `pipeline` will not attempt to parse again within the same process after receiving `flag` -7._
### Registration
After writing your scraper, place the scraper program in this folder and register the scraper in `scraper_map` in `__init__.py`, in a similar way:
```python
{'domain': 'crawler function name'}
```
It is recommended to use urllib.parse to obtain the domain:
```python
from urllib.parse import urlparse
parsed_url = urlparse("the site's URL")
domain = parsed_url.netloc
```

View File

@ -1,56 +0,0 @@
We provide a general page parser that intelligently retrieves article lists from sources. For each article URL, it first tries to parse it with `gne`, and if that fails, it falls back to `llm`.
This solution makes it possible to scan and extract information from most general news sites and portal sites.
**However, to achieve more ideal and efficient scanning, we strongly recommend developing parsers dedicated to specific sources according to your own business scenarios.**
We also provide a parser dedicated to WeChat official-account articles (mp.weixin.qq.com).
**If you are willing to contribute parsers you have developed for specific sources to this repository, we would be very grateful!**
## Development Specifications for Source-Specific Parsers
### Specifications
**Remember: it must be an asynchronous function**
1. **The parser must be able to intelligently distinguish between article list pages and article detail pages.**
2. **The parser's input parameters should only include `url` and `logger`:**
- `url` is the complete address of the source (type `str`)
- `logger` is the logging object (do not configure a dedicated logger)
3. **The parser's output includes `flag` and `result`, in the format `tuple[int, Union[set, dict]]`:**
- If the `url` is an article list page, `flag` returns `1` and `result` returns the set of all article page URLs (`set`).
- If the `url` is an article page, `flag` returns `11` and `result` returns all article details (`dict`), in the following format:
```python
{'url': str, 'title': str, 'author': str, 'publish_time': str, 'content': str, 'abstract': str, 'images': [str]}
```
_Note: `title` and `content` must not be empty._
**Note: `publish_time` must be in the format `"%Y%m%d"` (date only, without `-`). If the scraper cannot obtain it, use the current date.**
- If parsing fails, `flag` returns `0` and `result` returns an empty dictionary `{}`.
_The `pipeline` will try other parsing solutions (if any) after receiving `flag` 0._
- If fetching the page fails (for example, a network problem), `flag` returns `-7` and `result` returns an empty dictionary `{}`.
_The `pipeline` will not attempt to parse again within the same process after receiving `flag` -7._
### Registration
After writing your scraper, place the program in this folder and register the scraper in `scraper_map` in `__init__.py` as follows:
```python
{'domain': 'scraper function name'}
```
It is recommended to use urllib.parse to obtain the domain:
```python
from urllib.parse import urlparse
parsed_url = urlparse("the site's URL")
domain = parsed_url.netloc
```

View File

@ -1,4 +0,0 @@
from .mp_crawler import mp_crawler
scraper_map = {'mp.weixin.qq.com': mp_crawler}

View File

@ -1,228 +0,0 @@
# -*- coding: utf-8 -*-
# When you use this general crawler, remember the following:
# When you receive flag -7, the problem occurred during the HTML fetch process.
# When you receive flag 0, the problem occurred during the content parsing process.
# When you receive flag 1, the input url is probably an article_list page,
# and the result is a set containing the urls of the articles.
# When you receive flag 11, the result is a dict containing the title, content, url, date, and source of the article.
from gne import GeneralNewsExtractor
import httpx
from bs4 import BeautifulSoup
from datetime import datetime
from urllib.parse import urlparse
from llms.openai_wrapper import openai_llm
# from llms.siliconflow_wrapper import sfa_llm
from bs4.element import Comment
from utils.general_utils import extract_and_convert_dates
import asyncio
import json_repair
import os
from typing import Union
from requests.compat import urljoin
from scrapers import scraper_map
model = os.environ.get('HTML_PARSE_MODEL', 'gpt-4o-mini-2024-07-18')
header = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}
extractor = GeneralNewsExtractor()
def tag_visible(element: Comment) -> bool:
if element.parent.name in ["style", "script", "head", "title", "meta", "[document]"]:
return False
if isinstance(element, Comment):
return False
return True
def text_from_soup(soup: BeautifulSoup) -> str:
res = []
texts = soup.find_all(string=True)
visible_texts = filter(tag_visible, texts)
for v in visible_texts:
res.append(v)
text = "\n".join(res)
return text.strip()
sys_info = '''Your task is to operate as an HTML content extractor, focusing on parsing a provided HTML segment. Your objective is to retrieve the following details directly from the raw text within the HTML, without summarizing or altering the content:
- The document's title
- The complete main content, as it appears in the HTML, comprising all textual elements considered part of the core article body
- The publication time in its original format found within the HTML
Ensure your response fits the following JSON structure, accurately reflecting the extracted data without modification:
```json
{
"title": "The Document's Exact Title",
"content": "All the unaltered primary text content from the article",
"publish_time": "Original Publication Time as per HTML"
}
```
It is essential that your output adheres strictly to this format, with each field filled based on the untouched information extracted directly from the HTML source.'''
async def general_crawler(url: str, logger) -> tuple[int, Union[set, dict]]:
"""
Return article information dict and flag, negative number is error, 0 is no result, 1 is for article_list page,
11 is success
main work flow:
(for weixin public account articles, which startswith mp.weixin.qq use mp_crawler)
first get the content with httpx
then judge is article list (return all article url and flag 1) or article detail page
then try to use gne to extract the information
when fail, try to use a llm to analysis the html
"""
# 0. if there's a scraper for this domain, use it (such as mp.weixin.qq.com)
parsed_url = urlparse(url)
domain = parsed_url.netloc
base_url = f"{parsed_url.scheme}://{domain}"
if domain in scraper_map:
return await scraper_map[domain](url, logger)
# 1. get the content with httpx
async with httpx.AsyncClient() as client:
for retry in range(2):
try:
response = await client.get(url, headers=header, timeout=30)
response.raise_for_status()
break
except Exception as e:
if retry < 1:
logger.info(f"can not reach\n{e}\nwaiting 1min")
await asyncio.sleep(60)
else:
logger.error(e)
return -7, {}
# 2. judge is article list (return all article url and flag 1) or article detail page
page_source = response.text
if page_source:
text = page_source
else:
try:
text = response.content.decode('utf-8')
except UnicodeDecodeError:
try:
text = response.content.decode('gbk')
except Exception as e:
logger.error(f"can not decode html {e}")
return -7, {}
soup = BeautifulSoup(text, "html.parser")
# Note: The scheme used here is very crude,
# it is recommended to write a separate parser for specific business scenarios
# Parse all URLs
if len(url) < 50:
urls = set()
for link in soup.find_all("a", href=True):
absolute_url = urljoin(base_url, link["href"])
format_url = urlparse(absolute_url)
# only record same domain links
if not format_url.netloc or format_url.netloc != domain:
continue
# remove the hash fragment (keep the query string, if any, properly separated)
absolute_url = f"{format_url.scheme}://{format_url.netloc}{format_url.path}" + (f"?{format_url.query}" if format_url.query else "")
if absolute_url != url:
urls.add(absolute_url)
if len(urls) > 24:
logger.info(f"{url} is more like an article list page, find {len(urls)} urls with the same netloc")
return 1, urls
# 3. try to use gne to extract the information
try:
result = extractor.extract(text)
if 'meta' in result:
del result['meta']
if result['title'].startswith('服务器错误') or result['title'].startswith('您访问的页面') or result[
'title'].startswith('403') \
or result['content'].startswith('This website uses cookies') or result['title'].startswith('出错了'):
logger.warning(f"can not get {url} from the Internet")
return -7, {}
if len(result['title']) < 4 or len(result['content']) < 24:
logger.info(f"gne extract not good: {result}")
result = None
except Exception as e:
logger.info(f"gne extract error: {e}")
result = None
# 4. try to use a llm to analysis the html
if not result:
html_text = text_from_soup(soup)
html_lines = html_text.split('\n')
html_lines = [line.strip() for line in html_lines if line.strip()]
html_text = "\n".join(html_lines)
if len(html_text) > 29999:
logger.info(f"{url} content too long for llm parsing")
return 0, {}
if not html_text or html_text.startswith('服务器错误') or html_text.startswith(
'您访问的页面') or html_text.startswith('403') \
or html_text.startswith('出错了'):
logger.warning(f"can not get {url} from the Internet")
return -7, {}
messages = [
{"role": "system", "content": sys_info},
{"role": "user", "content": html_text}
]
llm_output = openai_llm(messages, model=model, logger=logger, temperature=0.01)
result = json_repair.repair_json(llm_output, return_objects=True)
logger.debug(f"decoded_object: {result}")
if not isinstance(result, dict):
logger.debug("failed to parse from llm output")
return 0, {}
if 'title' not in result or 'content' not in result:
logger.debug("llm parsed result not good")
return 0, {}
# Extract the image links; the list will be empty if none can be extracted.
image_links = []
images = soup.find_all("img")
for img in images:
try:
image_links.append(urljoin(base_url, img["src"]))
except KeyError:
continue
result["images"] = image_links
# Extract the author information; it will be empty if it cannot be extracted.
author_element = soup.find("meta", {"name": "author"})
if author_element:
result["author"] = author_element["content"]
else:
result["author"] = ""
# 5. post process
date_str = extract_and_convert_dates(result.get('publish_time', ''))
if date_str:
result['publish_time'] = date_str
else:
result['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")
from_site = domain.replace('www.', '')
from_site = from_site.split('.')[0]
result['content'] = f"[from {from_site}] {result['content']}"
try:
meta_description = soup.find("meta", {"name": "description"})
if meta_description:
result['abstract'] = f"[from {from_site}] {meta_description['content'].strip()}"
else:
result['abstract'] = ''
except Exception:
result['abstract'] = ''
result['url'] = url
return 11, result

View File

@ -1,129 +0,0 @@
# -*- coding: utf-8 -*-
from typing import Union
import httpx
from bs4 import BeautifulSoup
from datetime import datetime
import re
import asyncio
header = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}
async def mp_crawler(url: str, logger) -> tuple[int, Union[set, dict]]:
if not url.startswith('https://mp.weixin.qq.com') and not url.startswith('http://mp.weixin.qq.com'):
logger.warning(f'{url} is not a mp url, you should not use this function')
return -5, {}
url = url.replace("http://", "https://", 1)
async with httpx.AsyncClient() as client:
for retry in range(2):
try:
response = await client.get(url, headers=header, timeout=30)
response.raise_for_status()
break
except Exception as e:
if retry < 1:
logger.info(f"{e}\nwaiting 1min")
await asyncio.sleep(60)
else:
logger.warning(e)
return -7, {}
soup = BeautifulSoup(response.text, 'html.parser')
if url.startswith('https://mp.weixin.qq.com/mp/appmsgalbum'):
# album page: a directory of articles
urls = {li.attrs['data-link'].replace("http://", "https://", 1) for li in soup.find_all('li', class_='album__list-item')}
simple_urls = set()
for url in urls:
cut_off_point = url.find('chksm=')
if cut_off_point != -1:
url = url[:cut_off_point - 1]
simple_urls.add(url)
return 1, simple_urls
# Get the original release date first
pattern = r"var createTime = '(\d{4}-\d{2}-\d{2}) \d{2}:\d{2}'"
match = re.search(pattern, response.text)
if match:
date_only = match.group(1)
publish_time = date_only.replace('-', '')
else:
publish_time = datetime.strftime(datetime.today(), "%Y%m%d")
# Get the description content from the <meta> tag
try:
meta_description = soup.find('meta', attrs={'name': 'description'})
summary = meta_description['content'].strip() if meta_description else ''
# card_info = soup.find('div', id='img-content')
# Parse the required content from the <div> tags
rich_media_title = soup.find('h1', id='activity-name').text.strip() \
if soup.find('h1', id='activity-name') \
else soup.find('h1', class_='rich_media_title').text.strip()
profile_nickname = soup.find('div', class_='wx_follow_nickname').text.strip()
except Exception as e:
logger.warning(f"not mp format: {url}\n{e}")
# For mp.weixin.qq.com types, mp_crawler won't work, and most likely neither will the other two
return -7, {}
if not rich_media_title or not profile_nickname:
logger.warning(f"failed to analysis {url}, no title or profile_nickname")
return -7, {}
# Parse text and image links within the content interval
# TODO: This scheme is compatible with picture-sharing MP articles, but the pictures inside the content cannot be obtained,
# because the structure of that part is completely different and a separate parsing scheme would need to be written
# (the proportion of this type of article is not high, though).
texts = []
images = set()
content_area = soup.find('div', id='js_content')
if content_area:
# extract the text
for section in content_area.find_all(['section', 'p'], recursive=False):  # iterate over the top-level sections
text = section.get_text(separator=' ', strip=True)
if text and text not in texts:
texts.append(text)
for img in content_area.find_all('img', class_='rich_pages wxw-img'):
img_src = img.get('data-src') or img.get('src')
if img_src:
images.add(img_src)
cleaned_texts = [t for t in texts if t.strip()]
content = '\n'.join(cleaned_texts)
else:
logger.warning(f"failed to analysis contents {url}")
return 0, {}
if content:
content = f"[from {profile_nickname}]{content}"
else:
# If there is no content but there is a summary, this is a picture-sharing type of MP article;
# in that case, use the summary as the content.
content = f"[from {profile_nickname}]{summary}"
# Get image links from <meta property="og:image"> and <meta property="twitter:image">
og_image = soup.find('meta', property='og:image')
twitter_image = soup.find('meta', property='twitter:image')
if og_image:
images.add(og_image['content'])
if twitter_image:
images.add(twitter_image['content'])
if rich_media_title == summary or not summary:
abstract = ''
else:
abstract = f"[from {profile_nickname}]{rich_media_title}——{summary}"
return 11, {
'title': rich_media_title,
'author': profile_nickname,
'publish_time': publish_time,
'abstract': abstract,
'content': content,
'images': list(images),
'url': url,
}

View File

@ -1,5 +1,5 @@
#!/bin/bash
set -o allexport
source ../.env
source .env
set +o allexport
exec uvicorn backend:app --reload --host localhost --port 8077

View File

@ -1,5 +1,5 @@
#!/bin/bash
set -o allexport
source ../.env
source .env
set +o allexport
exec python tasks.py

View File

@ -1,5 +1,5 @@
import asyncio
from insights import pipeline, pb, logger
from agents import pipeline, pb, logger
counter = 1

View File

@ -1,7 +1,8 @@
from urllib.parse import urlparse
import os
import re
import jieba
# import jieba
from loguru import logger
def isURL(string):
@ -72,35 +73,27 @@ def extract_and_convert_dates(input_string):
break
if matches:
return ''.join(matches[0])
return None
return ''
def get_logger_level() -> str:
level_map = {
'silly': 'CRITICAL',
'verbose': 'DEBUG',
'info': 'INFO',
'warn': 'WARNING',
'error': 'ERROR',
}
level: str = os.environ.get('WS_LOG', 'info').lower()
if level not in level_map:
raise ValueError(
'WiseFlow LOG should support the values of `silly`, '
'`verbose`, `info`, `warn`, `error`'
)
return level_map.get(level, 'info')
def get_logger(logger_name: str, logger_file_path: str):
level = 'DEBUG' if os.environ.get("VERBOSE", "").lower() in ["true", "1"] else 'INFO'
logger_file = os.path.join(logger_file_path, f"{logger_name}.log")
if not os.path.exists(logger_file_path):
os.makedirs(logger_file_path)
logger.add(logger_file, level=level, backtrace=True, diagnose=True, rotation="50 MB")
return logger
"""
def compare_phrase_with_list(target_phrase, phrase_list, threshold):
"""
Compare the similarity of a target phrase to each phrase in the phrase list.
:param target_phrase: target phrase (str)
:param phrase_list: list of str
:param threshold: similarity threshold (float)
:return: list of phrases that satisfy the similarity condition (list of str)
"""
if not target_phrase:
return [] # The target phrase is empty, and the empty list is returned directly.
@ -112,3 +105,4 @@ def compare_phrase_with_list(target_phrase, phrase_list, threshold):
if len(target_tokens & tokens) / min(len(target_tokens), len(tokens)) > threshold]
return similar_phrases
"""

View File

@ -29,7 +29,8 @@ class PbTalker:
def read(self, collection_name: str, fields: Optional[List[str]] = None, filter: str = '', skiptotal: bool = True) -> list:
results = []
for i in range(1, 10):
i = 1
while True:
try:
res = self.client.collection(collection_name).get_list(i, 500,
{"filter": filter,
@ -44,6 +45,7 @@ class PbTalker:
for _res in res.items:
attributes = vars(_res)
results.append(attributes)
i += 1
return results
def add(self, collection_name: str, body: Dict) -> str:

View File

@ -1,71 +1,12 @@
**Included Web Dashboard Example**: This is optional. If you only use the data processing functions or have your own downstream task program, you can ignore everything in this folder!
**A complete user-side API is expected in version V0.3.9; for now, the content here is for reference only.**
## Main Features
The API is not directly tied to Core; the API also operates on the data store (including the storage of user settings), so nothing here affects using core directly.
1. Daily Insights Display
2. Daily Article Display
3. Appending Search for Specific Hot Topics (using Sogou engine)
4. Generating Word Reports for Specific Hot Topics
The initial version of the API is expected to include:
**Note: The code here cannot be used directly. It is adapted to an older version of the backend. You need to study the latest backend code in the `core` folder and make changes, especially in parts related to database integration!**
-----------------------------------------------------------------
The included web Dashboard example is not required. If you only use the data processing functions, or you have your own downstream task program, you can ignore everything inside this folder!
## Main Features
1. Daily insights display
2. Daily article display
3. Appending search for specified hot topics (using the Sogou engine)
4. Generating Word reports for specified hot topics
**Note: The code here cannot be used directly. It is adapted to an older version of the backend. You need to study the latest backend code in the `core` folder and make changes, especially the parts that interface with the database!**
-----------------------------------------------------------------
**Included Web Dashboard sample**: This is not required. If you only use the data processing functions, or you have your own downstream task program, you can ignore everything in this folder!
## Main Features
1. Daily insights display
2. Daily article display
3. Additional search for specific hot topics (using the Sogou engine)
4. Word report generation for specific hot topics
**Note: The code here cannot be used directly. It is adapted to an older version of the backend. You need to study the latest backend code in the `core` folder and make changes, especially in the parts that integrate with the database!**
-----------------------------------------------------------------
**Included Web Dashboard example**: This is optional. If you only use the data processing functions, or if you have your own downstream task program, you can ignore everything in this folder!
## Main Features
1. Daily insights display
2. Daily article display
3. Additional search for specific hot topics (using the Sogou engine)
4. Word report generation for specific hot topics
**Note: The code here cannot be used directly. It is adapted to an older version of the backend. You need to study the latest backend code in the `core` folder and make changes, especially in the parts related to database integration!**
-----------------------------------------------------------------
**Included Web Dashboard example**: This is optional. If you only use the data processing functions or have your own downstream task program, you can ignore everything in this folder!
## Main Features
1. Display daily insights
2. Display daily articles
3. Appended search for specific hot topics (using the Sogou search engine)
4. Generate Word reports for specific hot topics
**Note: The code here cannot be used directly. It is adapted to an older version of the backend. You need to study the latest backend code in the `core` folder and make changes, especially in the parts concerning database integration!**
- create, read, update and delete operations for sources;
- create, read, update and delete operations for focus points;
- reading and searching of insights;
- reading and searching of articles;
- a simple report-generation function;
- translation of raw materials, etc.

View File

@ -22,7 +22,7 @@ class BackendService:
def report(self, insight_id: str, topics: list[str], comment: str) -> dict:
logger.debug(f'got new report request insight_id {insight_id}')
insight = pb.read('insights', filter=f'id="{insight_id}"')
insight = pb.read('agents', filter=f'id="{insight_id}"')
if not insight:
logger.error(f'insight {insight_id} not found')
return self.build_out(-2, 'insight not found')
@ -52,7 +52,7 @@ class BackendService:
if flag:
file = open(docx_file, 'rb')
message = pb.upload('insights', insight_id, 'docx', f'{insight_id}.docx', file)
message = pb.upload('agents', insight_id, 'docx', f'{insight_id}.docx', file)
file.close()
if message:
logger.debug(f'report success finish and update to: {message}')
@ -143,7 +143,7 @@ class BackendService:
def more_search(self, insight_id: str) -> dict:
logger.debug(f'got search request for insight {insight_id}')
insight = pb.read('insights', filter=f'id="{insight_id}"')
insight = pb.read('agents', filter=f'id="{insight_id}"')
if not insight:
logger.error(f'insight {insight_id} not found')
return self.build_out(-2, 'insight not found')
@ -169,7 +169,7 @@ class BackendService:
with open(os.path.join(self.cache_url, 'cache_articles.json'), 'a', encoding='utf-8') as f:
json.dump(item, f, ensure_ascii=False, indent=4)
message = pb.update(collection_name='insights', id=insight_id, body={'articles': article_ids})
message = pb.update(collection_name='agents', id=insight_id, body={'articles': article_ids})
if message:
logger.debug(f'insight search success finish and update to: {message}')
return self.build_out(11, insight_id)