V0.3.8 readme

bigbrother666sh 2025-01-24 21:42:52 +08:00
parent b1ef7a23d1
commit fdfe5c0ec8
10 changed files with 10 additions and 228 deletions

View File

@ -35,11 +35,11 @@
Provided a custom extractor interface so users can tailor extraction to their actual needs (see the sketch after this changelog excerpt).
- Bug fixes and other improvements (crawl4ai browser lifecycle management, asynchronous llm wrapper, etc.) (thanks to @tusik for contributing the asynchronous llm wrapper)
- Bug fixes and other improvements (crawl4ai browser lifecycle management, asynchronous llm wrapper, etc.) (thanks to @tusik for contributing)
Bug fixes and other improvements (crawl4ai browser lifecycle management, asynchronous llm wrapper, etc.)
Thanks to @tusik for contributing the asynchronous LLM wrapper
Thanks to @tusik for contributing
# V0.3.6
- Switched to Crawl4ai as the underlying crawler framework. In practice, Crawl4ai and Crawlee fetch pages about equally well, and both are built on Playwright, but Crawl4ai's html2markdown feature is very practical and helps a lot with LLM information extraction; Crawl4ai's architecture also fits my approach better;
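
The custom extractor interface mentioned in the changelog excerpt above is consumed in `general_process.py` (the deleted copy appears further down this commit) through the `custom_scrapers` mapping, which is keyed by domain and must return pre-parsed fields. Below is a minimal, hypothetical sketch of such an extractor; `ScraperResult`, `example_com_scraper`, and `example.com` are illustrative names, not repo code:

```python
from dataclasses import dataclass, field

@dataclass
class ScraperResult:
    # the fields general_process.py reads back from a custom scraper
    content: str = ''                       # markdown produced by the extractor
    images: list = field(default_factory=list)
    title: str = ''
    base: str = ''
    author: str = ''
    publish_date: str = ''

def example_com_scraper(crawl_result) -> ScraperResult:
    # turn a raw crawl4ai result into the fields the pipeline expects
    return ScraperResult(content=crawl_result.markdown or '', title='Example page')

# registration keyed by domain, mirroring the lookup in general_process.py
custom_scrapers = {'example.com': example_com_scraper}
```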

View File

@ -89,10 +89,6 @@ the wiseflow 0.3.x releases use pocketbase as the database; of course, you can also manually
🌟 **This is different from previous versions** - starting from V0.3.5, the .env file needs to be placed in the [core](./core) folder.
**Windows users can refer to the windows.env and windows_run.py files in the core folder and execute the windows_run.py script**
Thanks to @c469591 for contributing the native Windows startup script
#### 3.1 Large Language Model Configuration
wiseflow is an LLM-native application, so please make sure to provide the program with a stable LLM service.

View File

@ -88,10 +88,6 @@ For details, please refer to [pb/README.md](/pb/README.md)
🌟 **This is different from previous versions** - starting from V0.3.5, the .env file needs to be placed in the [core](./core) folder.
**Windows users can refer to the windows.env and windows_run.py files in the core folder and execute the windows_run.py script**
Thanks to @c469591 for contributing the native Windows startup script
#### 3.1 Large Language Model Configuration
Wiseflow is an LLM-native application, so please ensure you provide a stable LLM service for the program.
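
For reference, a minimal sketch (not repo code) that verifies the LLM-related settings wiseflow reads from `core/.env`; the keys mirror the `windows.env` template shown later in this commit:

```python
import os
from dotenv import load_dotenv

# adjust the path to wherever your .env actually lives
load_dotenv("core/.env")

for key in ("LLM_API_KEY", "LLM_API_BASE", "PRIMARY_MODEL", "PB_API_AUTH"):
    if not os.environ.get(key):
        print(f"warning: {key} is not set; LLM or pocketbase calls will fail")
```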

View File

@ -89,10 +89,6 @@ Wiseflow 0.3.x uses pocketbase as the database
🌟 **This is different from previous versions** - from V0.3.5 onward, the .env file must be placed in the [core](./core) folder.
**Windows users should refer to the windows.env and windows_run.py files in the core folder and run the windows_run.py script**
Thanks to @c469591 for contributing the native Windows startup script
#### 3.1 Large Language Model Configuration
Wiseflow is an LLM-native application, so please make sure to provide the program with a stable LLM service.

View File

@ -88,10 +88,6 @@ Wiseflow 0.3.x uses pocketbase as the database. pocketbas
🌟 **This is different from previous versions** - starting from V0.3.5, the .env file must be located in the [core](./core) folder.
**Windows users can refer to the windows.env and windows_run.py files in the core folder and run the windows_run.py script**
Thanks to @c469591 for contributing the native Windows startup script
#### 3.1 Large Language Model Configuration
Wiseflow is an LLM-native application, so please provide the program with a stable LLM service.

View File

@ -1,6 +0,0 @@
LLM_API_KEY=""
LLM_API_BASE="https://api.openai.com/v1/"
PRIMARY_MODEL="gpt-4o-2024-11-20"
VL_MODEL="gpt-4o-2024-11-20"
PB_API_AUTH="your_email|your_password"
PROJECT_DIR="work_dir"

View File

@ -1,192 +0,0 @@
# -*- coding: utf-8 -*-
# general_process.py
from pathlib import Path
from dotenv import load_dotenv
# Load environment variables from windows.env if it sits next to this file
env_path = Path(__file__).parent / 'windows.env'
if env_path.exists():
    load_dotenv(env_path)

import os
from utils.pb_api import PbTalker
from utils.general_utils import get_logger, extract_and_convert_dates, is_chinese
from agents.get_info import *
import json
import asyncio
from scrapers import *
from urllib.parse import urlparse
from crawl4ai import AsyncWebCrawler, CacheMode
from datetime import datetime, timedelta
import logging

logging.getLogger("httpx").setLevel(logging.WARNING)

project_dir = os.environ.get("PROJECT_DIR", "")
if project_dir:
    os.makedirs(project_dir, exist_ok=True)

wiseflow_logger = get_logger('general_process', project_dir)
pb = PbTalker(wiseflow_logger)

# only urls already saved within the last month count as duplicates
one_month_ago = (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')
existing_urls = {url['url'] for url in pb.read(collection_name='infos', fields=['url'], filter=f"created>='{one_month_ago}'")}

crawler = AsyncWebCrawler(verbose=False)
model = os.environ.get("PRIMARY_MODEL", "")
if not model:
    raise ValueError("PRIMARY_MODEL not set, please set it in environment variables or edit core/.env")
secondary_model = os.environ.get("SECONDARY_MODEL", model)


async def save_to_pb(url: str, url_title: str, infos: list):
    # saving to pb process
    for info in infos:
        info['url'] = url
        info['url_title'] = url_title
        _ = pb.add(collection_name='infos', body=info)
        if not _:
            wiseflow_logger.error('add info failed, writing to cache_file')
            timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
            with open(os.path.join(project_dir, f'{timestamp}_cache_infos.json'), 'w', encoding='utf-8') as f:
                json.dump(info, f, ensure_ascii=False, indent=4)


async def main_process(_sites: set | list):
    # collect the focus points the user set in the pb database and determine the system prompt language from them
    focus_data = pb.read(collection_name='focus_points', filter=f'activated=True')
    if not focus_data:
        wiseflow_logger.info('no activated tag found, will ask user to create one')
        focus = input('It seems you have not set any focus point, WiseFlow need the specific focus point to guide the following info extract job.'
                      'so please input one now. describe what info you care about shortly: ')
        explanation = input('Please provide more explanation for the focus point (if not necessary, pls just press enter: ')
        focus_data.append({"focuspoint": focus, "explanation": explanation,
                           "id": pb.add('focus_points', {"focuspoint": focus, "explanation": explanation})})

    focus_dict = {item["focuspoint"]: item["id"] for item in focus_data}
    focus_statement = ''
    for item in focus_data:
        tag = item["focuspoint"]
        expl = item["explanation"]
        focus_statement = f"{focus_statement}//{tag}//\n"
        if expl:
            if is_chinese(expl):
                focus_statement = f"{focus_statement}解释:{expl}\n"
            else:
                focus_statement = f"{focus_statement}Explanation: {expl}\n"

    date_stamp = datetime.now().strftime('%Y-%m-%d')
    if is_chinese(focus_statement):
        get_link_sys_prompt = get_link_system.replace('{focus_statement}', focus_statement)
        get_link_sys_prompt = f"今天的日期是{date_stamp}，{get_link_sys_prompt}"
        get_link_suffix_prompt = get_link_suffix
        get_info_sys_prompt = get_info_system.replace('{focus_statement}', focus_statement)
        get_info_sys_prompt = f"今天的日期是{date_stamp}，{get_info_sys_prompt}"
        get_info_suffix_prompt = get_info_suffix
    else:
        get_link_sys_prompt = get_link_system_en.replace('{focus_statement}', focus_statement)
        get_link_sys_prompt = f"today is {date_stamp}, {get_link_sys_prompt}"
        get_link_suffix_prompt = get_link_suffix_en
        get_info_sys_prompt = get_info_system_en.replace('{focus_statement}', focus_statement)
        get_info_sys_prompt = f"today is {date_stamp}, {get_info_sys_prompt}"
        get_info_suffix_prompt = get_info_suffix_en

    recognized_img_cache = {}
    working_list = set()
    working_list.update(_sites)
    await crawler.start()
    # crawl loop: pop urls from the working list; related urls discovered along the way are added back
    while working_list:
        url = working_list.pop()
        existing_urls.add(url)
        wiseflow_logger.debug(f'process new url, still {len(working_list)} urls in working list')
        has_common_ext = any(url.lower().endswith(ext) for ext in common_file_exts)
        if has_common_ext:
            wiseflow_logger.debug(f'{url} is a common file, skip')
            continue

        parsed_url = urlparse(url)
        existing_urls.add(f"{parsed_url.scheme}://{parsed_url.netloc}")
        existing_urls.add(f"{parsed_url.scheme}://{parsed_url.netloc}/")
        domain = parsed_url.netloc
        if domain in custom_fetching_configs:
            wiseflow_logger.debug(f'{url} will using custom crawl4ai run config')
            run_config = custom_fetching_configs[domain]
        else:
            run_config = crawler_config

        run_config.cache_mode = CacheMode.WRITE_ONLY if url in _sites else CacheMode.ENABLED
        result = await crawler.arun(url=url, config=run_config)
        if not result.success:
            wiseflow_logger.warning(f'{url} failed to crawl, destination web cannot reach, skip')
            continue
        metadata_dict = result.metadata if result.metadata else {}

        # domain-specific scrapers return pre-parsed fields; otherwise fall back to crawl4ai markdown + metadata
        if domain in custom_scrapers:
            result = custom_scrapers[domain](result)
            raw_markdown = result.content
            used_img = result.images
            title = result.title
            base_url = result.base
            author = result.author
            publish_date = result.publish_date
        else:
            raw_markdown = result.markdown
            media_dict = result.media if result.media else {}
            used_img = [d['src'] for d in media_dict.get('images', [])]
            title = ''
            base_url = ''
            author = ''
            publish_date = ''
        if not raw_markdown:
            wiseflow_logger.warning(f'{url} no content, something during fetching failed, skip')
            continue

        if not title:
            title = metadata_dict.get('title', '')
        if not base_url:
            base_url = metadata_dict.get('base', '')
        if not base_url:
            base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
        if not author:
            author = metadata_dict.get('author', '')
        if not publish_date:
            publish_date = metadata_dict.get('publish_date', '')

        link_dict, links_parts, contents, recognized_img_cache = await pre_process(raw_markdown, base_url, used_img, recognized_img_cache, existing_urls)

        if link_dict and links_parts:
            prompts = [get_link_sys_prompt, get_link_suffix_prompt, secondary_model]
            links_texts = []
            for _parts in links_parts:
                links_texts.extend(_parts.split('\n\n'))
            more_url = await get_more_related_urls(links_texts, link_dict, prompts, _logger=wiseflow_logger)
            if more_url:
                wiseflow_logger.debug(f'get {len(more_url)} more related urls, will add to working list')
                working_list.update(more_url - existing_urls)

        if not contents:
            continue

        if not author or author.lower() == 'na' or not publish_date or publish_date.lower() == 'na':
            author, publish_date = await get_author_and_publish_date(raw_markdown, model, _logger=wiseflow_logger)
        if not author or author.lower() == 'na':
            author = parsed_url.netloc
        if publish_date:
            publish_date = extract_and_convert_dates(publish_date)
        else:
            publish_date = date_stamp

        prompts = [get_info_sys_prompt, get_info_suffix_prompt, model]
        infos = await get_info(contents, link_dict, prompts, focus_dict, author, publish_date, _logger=wiseflow_logger)
        if infos:
            wiseflow_logger.debug(f'get {len(infos)} infos, will save to pb')
            await save_to_pb(url, title, infos)
    await crawler.close()


if __name__ == '__main__':
    sites = pb.read('sites', filter='activated=True')
    wiseflow_logger.info('execute all sites one time')
    asyncio.run(main_process([site['url'] for site in sites]))
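
The module above is what the scheduled entry point drives; below is a hypothetical periodic driver (not the repo's actual `run_task.py`), built only from the imports and the `pb.read('sites', ...)` call shown above:

```python
import asyncio
from general_process import main_process, pb, wiseflow_logger

async def schedule_pipeline(interval_hours: int = 4):
    # re-run the whole pipeline over the activated sites at a fixed interval
    while True:
        sites = pb.read('sites', filter='activated=True')
        wiseflow_logger.info(f'task execute loop, {len(sites)} sites')
        await main_process([site['url'] for site in sites])
        await asyncio.sleep(interval_hours * 3600)

if __name__ == '__main__':
    asyncio.run(schedule_pipeline())
```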

View File

@ -72,7 +72,6 @@ def main():
print(f"Error: run_task.py not found at: {process_script}")
except subprocess.CalledProcessError as e:
print(f"Error running run_task.py: {e}")
else:
print("Failed to start services")

View File

@ -10,7 +10,6 @@ core_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'core
sys.path.append(core_path)
# Modules can now be imported directly because the core directory is already on the Python path
from scrapers import *
from utils.general_utils import is_chinese
from agents.get_info import get_author_and_publish_date, get_info, get_more_related_urls
from agents.get_info_prompts import *
@ -20,7 +19,7 @@ benchmark_model = 'Qwen/Qwen2.5-72B-Instruct'
# models = ['deepseek-reasoner']
models = ['Qwen/Qwen2.5-7B-Instruct', 'Qwen/Qwen2.5-14B-Instruct', 'Qwen/Qwen2.5-32B-Instruct', 'deepseek-ai/DeepSeek-V2.5']
async def main(sample: dict, include_ap: bool, prompts: list, focus_dict: dict, record_file: str):
async def main(sample: dict, include_ap: bool, prompts: list, record_file: str):
    link_dict, links_parts, contents = sample['link_dict'], sample['links_part'], sample['contents']
    get_link_sys_prompt, get_link_suffix_prompt, get_info_sys_prompt, get_info_suffix_prompt = prompts
@ -91,16 +90,16 @@ if __name__ == '__main__':
    focus_points = json.load(open(os.path.join(sample_dir, 'focus_point.json'), 'r'))
    focus_statement = ''
    for item in focus_points:
        tag = item["focuspoint"]
        expl = item["explanation"]
        focus_statement = f"{focus_statement}//{tag}//\n"
        tag = item["focuspoint"].strip()
        expl = item["explanation"].strip()
        focus_statement = f"{focus_statement}//{tag}//"
        if expl:
            if is_chinese(expl):
                focus_statement = f"{focus_statement}解释:{expl}\n"
                focus_statement = f"{focus_statement}\n解释:{expl}\n"
            else:
                focus_statement = f"{focus_statement}Explanation: {expl}\n"
                focus_statement = f"{focus_statement}\nExplanation: {expl}\n"
    focus_dict = {item["focuspoint"]: item["focuspoint"] for item in focus_points}
    #focus_dict = {item["focuspoint"]: item["focuspoint"] for item in focus_points}
    date_stamp = datetime.now().strftime('%Y-%m-%d')
    if is_chinese(focus_statement):
        get_link_sys_prompt = get_link_system.replace('{focus_statement}', focus_statement)
@ -134,4 +133,4 @@ if __name__ == '__main__':
        with open(record_file, 'a') as f:
            f.write(f"raw materials: {file}\n\n")
        print(f'start testing {file}')
        asyncio.run(main(sample, include_ap, prompts, focus_dict, record_file))
        asyncio.run(main(sample, include_ap, prompts, record_file))
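
To see what the reworked `focus_statement` looks like, here is a standalone illustration (not repo code) with one hypothetical focus point; the Chinese (`is_chinese`) branch is omitted:

```python
focus_points = [{"focuspoint": " AI funding news ", "explanation": " rounds above $10M "}]

focus_statement = ''
for item in focus_points:
    tag = item["focuspoint"].strip()
    expl = item["explanation"].strip()
    focus_statement = f"{focus_statement}//{tag}//"
    if expl:
        focus_statement = f"{focus_statement}\nExplanation: {expl}\n"

print(focus_statement)
# //AI funding news//
# Explanation: rounds above $10M
```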

View File

@ -19,9 +19,7 @@ logging.getLogger("httpx").setLevel(logging.WARNING)
from general_process import main_process, wiseflow_logger, pb
from typing import Optional
import logging
logging.getLogger("httpx").setLevel(logging.WARNING)
# Important: when scanning the QR code to log in, do NOT select "同步历史消息" (sync history messages), otherwise the bot will start replying to every historical message one by one
# First check the wx login status and, at the same time, get the wxid of the logged-in WeChat account