diff --git a/CHANGELOG.md b/CHANGELOG.md
index a90130f..67472cf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -35,11 +35,11 @@
 
   Provided a custom extractor interface to allow users to customize according to actual needs.
 
-- bug 修复以及其他改进(crawl4ai浏览器生命周期管理,异步 llm wrapper 等)(感谢 @tusik 贡献异步 llm wrapper)
+- bug 修复以及其他改进(crawl4ai浏览器生命周期管理,异步 llm wrapper 等)(感谢 @tusik 贡献)
 
   Bug fixes and other improvements (crawl4ai browser lifecycle management, asynchronous llm wrapper, etc.)
-  Thanks to @tusik for contributing the asynchronous LLM wrapper
+  Thanks to @tusik for contributing
 
 # V0.3.6
 - 改用 Crawl4ai 作为底层爬虫框架,其实Crawl4ai 和 Crawlee 的获取效果差别不大,二者也都是基于 Playwright ,但 Crawl4ai 的 html2markdown 功能很实用,而这对llm 信息提取作用很大,另外 Crawl4ai 的架构也更加符合我的思路;
diff --git a/README.md b/README.md
index 8666d87..904dc86 100644
--- a/README.md
+++ b/README.md
@@ -89,10 +89,6 @@ wiseflow 0.3.x版本使用 pocketbase 作为数据库,你当然也可以手动
 
 🌟 **这里与之前版本不同**,V0.3.5开始需要把 .env 放置在 [core](./core) 文件夹中。
 
-**windows 用户可以参考 core文件夹下的 windows.env windows_run.py 文件,执行 windows_run.py 脚本**
-
-感谢 @c469591 贡献的 windows 下原生启动脚本
-
 #### 3.1 大模型相关配置
 
 wiseflow 是 LLM 原生应用,请务必保证为程序提供稳定的 LLM 服务。
diff --git a/README_EN.md b/README_EN.md
index 18e2370..17deeca 100644
--- a/README_EN.md
+++ b/README_EN.md
@@ -88,10 +88,6 @@ For details, please refer to [pb/README.md](/pb/README.md)
 
 🌟 **This is different from previous versions** - starting from V0.3.5, the .env file needs to be placed in the [core](./core) folder.
 
-**Windows users can refer to the windows.env and windows_run.py files in the core folder and execute the windows_run.py script**
-
-Thanks to @c469591 for contributing the native Windows startup script
-
 #### 3.1 Large Language Model Configuration
 
 Wiseflow is a LLM native application, so please ensure you provide stable LLM service for the program.
diff --git a/README_JP.md b/README_JP.md
index b78b0d1..4ada65c 100644
--- a/README_JP.md
+++ b/README_JP.md
@@ -89,10 +89,6 @@ Wiseflow 0.3.xはデータベースとしてpocketbaseを使用しています
 
 🌟 **これは以前のバージョンとは異なります** - V0.3.5以降、.envファイルは[core](./core)フォルダに配置する必要があります。
 
-**Windowsユーザーはcoreフォルダ内のwindows.envとwindows_run.pyファイルを参照し、windows_run.pyスクリプトを実行してください**
-
-@c469591によるWindows用ネイティブ起動スクリプトの貢献に感謝いたします
-
 #### 3.1 大規模言語モデルの設定
 
 Wiseflowは LLMネイティブアプリケーションですので、プログラムに安定したLLMサービスを提供するようにしてください。
diff --git a/README_KR.md b/README_KR.md
index 8edf0d1..c75bc45 100644
--- a/README_KR.md
+++ b/README_KR.md
@@ -88,10 +88,6 @@ Wiseflow 0.3.x는 데이터베이스로 pocketbase를 사용합니다. pocketbas
 
 🌟 **이전 버전과 다릅니다** - V0.3.5부터 .env 파일은 [core](./core) 폴더에 위치해야 합니다.
 
-**windows 사용자는 core 폴더의 windows.env와 windows_run.py 파일을 참조하여 windows_run.py 스크립트를 실행할 수 있습니다**
-
-@c469591님이 기여해 주신 windows용 네이티브 실행 스크립트에 감사드립니다
-
 #### 3.1 대규모 언어 모델 구성
 
 Wiseflow는 LLM 네이티브 애플리케이션이므로 프로그램에 안정적인 LLM 서비스를 제공하도록 해주세요.
diff --git a/core/windows.env b/core/windows.env
deleted file mode 100644
index 2292d30..0000000
--- a/core/windows.env
+++ /dev/null
@@ -1,6 +0,0 @@
-LLM_API_KEY=""
-LLM_API_BASE="https://api.openai.com/v1/"
-PRIMARY_MODEL="gpt-4o-2024-11-20"
-VL_MODEL="gpt-4o-2024-11-20"
-PB_API_AUTH="your_email|your_password"
-PROJECT_DIR="work_dir"
\ No newline at end of file
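For anyone migrating off the deleted file: windows.env carried the same keys that the updated READMEs now direct every platform to put into a single core/.env. A minimal core/.env under that assumption (values are the placeholders from the deleted file; SECONDARY_MODEL is optional and falls back to PRIMARY_MODEL in the processing script):

```
LLM_API_KEY=""
LLM_API_BASE="https://api.openai.com/v1/"
PRIMARY_MODEL="gpt-4o-2024-11-20"
VL_MODEL="gpt-4o-2024-11-20"
PB_API_AUTH="your_email|your_password"
PROJECT_DIR="work_dir"
```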
diff --git a/core/windows_general_process.py b/core/windows_general_process.py
deleted file mode 100644
index a2ca827..0000000
--- a/core/windows_general_process.py
+++ /dev/null
@@ -1,192 +0,0 @@
-# -*- coding: utf-8 -*-
-# general_process.py
-from pathlib import Path
-from dotenv import load_dotenv
-
-# 加載環境變數
-env_path = Path(__file__).parent / 'windows.env'
-if env_path.exists():
-    load_dotenv(env_path)
-
-import os
-from utils.pb_api import PbTalker
-from utils.general_utils import get_logger, extract_and_convert_dates, is_chinese
-from agents.get_info import *
-import json
-import asyncio
-from scrapers import *
-from urllib.parse import urlparse
-from crawl4ai import AsyncWebCrawler, CacheMode
-from datetime import datetime, timedelta
-import logging
-
-logging.getLogger("httpx").setLevel(logging.WARNING)
-
-project_dir = os.environ.get("PROJECT_DIR", "")
-if project_dir:
-    os.makedirs(project_dir, exist_ok=True)
-
-wiseflow_logger = get_logger('general_process', project_dir)
-pb = PbTalker(wiseflow_logger)
-one_month_ago = (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')
-existing_urls = {url['url'] for url in pb.read(collection_name='infos', fields=['url'], filter=f"created>='{one_month_ago}'")}
-
-crawler = AsyncWebCrawler(verbose=False)
-model = os.environ.get("PRIMARY_MODEL", "")
-if not model:
-    raise ValueError("PRIMARY_MODEL not set, please set it in environment variables or edit core/.env")
-secondary_model = os.environ.get("SECONDARY_MODEL", model)
-
-async def save_to_pb(url: str, url_title: str, infos: list):
-    # saving to pb process
-    for info in infos:
-        info['url'] = url
-        info['url_title'] = url_title
-        _ = pb.add(collection_name='infos', body=info)
-        if not _:
-            wiseflow_logger.error('add info failed, writing to cache_file')
-            timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
-            with open(os.path.join(project_dir, f'{timestamp}_cache_infos.json'), 'w', encoding='utf-8') as f:
-                json.dump(info, f, ensure_ascii=False, indent=4)
-
-
-async def main_process(_sites: set | list):
-    # collect tags user set in pb database and determin the system prompt language based on tags
-    focus_data = pb.read(collection_name='focus_points', filter=f'activated=True')
-    if not focus_data:
-        wiseflow_logger.info('no activated tag found, will ask user to create one')
-        focus = input('It seems you have not set any focus point, WiseFlow need the specific focus point to guide the following info extract job.'
-                      'so please input one now. describe what info you care about shortly: ')
-        explanation = input('Please provide more explanation for the focus point (if not necessary, pls just press enter: ')
-        focus_data.append({"focuspoint": focus, "explanation": explanation,
-                           "id": pb.add('focus_points', {"focuspoint": focus, "explanation": explanation})})
-
-
-    focus_dict = {item["focuspoint"]: item["id"] for item in focus_data}
-    focus_statement = ''
-    for item in focus_data:
-        tag = item["focuspoint"]
-        expl = item["explanation"]
-        focus_statement = f"{focus_statement}//{tag}//\n"
-        if expl:
-            if is_chinese(expl):
-                focus_statement = f"{focus_statement}解释:{expl}\n"
-            else:
-                focus_statement = f"{focus_statement}Explanation: {expl}\n"
-
-    date_stamp = datetime.now().strftime('%Y-%m-%d')
-    if is_chinese(focus_statement):
-        get_link_sys_prompt = get_link_system.replace('{focus_statement}', focus_statement)
-        get_link_sys_prompt = f"今天的日期是{date_stamp},{get_link_sys_prompt}"
-        get_link_suffix_prompt = get_link_suffix
-        get_info_sys_prompt = get_info_system.replace('{focus_statement}', focus_statement)
-        get_info_sys_prompt = f"今天的日期是{date_stamp},{get_info_sys_prompt}"
-        get_info_suffix_prompt = get_info_suffix
-    else:
-        get_link_sys_prompt = get_link_system_en.replace('{focus_statement}', focus_statement)
-        get_link_sys_prompt = f"today is {date_stamp}, {get_link_sys_prompt}"
-        get_link_suffix_prompt = get_link_suffix_en
-        get_info_sys_prompt = get_info_system_en.replace('{focus_statement}', focus_statement)
-        get_info_sys_prompt = f"today is {date_stamp}, {get_info_sys_prompt}"
-        get_info_suffix_prompt = get_info_suffix_en
-
-    recognized_img_cache = {}
-    working_list = set()
-    working_list.update(_sites)
-    await crawler.start()
-    while working_list:
-        url = working_list.pop()
-        existing_urls.add(url)
-        wiseflow_logger.debug(f'process new url, still {len(working_list)} urls in working list')
-        has_common_ext = any(url.lower().endswith(ext) for ext in common_file_exts)
-        if has_common_ext:
-            wiseflow_logger.debug(f'{url} is a common file, skip')
-            continue
-
-        parsed_url = urlparse(url)
-        existing_urls.add(f"{parsed_url.scheme}://{parsed_url.netloc}")
-        existing_urls.add(f"{parsed_url.scheme}://{parsed_url.netloc}/")
-        domain = parsed_url.netloc
-        if domain in custom_fetching_configs:
-            wiseflow_logger.debug(f'{url} will using custom crawl4ai run config')
-            run_config = custom_fetching_configs[domain]
-        else:
-            run_config = crawler_config
-
-        run_config.cache_mode = CacheMode.WRITE_ONLY if url in _sites else CacheMode.ENABLED
-        result = await crawler.arun(url=url, config=run_config)
-        if not result.success:
-            wiseflow_logger.warning(f'{url} failed to crawl, destination web cannot reach, skip')
-            continue
-        metadata_dict = result.metadata if result.metadata else {}
-
-        if domain in custom_scrapers:
-            result = custom_scrapers[domain](result)
-            raw_markdown = result.content
-            used_img = result.images
-            title = result.title
-            base_url = result.base
-            author = result.author
-            publish_date = result.publish_date
-        else:
-            raw_markdown = result.markdown
-            media_dict = result.media if result.media else {}
-            used_img = [d['src'] for d in media_dict.get('images', [])]
-            title = ''
-            base_url = ''
-            author = ''
-            publish_date = ''
-        if not raw_markdown:
-            wiseflow_logger.warning(f'{url} no content, something during fetching failed, skip')
-            continue
-
-        if not title:
-            title = metadata_dict.get('title', '')
-        if not base_url:
-            base_url = metadata_dict.get('base', '')
-        if not base_url:
-            base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
-
-        if not author:
-            author = metadata_dict.get('author', '')
-        if not publish_date:
-            publish_date = metadata_dict.get('publish_date', '')
-
-        link_dict, links_parts, contents, recognized_img_cache = await pre_process(raw_markdown, base_url, used_img, recognized_img_cache, existing_urls)
-
-        if link_dict and links_parts:
-            prompts = [get_link_sys_prompt, get_link_suffix_prompt, secondary_model]
-            links_texts = []
-            for _parts in links_parts:
-                links_texts.extend(_parts.split('\n\n'))
-            more_url = await get_more_related_urls(links_texts, link_dict, prompts, _logger=wiseflow_logger)
-            if more_url:
-                wiseflow_logger.debug(f'get {len(more_url)} more related urls, will add to working list')
-                working_list.update(more_url - existing_urls)
-
-        if not contents:
-            continue
-
-        if not author or author.lower() == 'na' or not publish_date or publish_date.lower() == 'na':
-            author, publish_date = await get_author_and_publish_date(raw_markdown, model, _logger=wiseflow_logger)
-
-        if not author or author.lower() == 'na':
-            author = parsed_url.netloc
-
-        if publish_date:
-            publish_date = extract_and_convert_dates(publish_date)
-        else:
-            publish_date = date_stamp
-
-        prompts = [get_info_sys_prompt, get_info_suffix_prompt, model]
-        infos = await get_info(contents, link_dict, prompts, focus_dict, author, publish_date, _logger=wiseflow_logger)
-        if infos:
-            wiseflow_logger.debug(f'get {len(infos)} infos, will save to pb')
-            await save_to_pb(url, title, infos)
-    await crawler.close()
-
-if __name__ == '__main__':
-
-    sites = pb.read('sites', filter='activated=True')
-    wiseflow_logger.info('execute all sites one time')
-    asyncio.run(main_process([site['url'] for site in sites]))
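With this Windows-specific fork of the pipeline deleted, Windows users presumably run the same unified entry point as everyone else. A minimal sketch of how that entry point is driven, reconstructed from the `__main__` block above; the assumptions are that core/general_process.py keeps the same interface (its `main_process`, `pb`, and `wiseflow_logger` exports are confirmed by the weixin_mp import below) and that it loads core/.env rather than windows.env:

```python
# sketch: driving the unified pipeline after the Windows fork was removed
from pathlib import Path
from dotenv import load_dotenv

# core/.env replaces the deleted windows.env (per the README change above)
load_dotenv(Path(__file__).parent / '.env')

import asyncio
from general_process import main_process, pb, wiseflow_logger

if __name__ == '__main__':
    # same one-shot run over all activated sites as the deleted script
    sites = pb.read('sites', filter='activated=True')
    wiseflow_logger.info('execute all sites one time')
    asyncio.run(main_process([site['url'] for site in sites]))
```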
f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}" - - if not author: - author = metadata_dict.get('author', '') - if not publish_date: - publish_date = metadata_dict.get('publish_date', '') - - link_dict, links_parts, contents, recognized_img_cache = await pre_process(raw_markdown, base_url, used_img, recognized_img_cache, existing_urls) - - if link_dict and links_parts: - prompts = [get_link_sys_prompt, get_link_suffix_prompt, secondary_model] - links_texts = [] - for _parts in links_parts: - links_texts.extend(_parts.split('\n\n')) - more_url = await get_more_related_urls(links_texts, link_dict, prompts, _logger=wiseflow_logger) - if more_url: - wiseflow_logger.debug(f'get {len(more_url)} more related urls, will add to working list') - working_list.update(more_url - existing_urls) - - if not contents: - continue - - if not author or author.lower() == 'na' or not publish_date or publish_date.lower() == 'na': - author, publish_date = await get_author_and_publish_date(raw_markdown, model, _logger=wiseflow_logger) - - if not author or author.lower() == 'na': - author = parsed_url.netloc - - if publish_date: - publish_date = extract_and_convert_dates(publish_date) - else: - publish_date = date_stamp - - prompts = [get_info_sys_prompt, get_info_suffix_prompt, model] - infos = await get_info(contents, link_dict, prompts, focus_dict, author, publish_date, _logger=wiseflow_logger) - if infos: - wiseflow_logger.debug(f'get {len(infos)} infos, will save to pb') - await save_to_pb(url, title, infos) - await crawler.close() - -if __name__ == '__main__': - - sites = pb.read('sites', filter='activated=True') - wiseflow_logger.info('execute all sites one time') - asyncio.run(main_process([site['url'] for site in sites])) diff --git a/core/windows_run.py b/core/windows_run.py index 8b60f3b..68bb651 100644 --- a/core/windows_run.py +++ b/core/windows_run.py @@ -72,7 +72,6 @@ def main(): print(f"Error: run_task.py not found at: {process_script}") except subprocess.CalledProcessError as e: print(f"Error running run_task.py: {e}") - else: print("Failed to start services") diff --git a/test/get_info_test.py b/test/get_info_test.py index f9a9371..0c5ebed 100644 --- a/test/get_info_test.py +++ b/test/get_info_test.py @@ -10,7 +10,6 @@ core_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'core sys.path.append(core_path) # 现在可以直接导入模块,因为core目录已经在Python路径中 -from scrapers import * from utils.general_utils import is_chinese from agents.get_info import get_author_and_publish_date, get_info, get_more_related_urls from agents.get_info_prompts import * @@ -20,7 +19,7 @@ benchmark_model = 'Qwen/Qwen2.5-72B-Instruct' # models = ['deepseek-reasoner'] models = ['Qwen/Qwen2.5-7B-Instruct', 'Qwen/Qwen2.5-14B-Instruct', 'Qwen/Qwen2.5-32B-Instruct', 'deepseek-ai/DeepSeek-V2.5'] -async def main(sample: dict, include_ap: bool, prompts: list, focus_dict: dict, record_file: str): +async def main(sample: dict, include_ap: bool, prompts: list, record_file: str): link_dict, links_parts, contents = sample['link_dict'], sample['links_part'], sample['contents'] get_link_sys_prompt, get_link_suffix_prompt, get_info_sys_prompt, get_info_suffix_prompt = prompts @@ -91,16 +90,16 @@ if __name__ == '__main__': focus_points = json.load(open(os.path.join(sample_dir, 'focus_point.json'), 'r')) focus_statement = '' for item in focus_points: - tag = item["focuspoint"] - expl = item["explanation"] - focus_statement = f"{focus_statement}//{tag}//\n" + tag = item["focuspoint"].strip() + expl = 
item["explanation"].strip() + focus_statement = f"{focus_statement}//{tag}//" if expl: if is_chinese(expl): - focus_statement = f"{focus_statement}解释:{expl}\n" + focus_statement = f"{focus_statement}\n解释:{expl}\n" else: - focus_statement = f"{focus_statement}Explanation: {expl}\n" + focus_statement = f"{focus_statement}\nExplanation: {expl}\n" - focus_dict = {item["focuspoint"]: item["focuspoint"] for item in focus_points} + #focus_dict = {item["focuspoint"]: item["focuspoint"] for item in focus_points} date_stamp = datetime.now().strftime('%Y-%m-%d') if is_chinese(focus_statement): get_link_sys_prompt = get_link_system.replace('{focus_statement}', focus_statement) @@ -134,4 +133,4 @@ if __name__ == '__main__': with open(record_file, 'a') as f: f.write(f"raw materials: {file}\n\n") print(f'start testing {file}') - asyncio.run(main(sample, include_ap, prompts, focus_dict, record_file)) + asyncio.run(main(sample, include_ap, prompts, record_file)) diff --git a/weixin_mp/__init__.py b/weixin_mp/__init__.py index 955f8e0..77a7d0e 100644 --- a/weixin_mp/__init__.py +++ b/weixin_mp/__init__.py @@ -19,9 +19,7 @@ logging.getLogger("httpx").setLevel(logging.WARNING) from general_process import main_process, wiseflow_logger, pb from typing import Optional -import logging -logging.getLogger("httpx").setLevel(logging.WARNING) # 千万注意扫码登录时不要选择"同步历史消息",否则会造成 bot 上来挨个回复历史消息 # 先检查下 wx 的登录状态,同时获取已登录微信的 wxid