V0.3.8 readme

bigbrother666sh 2025-01-24 21:42:52 +08:00
parent b1ef7a23d1
commit fdfe5c0ec8
10 changed files with 10 additions and 228 deletions

View File

@ -35,11 +35,11 @@
Provided a custom extractor interface so users can tailor extraction to their actual needs (see the sketch after this changelog excerpt).
- Bug fixes and other improvements (crawl4ai browser lifecycle management, asynchronous llm wrapper, etc.) (thanks to @tusik for contributing the asynchronous llm wrapper)
- Bug fixes and other improvements (crawl4ai browser lifecycle management, asynchronous llm wrapper, etc.) (thanks to @tusik for contributing)
Bug fixes and other improvements (crawl4ai browser lifecycle management, asynchronous llm wrapper, etc.)
Thanks to @tusik for contributing the asynchronous LLM wrapper
Thanks to @tusik for contributing
# V0.3.6
- Switched to Crawl4ai as the underlying crawler framework. In practice, Crawl4ai and Crawlee fetch pages about equally well, and both are built on Playwright, but Crawl4ai's html2markdown feature is very practical and helps a lot with LLM information extraction; Crawl4ai's architecture also fits my approach better;
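
The custom extractor interface mentioned in the changelog excerpt above is consumed in `general_process.py` (the deleted copy appears further down this commit) through the `custom_scrapers` mapping, which is keyed by domain and must return pre-parsed fields. Below is a minimal, hypothetical sketch of such an extractor; `ScraperResult`, `example_com_scraper`, and `example.com` are illustrative names, not repo code:

```python
from dataclasses import dataclass, field

@dataclass
class ScraperResult:
    # the fields general_process.py reads back from a custom scraper
    content: str = ''                       # markdown produced by the extractor
    images: list = field(default_factory=list)
    title: str = ''
    base: str = ''
    author: str = ''
    publish_date: str = ''

def example_com_scraper(crawl_result) -> ScraperResult:
    # turn a raw crawl4ai result into the fields the pipeline expects
    return ScraperResult(content=crawl_result.markdown or '', title='Example page')

# registration keyed by domain, mirroring the lookup in general_process.py
custom_scrapers = {'example.com': example_com_scraper}
```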

View File

@ -89,10 +89,6 @@ the wiseflow 0.3.x releases use pocketbase as the database; of course, you can also manually
🌟 **This is different from previous versions** - starting from V0.3.5, the .env file needs to be placed in the [core](./core) folder.
**Windows users can refer to the windows.env and windows_run.py files in the core folder and execute the windows_run.py script**
Thanks to @c469591 for contributing the native Windows startup script
#### 3.1 Large Language Model Configuration
wiseflow is an LLM-native application, so please make sure to provide the program with a stable LLM service.

View File

@ -88,10 +88,6 @@ For details, please refer to [pb/README.md](/pb/README.md)
🌟 **This is different from previous versions** - starting from V0.3.5, the .env file needs to be placed in the [core](./core) folder.
**Windows users can refer to the windows.env and windows_run.py files in the core folder and execute the windows_run.py script**
Thanks to @c469591 for contributing the native Windows startup script
#### 3.1 Large Language Model Configuration
Wiseflow is an LLM-native application, so please ensure you provide a stable LLM service for the program.
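
For reference, a minimal sketch (not repo code) that verifies the LLM-related settings wiseflow reads from `core/.env`; the keys mirror the `windows.env` template shown later in this commit:

```python
import os
from dotenv import load_dotenv

# adjust the path to wherever your .env actually lives
load_dotenv("core/.env")

for key in ("LLM_API_KEY", "LLM_API_BASE", "PRIMARY_MODEL", "PB_API_AUTH"):
    if not os.environ.get(key):
        print(f"warning: {key} is not set; LLM or pocketbase calls will fail")
```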

View File

@ -89,10 +89,6 @@ Wiseflow 0.3.x uses pocketbase as the database
🌟 **This is different from previous versions** - from V0.3.5 onward, the .env file must be placed in the [core](./core) folder.
**Windows users should refer to the windows.env and windows_run.py files in the core folder and run the windows_run.py script**
Thanks to @c469591 for contributing the native Windows startup script
#### 3.1 Large Language Model Configuration
Wiseflow is an LLM-native application, so please make sure to provide the program with a stable LLM service.

View File

@ -88,10 +88,6 @@ Wiseflow 0.3.x uses pocketbase as the database. pocketbas
🌟 **This is different from previous versions** - starting from V0.3.5, the .env file must be located in the [core](./core) folder.
**Windows users can refer to the windows.env and windows_run.py files in the core folder and run the windows_run.py script**
Thanks to @c469591 for contributing the native Windows startup script
#### 3.1 Large Language Model Configuration
Wiseflow is an LLM-native application, so please provide the program with a stable LLM service.

View File

@ -1,6 +0,0 @@
LLM_API_KEY=""
LLM_API_BASE="https://api.openai.com/v1/"
PRIMARY_MODEL="gpt-4o-2024-11-20"
VL_MODEL="gpt-4o-2024-11-20"
PB_API_AUTH="your_email|your_password"
PROJECT_DIR="work_dir"

View File

@ -1,192 +0,0 @@
# -*- coding: utf-8 -*-
# general_process.py
from pathlib import Path
from dotenv import load_dotenv
# Load environment variables from windows.env if it sits next to this file
env_path = Path(__file__).parent / 'windows.env'
if env_path.exists():
    load_dotenv(env_path)

import os
from utils.pb_api import PbTalker
from utils.general_utils import get_logger, extract_and_convert_dates, is_chinese
from agents.get_info import *
import json
import asyncio
from scrapers import *
from urllib.parse import urlparse
from crawl4ai import AsyncWebCrawler, CacheMode
from datetime import datetime, timedelta
import logging

logging.getLogger("httpx").setLevel(logging.WARNING)

project_dir = os.environ.get("PROJECT_DIR", "")
if project_dir:
    os.makedirs(project_dir, exist_ok=True)

wiseflow_logger = get_logger('general_process', project_dir)
pb = PbTalker(wiseflow_logger)

# only urls already saved within the last month count as duplicates
one_month_ago = (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')
existing_urls = {url['url'] for url in pb.read(collection_name='infos', fields=['url'], filter=f"created>='{one_month_ago}'")}

crawler = AsyncWebCrawler(verbose=False)
model = os.environ.get("PRIMARY_MODEL", "")
if not model:
    raise ValueError("PRIMARY_MODEL not set, please set it in environment variables or edit core/.env")
secondary_model = os.environ.get("SECONDARY_MODEL", model)


async def save_to_pb(url: str, url_title: str, infos: list):
    # saving to pb process
    for info in infos:
        info['url'] = url
        info['url_title'] = url_title
        _ = pb.add(collection_name='infos', body=info)
        if not _:
            wiseflow_logger.error('add info failed, writing to cache_file')
            timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
            with open(os.path.join(project_dir, f'{timestamp}_cache_infos.json'), 'w', encoding='utf-8') as f:
                json.dump(info, f, ensure_ascii=False, indent=4)


async def main_process(_sites: set | list):
    # collect the focus points the user set in the pb database and determine the system prompt language from them
    focus_data = pb.read(collection_name='focus_points', filter=f'activated=True')
    if not focus_data:
        wiseflow_logger.info('no activated tag found, will ask user to create one')
        focus = input('It seems you have not set any focus point, WiseFlow need the specific focus point to guide the following info extract job.'
                      'so please input one now. describe what info you care about shortly: ')
        explanation = input('Please provide more explanation for the focus point (if not necessary, pls just press enter: ')
        focus_data.append({"focuspoint": focus, "explanation": explanation,
                           "id": pb.add('focus_points', {"focuspoint": focus, "explanation": explanation})})

    focus_dict = {item["focuspoint"]: item["id"] for item in focus_data}
    focus_statement = ''
    for item in focus_data:
        tag = item["focuspoint"]
        expl = item["explanation"]
        focus_statement = f"{focus_statement}//{tag}//\n"
        if expl:
            if is_chinese(expl):
                focus_statement = f"{focus_statement}解释:{expl}\n"
            else:
                focus_statement = f"{focus_statement}Explanation: {expl}\n"

    date_stamp = datetime.now().strftime('%Y-%m-%d')
    if is_chinese(focus_statement):
        get_link_sys_prompt = get_link_system.replace('{focus_statement}', focus_statement)
        get_link_sys_prompt = f"今天的日期是{date_stamp}，{get_link_sys_prompt}"
        get_link_suffix_prompt = get_link_suffix
        get_info_sys_prompt = get_info_system.replace('{focus_statement}', focus_statement)
        get_info_sys_prompt = f"今天的日期是{date_stamp}，{get_info_sys_prompt}"
        get_info_suffix_prompt = get_info_suffix
    else:
        get_link_sys_prompt = get_link_system_en.replace('{focus_statement}', focus_statement)
        get_link_sys_prompt = f"today is {date_stamp}, {get_link_sys_prompt}"
        get_link_suffix_prompt = get_link_suffix_en
        get_info_sys_prompt = get_info_system_en.replace('{focus_statement}', focus_statement)
        get_info_sys_prompt = f"today is {date_stamp}, {get_info_sys_prompt}"
        get_info_suffix_prompt = get_info_suffix_en

    recognized_img_cache = {}
    working_list = set()
    working_list.update(_sites)
    await crawler.start()
    # crawl loop: pop urls from the working list; related urls discovered along the way are added back
    while working_list:
        url = working_list.pop()
        existing_urls.add(url)
        wiseflow_logger.debug(f'process new url, still {len(working_list)} urls in working list')
        has_common_ext = any(url.lower().endswith(ext) for ext in common_file_exts)
        if has_common_ext:
            wiseflow_logger.debug(f'{url} is a common file, skip')
            continue

        parsed_url = urlparse(url)
        existing_urls.add(f"{parsed_url.scheme}://{parsed_url.netloc}")
        existing_urls.add(f"{parsed_url.scheme}://{parsed_url.netloc}/")
        domain = parsed_url.netloc
        if domain in custom_fetching_configs:
            wiseflow_logger.debug(f'{url} will using custom crawl4ai run config')
            run_config = custom_fetching_configs[domain]
        else:
            run_config = crawler_config

        run_config.cache_mode = CacheMode.WRITE_ONLY if url in _sites else CacheMode.ENABLED
        result = await crawler.arun(url=url, config=run_config)
        if not result.success:
            wiseflow_logger.warning(f'{url} failed to crawl, destination web cannot reach, skip')
            continue
        metadata_dict = result.metadata if result.metadata else {}

        # domain-specific scrapers return pre-parsed fields; otherwise fall back to crawl4ai markdown + metadata
        if domain in custom_scrapers:
            result = custom_scrapers[domain](result)
            raw_markdown = result.content
            used_img = result.images
            title = result.title
            base_url = result.base
            author = result.author
            publish_date = result.publish_date
        else:
            raw_markdown = result.markdown
            media_dict = result.media if result.media else {}
            used_img = [d['src'] for d in media_dict.get('images', [])]
            title = ''
            base_url = ''
            author = ''
            publish_date = ''
        if not raw_markdown:
            wiseflow_logger.warning(f'{url} no content, something during fetching failed, skip')
            continue

        if not title:
            title = metadata_dict.get('title', '')
        if not base_url:
            base_url = metadata_dict.get('base', '')
        if not base_url:
            base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
        if not author:
            author = metadata_dict.get('author', '')
        if not publish_date:
            publish_date = metadata_dict.get('publish_date', '')

        link_dict, links_parts, contents, recognized_img_cache = await pre_process(raw_markdown, base_url, used_img, recognized_img_cache, existing_urls)

        if link_dict and links_parts:
            prompts = [get_link_sys_prompt, get_link_suffix_prompt, secondary_model]
            links_texts = []
            for _parts in links_parts:
                links_texts.extend(_parts.split('\n\n'))
            more_url = await get_more_related_urls(links_texts, link_dict, prompts, _logger=wiseflow_logger)
            if more_url:
                wiseflow_logger.debug(f'get {len(more_url)} more related urls, will add to working list')
                working_list.update(more_url - existing_urls)

        if not contents:
            continue

        if not author or author.lower() == 'na' or not publish_date or publish_date.lower() == 'na':
            author, publish_date = await get_author_and_publish_date(raw_markdown, model, _logger=wiseflow_logger)
        if not author or author.lower() == 'na':
            author = parsed_url.netloc
        if publish_date:
            publish_date = extract_and_convert_dates(publish_date)
        else:
            publish_date = date_stamp

        prompts = [get_info_sys_prompt, get_info_suffix_prompt, model]
        infos = await get_info(contents, link_dict, prompts, focus_dict, author, publish_date, _logger=wiseflow_logger)
        if infos:
            wiseflow_logger.debug(f'get {len(infos)} infos, will save to pb')
            await save_to_pb(url, title, infos)
    await crawler.close()


if __name__ == '__main__':
    sites = pb.read('sites', filter='activated=True')
    wiseflow_logger.info('execute all sites one time')
    asyncio.run(main_process([site['url'] for site in sites]))
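
The module above is what the scheduled entry point drives; below is a hypothetical periodic driver (not the repo's actual `run_task.py`), built only from the imports and the `pb.read('sites', ...)` call shown above:

```python
import asyncio
from general_process import main_process, pb, wiseflow_logger

async def schedule_pipeline(interval_hours: int = 4):
    # re-run the whole pipeline over the activated sites at a fixed interval
    while True:
        sites = pb.read('sites', filter='activated=True')
        wiseflow_logger.info(f'task execute loop, {len(sites)} sites')
        await main_process([site['url'] for site in sites])
        await asyncio.sleep(interval_hours * 3600)

if __name__ == '__main__':
    asyncio.run(schedule_pipeline())
```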

View File

@ -72,7 +72,6 @@ def main():
print(f"Error: run_task.py not found at: {process_script}")
except subprocess.CalledProcessError as e:
print(f"Error running run_task.py: {e}")
else:
print("Failed to start services")

View File

@ -10,7 +10,6 @@ core_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'core
sys.path.append(core_path)
# Modules can now be imported directly because the core directory is already on the Python path
from scrapers import *
from utils.general_utils import is_chinese
from agents.get_info import get_author_and_publish_date, get_info, get_more_related_urls
from agents.get_info_prompts import *
@ -20,7 +19,7 @@ benchmark_model = 'Qwen/Qwen2.5-72B-Instruct'
# models = ['deepseek-reasoner']
models = ['Qwen/Qwen2.5-7B-Instruct', 'Qwen/Qwen2.5-14B-Instruct', 'Qwen/Qwen2.5-32B-Instruct', 'deepseek-ai/DeepSeek-V2.5']
async def main(sample: dict, include_ap: bool, prompts: list, focus_dict: dict, record_file: str):
async def main(sample: dict, include_ap: bool, prompts: list, record_file: str):
    link_dict, links_parts, contents = sample['link_dict'], sample['links_part'], sample['contents']
    get_link_sys_prompt, get_link_suffix_prompt, get_info_sys_prompt, get_info_suffix_prompt = prompts
@ -91,16 +90,16 @@ if __name__ == '__main__':
    focus_points = json.load(open(os.path.join(sample_dir, 'focus_point.json'), 'r'))
    focus_statement = ''
    for item in focus_points:
        tag = item["focuspoint"]
        expl = item["explanation"]
        focus_statement = f"{focus_statement}//{tag}//\n"
        tag = item["focuspoint"].strip()
        expl = item["explanation"].strip()
        focus_statement = f"{focus_statement}//{tag}//"
        if expl:
            if is_chinese(expl):
                focus_statement = f"{focus_statement}解释:{expl}\n"
                focus_statement = f"{focus_statement}\n解释:{expl}\n"
            else:
                focus_statement = f"{focus_statement}Explanation: {expl}\n"
                focus_statement = f"{focus_statement}\nExplanation: {expl}\n"
    focus_dict = {item["focuspoint"]: item["focuspoint"] for item in focus_points}
    #focus_dict = {item["focuspoint"]: item["focuspoint"] for item in focus_points}
    date_stamp = datetime.now().strftime('%Y-%m-%d')
    if is_chinese(focus_statement):
        get_link_sys_prompt = get_link_system.replace('{focus_statement}', focus_statement)
@ -134,4 +133,4 @@ if __name__ == '__main__':
        with open(record_file, 'a') as f:
            f.write(f"raw materials: {file}\n\n")
        print(f'start testing {file}')
        asyncio.run(main(sample, include_ap, prompts, focus_dict, record_file))
        asyncio.run(main(sample, include_ap, prompts, record_file))
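
To see what the reworked `focus_statement` looks like, here is a standalone illustration (not repo code) with one hypothetical focus point; the Chinese (`is_chinese`) branch is omitted:

```python
focus_points = [{"focuspoint": " AI funding news ", "explanation": " rounds above $10M "}]

focus_statement = ''
for item in focus_points:
    tag = item["focuspoint"].strip()
    expl = item["explanation"].strip()
    focus_statement = f"{focus_statement}//{tag}//"
    if expl:
        focus_statement = f"{focus_statement}\nExplanation: {expl}\n"

print(focus_statement)
# //AI funding news//
# Explanation: rounds above $10M
```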

View File

@ -19,9 +19,7 @@ logging.getLogger("httpx").setLevel(logging.WARNING)
from general_process import main_process, wiseflow_logger, pb
from typing import Optional
import logging
logging.getLogger("httpx").setLevel(logging.WARNING)
# Important: when scanning the QR code to log in, do NOT select "同步历史消息" (sync history messages), otherwise the bot will start replying to every historical message one by one
# First check the wx login status and, at the same time, get the wxid of the logged-in WeChat account