diff --git a/core/windows.env b/core/windows.env
new file mode 100644
index 0000000..2292d30
--- /dev/null
+++ b/core/windows.env
@@ -0,0 +1,9 @@
+LLM_API_KEY=""
+LLM_API_BASE="https://api.openai.com/v1/"
+PRIMARY_MODEL="gpt-4o-2024-11-20"
+VL_MODEL="gpt-4o-2024-11-20"
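+# PB_API_AUTH is the PocketBase superuser login in "email|password" form.
+# Example (placeholder credentials, not a real account):
+#   PB_API_AUTH="admin@example.com|1234567890"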
+PB_API_AUTH="your_email|your_password"
+PROJECT_DIR="work_dir"
\ No newline at end of file
diff --git a/core/windows_general_process.py b/core/windows_general_process.py
new file mode 100644
index 0000000..a2ca827
--- /dev/null
+++ b/core/windows_general_process.py
@@ -0,0 +1,195 @@
+# -*- coding: utf-8 -*-
+# windows_general_process.py
+from pathlib import Path
+from dotenv import load_dotenv
+
+# Load environment variables
+env_path = Path(__file__).parent / 'windows.env'
+if env_path.exists():
+    load_dotenv(env_path)
+
+import os
+from utils.pb_api import PbTalker
+from utils.general_utils import get_logger, extract_and_convert_dates, is_chinese
+from agents.get_info import *
+import json
+import asyncio
+from scrapers import *
+from urllib.parse import urlparse
+from crawl4ai import AsyncWebCrawler, CacheMode
+from datetime import datetime, timedelta
+import logging
+
+logging.getLogger("httpx").setLevel(logging.WARNING)
+
+project_dir = os.environ.get("PROJECT_DIR", "")
+if project_dir:
+    os.makedirs(project_dir, exist_ok=True)
+
+wiseflow_logger = get_logger('general_process', project_dir)
+pb = PbTalker(wiseflow_logger)
+one_month_ago = (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')
+existing_urls = {url['url'] for url in pb.read(collection_name='infos', fields=['url'], filter=f"created>='{one_month_ago}'")}
+
+crawler = AsyncWebCrawler(verbose=False)
+model = os.environ.get("PRIMARY_MODEL", "")
+if not model:
+    raise ValueError("PRIMARY_MODEL not set, please set it in environment variables or edit core/windows.env")
+secondary_model = os.environ.get("SECONDARY_MODEL", model)
+
+async def save_to_pb(url: str, url_title: str, infos: list):
+    # save each extracted info item to the PocketBase 'infos' collection
+    for info in infos:
+        info['url'] = url
+        info['url_title'] = url_title
+        _ = pb.add(collection_name='infos', body=info)
+        if not _:
+            wiseflow_logger.error('add info failed, writing to cache file')
+            timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
+            with open(os.path.join(project_dir, f'{timestamp}_cache_infos.json'), 'w', encoding='utf-8') as f:
+                json.dump(info, f, ensure_ascii=False, indent=4)
+
+
+async def main_process(_sites: set | list):
+    # collect the focus points the user set in the pb database and determine the system prompt language from them
+    focus_data = pb.read(collection_name='focus_points', filter='activated=True')
+    if not focus_data:
+        wiseflow_logger.info('no activated focus point found, will ask user to create one')
+        focus = input('It seems you have not set any focus point. WiseFlow needs a specific focus point to guide '
+                      'the info extraction job, so please input one now. Briefly describe what info you care about: ')
+        explanation = input('Please provide more explanation for the focus point (if not necessary, just press Enter): ')
+        focus_data.append({"focuspoint": focus, "explanation": explanation,
+                           "id": pb.add('focus_points', {"focuspoint": focus, "explanation": explanation})})
+
+
+    focus_dict = {item["focuspoint"]: item["id"] for item in focus_data}
+    focus_statement = ''
+    for item in focus_data:
+        tag = item["focuspoint"]
+        expl = item["explanation"]
+        focus_statement = f"{focus_statement}//{tag}//\n"
+        if expl:
+            if is_chinese(expl):
+                focus_statement = f"{focus_statement}解释:{expl}\n"
+            else:
+                focus_statement = f"{focus_statement}Explanation: {expl}\n"
+
+    date_stamp = datetime.now().strftime('%Y-%m-%d')
+    if is_chinese(focus_statement):
+        get_link_sys_prompt = get_link_system.replace('{focus_statement}', focus_statement)
+        get_link_sys_prompt = f"今天的日期是{date_stamp},{get_link_sys_prompt}"
+        get_link_suffix_prompt = get_link_suffix
+        get_info_sys_prompt = get_info_system.replace('{focus_statement}', focus_statement)
+        get_info_sys_prompt = f"今天的日期是{date_stamp},{get_info_sys_prompt}"
+        get_info_suffix_prompt = get_info_suffix
+    else:
+        get_link_sys_prompt = get_link_system_en.replace('{focus_statement}', focus_statement)
+        get_link_sys_prompt = f"today is {date_stamp}, {get_link_sys_prompt}"
+        get_link_suffix_prompt = get_link_suffix_en
+        get_info_sys_prompt = get_info_system_en.replace('{focus_statement}', focus_statement)
+        get_info_sys_prompt = f"today is {date_stamp}, {get_info_sys_prompt}"
+        get_info_suffix_prompt = get_info_suffix_en
+
+    recognized_img_cache = {}
+    working_list = set()
+    working_list.update(_sites)
+    await crawler.start()
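+    # Breadth-first crawl: pop a URL from the working list, fetch it with
+    # crawl4ai, extract focus-relevant info via the LLM, and push newly
+    # discovered related links (minus already-seen URLs) back onto the list.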
f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}" + + if not author: + author = metadata_dict.get('author', '') + if not publish_date: + publish_date = metadata_dict.get('publish_date', '') + + link_dict, links_parts, contents, recognized_img_cache = await pre_process(raw_markdown, base_url, used_img, recognized_img_cache, existing_urls) + + if link_dict and links_parts: + prompts = [get_link_sys_prompt, get_link_suffix_prompt, secondary_model] + links_texts = [] + for _parts in links_parts: + links_texts.extend(_parts.split('\n\n')) + more_url = await get_more_related_urls(links_texts, link_dict, prompts, _logger=wiseflow_logger) + if more_url: + wiseflow_logger.debug(f'get {len(more_url)} more related urls, will add to working list') + working_list.update(more_url - existing_urls) + + if not contents: + continue + + if not author or author.lower() == 'na' or not publish_date or publish_date.lower() == 'na': + author, publish_date = await get_author_and_publish_date(raw_markdown, model, _logger=wiseflow_logger) + + if not author or author.lower() == 'na': + author = parsed_url.netloc + + if publish_date: + publish_date = extract_and_convert_dates(publish_date) + else: + publish_date = date_stamp + + prompts = [get_info_sys_prompt, get_info_suffix_prompt, model] + infos = await get_info(contents, link_dict, prompts, focus_dict, author, publish_date, _logger=wiseflow_logger) + if infos: + wiseflow_logger.debug(f'get {len(infos)} infos, will save to pb') + await save_to_pb(url, title, infos) + await crawler.close() + +if __name__ == '__main__': + + sites = pb.read('sites', filter='activated=True') + wiseflow_logger.info('execute all sites one time') + asyncio.run(main_process([site['url'] for site in sites])) diff --git a/core/windows_run.py b/core/windows_run.py new file mode 100644 index 0000000..a3f9d89 --- /dev/null +++ b/core/windows_run.py @@ -0,0 +1,86 @@ +import os +import sys +import subprocess +import socket +import psutil +from pathlib import Path +from dotenv import load_dotenv + +#檢查指定端口是否被使用 +def is_port_in_use(port): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + try: + s.bind(('127.0.0.1', port)) + return False + except socket.error: + return True + +#檢查指定進程是否在運行 +def is_process_running(process_name): + for proc in psutil.process_iter(['name']): + try: + if process_name.lower() in proc.info['name'].lower(): + return True + except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess): + pass + return False + +#啟動 PocketBase 服務 +def start_pocketbase(): + try: + # 檢查 PocketBase 是否已在運行 + if is_process_running('pocketbase'): + print("PocketBase is already running.") + return True + + # 檢查端口是否被佔用 + if is_port_in_use(8090): + print("Port 8090 is already in use.") + return False + + # 構建 PocketBase 路徑 + current_dir = Path(__file__).parent + pb_path = current_dir.parent / 'pb' / 'pocketbase.exe' # Windows 使用 .exe + + if not pb_path.exists(): + print(f"PocketBase executable not found at: {pb_path}") + return False + + # 啟動 PocketBase + print("Starting PocketBase...") + subprocess.Popen([ + str(pb_path), + 'serve', + '--http=127.0.0.1:8090' + ], + creationflags=subprocess.CREATE_NEW_CONSOLE) # Windows 特定標誌 + return True + + except Exception as e: + print(f"Error starting PocketBase: {e}") + return False + +def main(): + # 載入環境變數 + env_path = Path(__file__).parent / 'windows.env' + if env_path.exists(): + load_dotenv(env_path) + else: + print("Warning: .env file not found") + + # 啟動 PocketBase + if start_pocketbase(): + # 運行 Python 處理腳本 + try: + 
+def main():
+    # Load environment variables
+    env_path = Path(__file__).parent / 'windows.env'
+    if env_path.exists():
+        load_dotenv(env_path)
+    else:
+        print("Warning: windows.env file not found")
+
+    # Start PocketBase
+    if start_pocketbase():
+        # Run the Python processing script
+        try:
+            process_script = Path(__file__).parent / 'windows_general_process.py'
+            if process_script.exists():
+                subprocess.run([sys.executable, str(process_script)], check=True)
+            else:
+                print(f"Error: windows_general_process.py not found at: {process_script}")
+        except subprocess.CalledProcessError as e:
+            print(f"Error running windows_general_process.py: {e}")
+    else:
+        print("Failed to start services")
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file