From 1f79cb3c4d5ab95865577029dd32ee00b474e32c Mon Sep 17 00:00:00 2001
From: bigbrother666sh
Date: Sun, 5 Jan 2025 21:54:06 +0800
Subject: [PATCH] v0.3.6fix

---
 .gitignore              |  3 ++-
 core/agents/get_info.py | 12 +++++++-----
 core/general_process.py |  9 ++++++---
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/.gitignore b/.gitignore
index dd42108..42b3d06 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,4 +9,5 @@ __pycache__
 pb/pb_data/
 pb/pocketbase
 core/docker_dir/
-core/work_dir/
\ No newline at end of file
+core/work_dir/
+test/webpage_samples/
\ No newline at end of file
diff --git a/core/agents/get_info.py b/core/agents/get_info.py
index f48f90b..0c5dbf3 100644
--- a/core/agents/get_info.py
+++ b/core/agents/get_info.py
@@ -87,7 +87,7 @@ class GeneralInfoExtractor:
 {focus_statement}\n
 在提炼摘要时,请遵循以下原则:
 - 理解每个关注点的含义以及进一步的解释(如有),确保摘要与关注点强相关并符合解释(如有)的范围
-- 摘要应当详实、充分
+- 摘要应当详实、充分,使用简体中文(如果原文是英文,请翻译成简体中文)
 - 摘要信息务必忠于原文'''
 
         self.get_info_suffix = '''请对关注点逐一生成摘要,不要遗漏任何关注点,如果网页文本与关注点无关,可以对应输出"NA"。输出结果整体用三引号包裹,三引号内不要有其他内容。如下是输出格式示例:
@@ -176,7 +176,8 @@ When performing the association analysis, please follow these principles:
                                              model=self.model, temperature=0.1)
             # self.logger.debug(f"llm output: {result}")
             result = re.findall(r'\"\"\"(.*?)\"\"\"', result, re.DOTALL)
-            cache.add(result[-1])
+            if result:
+                cache.add(result[-1])
             text_batch = ''
 
         if text_batch:
@@ -186,7 +187,8 @@ When performing the association analysis, please follow these principles:
                                              model=self.model, temperature=0.1)
             # self.logger.debug(f"llm output: {result}")
             result = re.findall(r'\"\"\"(.*?)\"\"\"', result, re.DOTALL)
-            cache.add(result[-1])
+            if result:
+                cache.add(result[-1])
         return cache
 
     async def get_more_related_urls(self, link_dict: dict) -> set:
@@ -215,7 +217,7 @@ When performing the association analysis, please follow these principles:
                 if focus not in self.focus_dict or _index not in link_map:
                     self.logger.debug(f"bad generate result: {item}")
                     continue
-                self.logger.debug(f"{link_map[_index]} selected")
+                # self.logger.debug(f"{link_map[_index]} selected")
                 final_result.add(link_map[_index])
         return final_result
 
@@ -223,6 +225,7 @@ When performing the association analysis, please follow these principles:
         raw_result = await self._generate_results(text.split('\n'), 'get_info')
         final = []
         for item in raw_result:
+            self.logger.debug(f"llm output:\n{item}")
             segs = item.split('//')
             i = 0
             while i < len(segs) - 1:
@@ -241,7 +244,6 @@ When performing the association analysis, please follow these principles:
                 """
                 maybe can use embedding retrieval to judge
                 """
-                self.logger.debug(f"get info: {focus}: {content}")
                 url_tags = re.findall(r'\[(Ref_\d+)]', content)
                 refences = {url_tag: text_links[url_tag] for url_tag in url_tags if url_tag in text_links}
 
diff --git a/core/general_process.py b/core/general_process.py
index f79e9f3..372b0c1 100644
--- a/core/general_process.py
+++ b/core/general_process.py
@@ -9,7 +9,9 @@ from custom_fetchings import *
 from urllib.parse import urlparse
 from crawl4ai import AsyncWebCrawler, CacheMode
 from datetime import datetime, timedelta
+import logging
 
+logging.getLogger("httpx").setLevel(logging.WARNING)
 
 project_dir = os.environ.get("PROJECT_DIR", "")
 if project_dir:
@@ -64,8 +66,8 @@ async def main_process(_sites: set | list):
             raw_markdown, metadata_dict, media_dict = custom_scrapers[domain](url)
         else:
             crawl4ai_cache_mode = CacheMode.WRITE_ONLY if url in _sites else CacheMode.ENABLED
-            result = await crawler.arun(url=url, delay_before_return_html=2.0, exclude_social_media_links=True,
-                                        magic=True, scan_full_page=True, remove_overlay_elements=True,
+            result = await crawler.arun(url=url, delay_before_return_html=2.0, wait_until='commit',
+                                        magic=True, scan_full_page=True,
                                         cache_mode=crawl4ai_cache_mode)
             if not result.success:
                 wiseflow_logger.warning(f'{url} failed to crawl, destination web cannot reach, skip')
@@ -122,8 +124,9 @@ async def main_process(_sites: set | list):
                     to_be_replaces[img_url].append("content")
                 else:
                     to_be_replaces[img_url] = ["content"]
-    wiseflow_logger.debug(f'total {len(to_be_replaces)} images to be recognized')
+
     recognized_result = await extract_info_from_img(list(to_be_replaces.keys()), vl_model)
+    wiseflow_logger.debug(f'total {len(recognized_result)} imgs be recognized')
     recognized_img_cache.update({key: value for key, value in recognized_result.items() if value.strip()})
     for img_url, content in recognized_result.items():
         for u in to_be_replaces[img_url]:
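
Note (reviewer sketch, not part of the patch): the central fix in core/agents/get_info.py is guarding the
re.findall() result before indexing, since result[-1] raises IndexError whenever the model reply contains no
triple-quoted block. Below is a minimal, self-contained illustration of that guard pattern; the helper name
extract_quoted_block is hypothetical and does not exist in the repo.

    import re

    def extract_quoted_block(llm_output: str) -> str | None:
        """Return the last triple-quoted block from an LLM reply, or None if absent."""
        blocks = re.findall(r'\"\"\"(.*?)\"\"\"', llm_output, re.DOTALL)
        # Without the emptiness check, blocks[-1] raises IndexError when the
        # model ignores the requested output format.
        return blocks[-1] if blocks else None

    assert extract_quoted_block('output:\n"""summary text"""') == 'summary text'
    assert extract_quoted_block('no fenced block here') is None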