From 1f79cb3c4d5ab95865577029dd32ee00b474e32c Mon Sep 17 00:00:00 2001
From: bigbrother666sh
Date: Sun, 5 Jan 2025 21:54:06 +0800
Subject: [PATCH] v0.3.6fix

---
 .gitignore              |  3 ++-
 core/agents/get_info.py | 12 +++++++-----
 core/general_process.py |  9 ++++++---
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/.gitignore b/.gitignore
index dd42108..42b3d06 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,4 +9,5 @@ __pycache__
 pb/pb_data/
 pb/pocketbase
 core/docker_dir/
-core/work_dir/
\ No newline at end of file
+core/work_dir/
+test/webpage_samples/
\ No newline at end of file
diff --git a/core/agents/get_info.py b/core/agents/get_info.py
index f48f90b..0c5dbf3 100644
--- a/core/agents/get_info.py
+++ b/core/agents/get_info.py
@@ -87,7 +87,7 @@ class GeneralInfoExtractor:
 {focus_statement}\n
 在提炼摘要时,请遵循以下原则:
 - 理解每个关注点的含义以及进一步的解释(如有),确保摘要与关注点强相关并符合解释(如有)的范围
-- 摘要应当详实、充分
+- 摘要应当详实、充分,使用简体中文(如果原文是英文,请翻译成简体中文)
 - 摘要信息务必忠于原文'''
 
         self.get_info_suffix = '''请对关注点逐一生成摘要,不要遗漏任何关注点,如果网页文本与关注点无关,可以对应输出"NA"。输出结果整体用三引号包裹,三引号内不要有其他内容。如下是输出格式示例:
@@ -176,7 +176,8 @@ When performing the association analysis, please follow these principles:
                                              model=self.model, temperature=0.1)
             # self.logger.debug(f"llm output: {result}")
             result = re.findall(r'\"\"\"(.*?)\"\"\"', result, re.DOTALL)
-            cache.add(result[-1])
+            if result:
+                cache.add(result[-1])
             text_batch = ''
 
         if text_batch:
@@ -186,7 +187,8 @@ When performing the association analysis, please follow these principles:
                                              model=self.model, temperature=0.1)
             # self.logger.debug(f"llm output: {result}")
             result = re.findall(r'\"\"\"(.*?)\"\"\"', result, re.DOTALL)
-            cache.add(result[-1])
+            if result:
+                cache.add(result[-1])
         return cache
 
     async def get_more_related_urls(self, link_dict: dict) -> set:
@@ -215,7 +217,7 @@ When performing the association analysis, please follow these principles:
                 if focus not in self.focus_dict or _index not in link_map:
                     self.logger.debug(f"bad generate result: {item}")
                     continue
-                self.logger.debug(f"{link_map[_index]} selected")
+                # self.logger.debug(f"{link_map[_index]} selected")
                 final_result.add(link_map[_index])
         return final_result
 
@@ -223,6 +225,7 @@ When performing the association analysis, please follow these principles:
         raw_result = await self._generate_results(text.split('\n'), 'get_info')
         final = []
         for item in raw_result:
+            self.logger.debug(f"llm output:\n{item}")
             segs = item.split('//')
             i = 0
             while i < len(segs) - 1:
@@ -241,7 +244,6 @@ When performing the association analysis, please follow these principles:
                 """
                 maybe can use embedding retrieval to judge
                 """
-                self.logger.debug(f"get info: {focus}: {content}")
                 url_tags = re.findall(r'\[(Ref_\d+)]', content)
                 refences = {url_tag: text_links[url_tag] for url_tag in url_tags if url_tag in text_links}
 
diff --git a/core/general_process.py b/core/general_process.py
index f79e9f3..372b0c1 100644
--- a/core/general_process.py
+++ b/core/general_process.py
@@ -9,7 +9,9 @@ from custom_fetchings import *
 from urllib.parse import urlparse
 from crawl4ai import AsyncWebCrawler, CacheMode
 from datetime import datetime, timedelta
+import logging
 
+logging.getLogger("httpx").setLevel(logging.WARNING)
 
 project_dir = os.environ.get("PROJECT_DIR", "")
 if project_dir:
@@ -64,8 +66,8 @@ async def main_process(_sites: set | list):
             raw_markdown, metadata_dict, media_dict = custom_scrapers[domain](url)
         else:
             crawl4ai_cache_mode = CacheMode.WRITE_ONLY if url in _sites else CacheMode.ENABLED
-            result = await crawler.arun(url=url, delay_before_return_html=2.0, exclude_social_media_links=True,
-                                        magic=True, scan_full_page=True, remove_overlay_elements=True,
+            result = await crawler.arun(url=url, delay_before_return_html=2.0, wait_until='commit',
+                                        magic=True, scan_full_page=True,
                                         cache_mode=crawl4ai_cache_mode)
             if not result.success:
                 wiseflow_logger.warning(f'{url} failed to crawl, destination web cannot reach, skip')
@@ -122,8 +124,9 @@ async def main_process(_sites: set | list):
                     to_be_replaces[img_url].append("content")
                 else:
                     to_be_replaces[img_url] = ["content"]
-    wiseflow_logger.debug(f'total {len(to_be_replaces)} images to be recognized')
+
     recognized_result = await extract_info_from_img(list(to_be_replaces.keys()), vl_model)
+    wiseflow_logger.debug(f'total {len(recognized_result)} imgs be recognized')
     recognized_img_cache.update({key: value for key, value in recognized_result.items() if value.strip()})
     for img_url, content in recognized_result.items():
         for u in to_be_replaces[img_url]:
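
Note (reviewer sketch, not part of the patch): the central fix in core/agents/get_info.py is guarding the
re.findall() result before indexing, since result[-1] raises IndexError whenever the model reply contains no
triple-quoted block. Below is a minimal, self-contained illustration of that guard pattern; the helper name
extract_quoted_block is hypothetical and does not exist in the repo.

    import re

    def extract_quoted_block(llm_output: str) -> str | None:
        """Return the last triple-quoted block from an LLM reply, or None if absent."""
        blocks = re.findall(r'\"\"\"(.*?)\"\"\"', llm_output, re.DOTALL)
        # Without the emptiness check, blocks[-1] raises IndexError when the
        # model ignores the requested output format.
        return blocks[-1] if blocks else None

    assert extract_quoted_block('output:\n"""summary text"""') == 'summary text'
    assert extract_quoted_block('no fenced block here') is None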