v0.3.6fix

Author: bigbrother666sh, 2025-01-05 21:54:06 +08:00
Commit: 1f79cb3c4d (parent: 35fbff0f27)
3 changed files with 15 additions and 9 deletions

File 1 of 3: .gitignore (1 change)

@@ -10,3 +10,4 @@ pb/pb_data/
 pb/pocketbase
 core/docker_dir/
 core/work_dir/
+test/webpage_samples/

File 2 of 3:

@@ -87,7 +87,7 @@ class GeneralInfoExtractor:
 {focus_statement}\n
 在提炼摘要时请遵循以下原则：
 - 理解每个关注点的含义以及进一步的解释（如有），确保摘要与关注点强相关并符合解释（如有）的范围
-- 摘要应当详实、充分
+- 摘要应当详实、充分，使用简体中文（如果原文是英文，请翻译成简体中文）
 - 摘要信息务必忠于原文'''
 
         self.get_info_suffix = '''请对关注点逐一生成摘要，不要遗漏任何关注点，如果网页文本与关注点无关，可以对应输出"NA"。输出结果整体用三引号包裹，三引号内不要有其他内容。如下是输出格式示例：
@@ -176,7 +176,8 @@ When performing the association analysis, please follow these principles:
                                                    model=self.model, temperature=0.1)
                 # self.logger.debug(f"llm output: {result}")
                 result = re.findall(r'\"\"\"(.*?)\"\"\"', result, re.DOTALL)
-                cache.add(result[-1])
+                if result:
+                    cache.add(result[-1])
                 text_batch = ''
 
         if text_batch:
@@ -186,7 +187,8 @@ When performing the association analysis, please follow these principles:
                                                model=self.model, temperature=0.1)
             # self.logger.debug(f"llm output: {result}")
             result = re.findall(r'\"\"\"(.*?)\"\"\"', result, re.DOTALL)
-            cache.add(result[-1])
+            if result:
+                cache.add(result[-1])
         return cache
 
     async def get_more_related_urls(self, link_dict: dict) -> set:
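
Both hunks above add the same guard: `re.findall` returns an empty list when the model's reply is not wrapped in triple quotes, so the old `result[-1]` raised an `IndexError`. A minimal, self-contained sketch of the guarded pattern (the sample replies are invented for illustration, not taken from the repo):

```python
import re

def last_quoted_block(llm_reply: str) -> str | None:
    """Return the last triple-quoted block in an LLM reply, or None if there is none."""
    # Same pattern as in the diff: non-greedy match between literal \"\"\" markers.
    blocks = re.findall(r'\"\"\"(.*?)\"\"\"', llm_reply, re.DOTALL)
    if blocks:  # the guard added by this commit: findall may return []
        return blocks[-1]
    return None

print(last_quoted_block('"""focus A // summary text\nfocus B // NA"""'))  # inner text
print(last_quoted_block('model ignored the requested format'))            # None, no crash
```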
@@ -215,7 +217,7 @@ When performing the association analysis, please follow these principles:
                 if focus not in self.focus_dict or _index not in link_map:
                     self.logger.debug(f"bad generate result: {item}")
                     continue
-                self.logger.debug(f"{link_map[_index]} selected")
+                # self.logger.debug(f"{link_map[_index]} selected")
                 final_result.add(link_map[_index])
 
         return final_result
@@ -223,6 +225,7 @@ When performing the association analysis, please follow these principles:
         raw_result = await self._generate_results(text.split('\n'), 'get_info')
         final = []
         for item in raw_result:
+            self.logger.debug(f"llm output:\n{item}")
             segs = item.split('//')
             i = 0
             while i < len(segs) - 1:
@@ -241,7 +244,6 @@ When performing the association analysis, please follow these principles:
         """
         maybe can use embedding retrieval to judge
         """
-        self.logger.debug(f"get info: {focus}: {content}")
         url_tags = re.findall(r'\[(Ref_\d+)]', content)
         refences = {url_tag: text_links[url_tag] for url_tag in url_tags if url_tag in text_links}
 
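
The last two context lines show how extracted info is tied back to its sources: `[Ref_N]` tags in the generated content are looked up in `text_links` to build the `refences` dict (spelling as in the repo). A small standalone illustration of that lookup, with invented sample data:

```python
import re

# Invented sample data; in the repo, text_links is built while preprocessing the page text.
text_links = {"Ref_1": "https://example.com/a", "Ref_3": "https://example.com/c"}
content = "Key finding [Ref_1], plus an unsupported claim [Ref_2]."

url_tags = re.findall(r'\[(Ref_\d+)]', content)
# Keep only tags that resolve to a known link, as the dict comprehension in the diff does.
references = {tag: text_links[tag] for tag in url_tags if tag in text_links}
print(references)  # {'Ref_1': 'https://example.com/a'}; Ref_2 is silently dropped
```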

File 3 of 3:

@@ -9,7 +9,9 @@ from custom_fetchings import *
 from urllib.parse import urlparse
 from crawl4ai import AsyncWebCrawler, CacheMode
 from datetime import datetime, timedelta
+import logging
+logging.getLogger("httpx").setLevel(logging.WARNING)
 
 project_dir = os.environ.get("PROJECT_DIR", "")
 if project_dir:
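
The two added lines raise the log level of the `httpx` library logger so its per-request INFO lines (for example, from the OpenAI-compatible LLM client, which is built on httpx) no longer flood the output, while the project's own loggers are untouched. A minimal sketch of the effect; the demo logger names and messages are illustrative:

```python
import logging

logging.basicConfig(level=logging.INFO)
# Only the httpx library logger is raised to WARNING; other loggers keep the root level.
logging.getLogger("httpx").setLevel(logging.WARNING)

logging.getLogger("httpx").info("HTTP Request: GET https://example.com 200 OK")  # suppressed
logging.getLogger("wiseflow_demo").info("application logging still visible")     # printed
```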
@@ -64,8 +66,8 @@ async def main_process(_sites: set | list):
                 raw_markdown, metadata_dict, media_dict = custom_scrapers[domain](url)
             else:
                 crawl4ai_cache_mode = CacheMode.WRITE_ONLY if url in _sites else CacheMode.ENABLED
-                result = await crawler.arun(url=url, delay_before_return_html=2.0, exclude_social_media_links=True,
-                                            magic=True, scan_full_page=True, remove_overlay_elements=True,
+                result = await crawler.arun(url=url, delay_before_return_html=2.0, wait_until='commit',
+                                            magic=True, scan_full_page=True,
                                             cache_mode=crawl4ai_cache_mode)
                 if not result.success:
                     wiseflow_logger.warning(f'{url} failed to crawl, destination web cannot reach, skip')
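
The `crawler.arun` call is retuned here: `wait_until='commit'` returns as soon as navigation is committed rather than waiting for the full load event, and the `exclude_social_media_links` / `remove_overlay_elements` options are dropped while `magic` and `scan_full_page` are kept. A hedged sketch of the same call shape in isolation (placeholder URL; on newer crawl4ai releases these per-call keywords move into `CrawlerRunConfig`, but the direct-kwargs form mirrors the version used in this diff):

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode

async def fetch_markdown(url: str):
    async with AsyncWebCrawler() as crawler:
        # Mirrors the tuned call in the diff: stop at the 'commit' navigation point,
        # keep heuristic cleanup (magic) and full-page scrolling, and use the cache.
        result = await crawler.arun(url=url, delay_before_return_html=2.0, wait_until='commit',
                                    magic=True, scan_full_page=True,
                                    cache_mode=CacheMode.ENABLED)
        if not result.success:
            print(f'{url} failed to crawl, skip')
            return None
        return result.markdown

if __name__ == '__main__':
    print(asyncio.run(fetch_markdown('https://example.com')))  # placeholder URL
```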
@@ -122,8 +124,9 @@ async def main_process(_sites: set | list):
                     to_be_replaces[img_url].append("content")
                 else:
                     to_be_replaces[img_url] = ["content"]
+        wiseflow_logger.debug(f'total {len(to_be_replaces)} images to be recognized')
         recognized_result = await extract_info_from_img(list(to_be_replaces.keys()), vl_model)
         wiseflow_logger.debug(f'total {len(recognized_result)} imgs be recognized')
         recognized_img_cache.update({key: value for key, value in recognized_result.items() if value.strip()})
         for img_url, content in recognized_result.items():
             for u in to_be_replaces[img_url]:
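
For orientation, the surrounding code builds `to_be_replaces`, a map from each image URL to the fields whose text mentions it, so one vision-model pass over the unique URLs can patch every occurrence; the added debug line reports how many images go into recognition, alongside the existing line that reports how many come back. A toy, self-contained sketch of that bookkeeping with a stubbed recognizer (all names and data below are invented for illustration):

```python
import asyncio

async def recognize_stub(img_urls):
    # Stand-in for extract_info_from_img(...): pretend every image yields some alt text.
    return {u: f"[recognized text for {u}]" for u in img_urls}

async def demo():
    fields = {"content": "intro <img_a> body <img_b>", "title": "<img_a> headline"}
    to_be_replaces: dict[str, list[str]] = {}
    for field, text in fields.items():
        for img_url in ("<img_a>", "<img_b>"):
            if img_url in text:
                to_be_replaces.setdefault(img_url, []).append(field)

    recognized = await recognize_stub(list(to_be_replaces.keys()))
    # Same shape as the diff: for every image, patch each field that referenced it.
    for img_url, content in recognized.items():
        for field in to_be_replaces[img_url]:
            fields[field] = fields[field].replace(img_url, content)
    print(fields)

asyncio.run(demo())
```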