Mirror of https://github.com/TeamWiseFlow/wiseflow.git (synced 2025-01-23 02:20:20 +08:00)

Commit message: v0.3.6fix
parent 35fbff0f27
commit 1f79cb3c4d
.gitignore (vendored): 3 changes

@@ -9,4 +9,5 @@ __pycache__
 pb/pb_data/
 pb/pocketbase
 core/docker_dir/
-core/work_dir/
\ No newline at end of file
+core/work_dir/
+test/webpage_samples/
@@ -87,7 +87,7 @@ class GeneralInfoExtractor:
 {focus_statement}\n
 在提炼摘要时,请遵循以下原则:
 - 理解每个关注点的含义以及进一步的解释(如有),确保摘要与关注点强相关并符合解释(如有)的范围
-- 摘要应当详实、充分
+- 摘要应当详实、充分,使用简体中文(如果原文是英文,请翻译成简体中文)
 - 摘要信息务必忠于原文'''

         self.get_info_suffix = '''请对关注点逐一生成摘要,不要遗漏任何关注点,如果网页文本与关注点无关,可以对应输出"NA"。输出结果整体用三引号包裹,三引号内不要有其他内容。如下是输出格式示例:
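The changed bullet adds an output-language rule: summaries must stay detailed and thorough, and be written in Simplified Chinese, translating when the source text is English. The get_info_suffix above fixes the reply envelope: one summary per focus point, "NA" where the page text is unrelated, and the whole answer wrapped in triple quotes with nothing outside them. Below is a minimal sketch of a conforming reply and the envelope extraction; the reply text is hypothetical, and the '//'-delimited segment layout is an assumption based on the splitting logic further down.

import re

# Hypothetical reply following the get_info_suffix convention: the whole
# answer wrapped in triple quotes, one segment per focus point, "NA" for
# focus points the page text does not touch.
sample_reply = '"""\nfocus_a//相关摘要内容//\nfocus_b//NA//\n"""'

# Same envelope-extraction pattern the extractor applies downstream.
wrapped = re.findall(r'\"\"\"(.*?)\"\"\"', sample_reply, re.DOTALL)
if wrapped:
    print(wrapped[-1].strip())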
@@ -176,7 +176,8 @@ When performing the association analysis, please follow these principles:
                                        model=self.model, temperature=0.1)
             # self.logger.debug(f"llm output: {result}")
             result = re.findall(r'\"\"\"(.*?)\"\"\"', result, re.DOTALL)
-            cache.add(result[-1])
+            if result:
+                cache.add(result[-1])
             text_batch = ''

         if text_batch:
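This hunk and the next apply the same fix: re.findall returns an empty list when the model reply is not wrapped in triple quotes, so the unguarded result[-1] raised an IndexError and aborted the whole batch. A standalone illustration of the failure mode and the guard:

import re

cache = set()
for reply in ('model ignored the format', '"""wrapped payload"""'):
    result = re.findall(r'\"\"\"(.*?)\"\"\"', reply, re.DOTALL)
    # Unguarded, result[-1] would raise IndexError on the first reply;
    # the guard simply skips replies with no triple-quoted block.
    if result:
        cache.add(result[-1])

print(cache)  # {'wrapped payload'}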
@@ -186,7 +187,8 @@ When performing the association analysis, please follow these principles:
                                        model=self.model, temperature=0.1)
             # self.logger.debug(f"llm output: {result}")
             result = re.findall(r'\"\"\"(.*?)\"\"\"', result, re.DOTALL)
-            cache.add(result[-1])
+            if result:
+                cache.add(result[-1])
         return cache

     async def get_more_related_urls(self, link_dict: dict) -> set:
@@ -215,7 +217,7 @@ When performing the association analysis, please follow these principles:
             if focus not in self.focus_dict or _index not in link_map:
                 self.logger.debug(f"bad generate result: {item}")
                 continue
-            self.logger.debug(f"{link_map[_index]} selected")
+            # self.logger.debug(f"{link_map[_index]} selected")
             final_result.add(link_map[_index])
         return final_result

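The per-link "selected" debug is commented out, but the defensive filter stays: any (focus, index) pair the model emits that does not resolve against the known focus points or the link table is logged and skipped instead of dereferenced. A self-contained sketch of that filter; the focus_dict and link_map contents here are hypothetical:

# Hypothetical extractor state.
focus_dict = {'AI funding': 'explanation'}
link_map = {'1': 'https://example.com/a', '2': 'https://example.com/b'}

final_result = set()
for focus, _index in [('AI funding', '1'), ('made-up focus', '2'), ('AI funding', '9')]:
    # Anything that does not resolve is a bad generation: log and skip.
    if focus not in focus_dict or _index not in link_map:
        print(f"bad generate result: {focus} / {_index}")
        continue
    final_result.add(link_map[_index])

print(final_result)  # {'https://example.com/a'}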
@@ -223,6 +225,7 @@ When performing the association analysis, please follow these principles:
         raw_result = await self._generate_results(text.split('\n'), 'get_info')
         final = []
         for item in raw_result:
+            self.logger.debug(f"llm output:\n{item}")
             segs = item.split('//')
             i = 0
             while i < len(segs) - 1:
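The loop that follows walks the '//'-separated segments pairwise. The exact pairing is defined by the prompt templates rather than shown in this hunk, so the sketch below assumes alternating focus/content segments, with "NA" content skipped:

# Hypothetical '//'-delimited reply; the real layout comes from the prompts.
item = 'focus_a//some extracted summary//focus_b//NA//'
segs = item.split('//')

i = 0
while i < len(segs) - 1:
    focus, content = segs[i].strip(), segs[i + 1].strip()
    i += 2
    if not focus or not content or content == 'NA':
        continue
    print(f"{focus}: {content}")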
@@ -241,7 +244,6 @@ When performing the association analysis, please follow these principles:
             """
             maybe can use embedding retrieval to judge
             """
-            self.logger.debug(f"get info: {focus}: {content}")

             url_tags = re.findall(r'\[(Ref_\d+)]', content)
             refences = {url_tag: text_links[url_tag] for url_tag in url_tags if url_tag in text_links}
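The [Ref_N] tags let a summary cite the links it drew on; the dict comprehension resolves tags back to URLs through text_links and silently drops tags with no entry rather than raising a KeyError. A self-contained sketch with hypothetical content and link table (the identifier refences is spelled as in the source):

import re

# Hypothetical summary content and tag-to-URL table.
content = 'Funding round closed [Ref_1]; a second source [Ref_3] is unverified.'
text_links = {'Ref_1': 'https://example.com/news', 'Ref_2': 'https://example.com/other'}

url_tags = re.findall(r'\[(Ref_\d+)]', content)
# Ref_3 has no entry in text_links, so it is dropped, not raised.
refences = {url_tag: text_links[url_tag] for url_tag in url_tags if url_tag in text_links}
print(refences)  # {'Ref_1': 'https://example.com/news'}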
@@ -9,7 +9,9 @@ from custom_fetchings import *
 from urllib.parse import urlparse
 from crawl4ai import AsyncWebCrawler, CacheMode
 from datetime import datetime, timedelta
+import logging

+logging.getLogger("httpx").setLevel(logging.WARNING)

 project_dir = os.environ.get("PROJECT_DIR", "")
 if project_dir:
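httpx logs a line per request at INFO level, which swamps the crawler's own output once crawl4ai starts fetching; raising just that one logger's threshold silences it without touching any other logger. A minimal demonstration:

import logging

logging.basicConfig(level=logging.INFO)
logging.getLogger("httpx").setLevel(logging.WARNING)

logging.getLogger("httpx").info("HTTP Request: GET https://example.com")  # suppressed
logging.getLogger("httpx").warning("request timed out, retrying")         # emitted
logging.getLogger("other").info("unaffected logger still at INFO")        # emitted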
@@ -64,8 +66,8 @@ async def main_process(_sites: set | list):
                 raw_markdown, metadata_dict, media_dict = custom_scrapers[domain](url)
             else:
                 crawl4ai_cache_mode = CacheMode.WRITE_ONLY if url in _sites else CacheMode.ENABLED
-                result = await crawler.arun(url=url, delay_before_return_html=2.0, exclude_social_media_links=True,
-                                            magic=True, scan_full_page=True, remove_overlay_elements=True,
+                result = await crawler.arun(url=url, delay_before_return_html=2.0, wait_until='commit',
+                                            magic=True, scan_full_page=True,
                                             cache_mode=crawl4ai_cache_mode)
             if not result.success:
                 wiseflow_logger.warning(f'{url} failed to crawl, destination web cannot reach, skip')
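The replaced call swaps two post-processing flags for wait_until='commit', returning as soon as navigation commits instead of waiting for a full page load, while the cache split stays: a configured site root (url in _sites) is always re-fetched and re-cached (WRITE_ONLY), and discovered inner pages may be served from cache (ENABLED). A condensed sketch of that call path, using only parameters visible in this diff; crawl4ai's broader API is not shown here:

import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode

async def fetch(url: str, _sites: set):
    # Site roots always refresh the cache; inner pages may be served from it.
    cache_mode = CacheMode.WRITE_ONLY if url in _sites else CacheMode.ENABLED
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url=url, delay_before_return_html=2.0,
                                    wait_until='commit', magic=True,
                                    scan_full_page=True, cache_mode=cache_mode)
    return result if result.success else None

# asyncio.run(fetch('https://example.com', {'https://example.com'}))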
@@ -122,8 +124,9 @@ async def main_process(_sites: set | list):
                         to_be_replaces[img_url].append("content")
                     else:
                         to_be_replaces[img_url] = ["content"]
-            wiseflow_logger.debug(f'total {len(to_be_replaces)} images to be recognized')
+
             recognized_result = await extract_info_from_img(list(to_be_replaces.keys()), vl_model)
+            wiseflow_logger.debug(f'total {len(recognized_result)} imgs be recognized')
             recognized_img_cache.update({key: value for key, value in recognized_result.items() if value.strip()})
             for img_url, content in recognized_result.items():
                 for u in to_be_replaces[img_url]:
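to_be_replaces is a reverse map from each image URL to every slot that references it, so the vision model runs once per unique image rather than once per occurrence, and the relocated debug line now counts what actually came back. The .strip() filter keeps only non-empty recognitions in the cache. A sketch of the flow with a stubbed recognizer; extract_info_from_img here is a stand-in for the project's own:

import asyncio

async def extract_info_from_img(urls, vl_model=None):
    # Stub for the real vision-model call: empty string means "nothing useful".
    return {u: '' if u.endswith('logo.png') else f'description of {u}' for u in urls}

async def main():
    # Reverse map: image URL -> every slot where it appears.
    to_be_replaces = {}
    for slot, img_url in [('content', 'https://example.com/a.png'),
                          ('content', 'https://example.com/logo.png'),
                          ('title', 'https://example.com/a.png')]:
        to_be_replaces.setdefault(img_url, []).append(slot)

    recognized_result = await extract_info_from_img(list(to_be_replaces.keys()))
    print(f'total {len(recognized_result)} imgs be recognized')
    # Cache only non-empty recognitions, as the patched line does.
    recognized_img_cache = {k: v for k, v in recognized_result.items() if v.strip()}
    print(recognized_img_cache)

asyncio.run(main())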