mirror of https://github.com/TeamWiseFlow/wiseflow.git (synced 2025-01-23 02:20:20 +08:00)

commit 1f79cb3c4d (parent 35fbff0f27): v0.3.6fix
.gitignore (vendored): 1 line added

@@ -10,3 +10,4 @@ pb/pb_data/
 pb/pocketbase
 core/docker_dir/
 core/work_dir/
+test/webpage_samples/
@@ -87,7 +87,7 @@ class GeneralInfoExtractor:
 {focus_statement}\n
 在提炼摘要时,请遵循以下原则:
 - 理解每个关注点的含义以及进一步的解释(如有),确保摘要与关注点强相关并符合解释(如有)的范围
-- 摘要应当详实、充分
+- 摘要应当详实、充分,使用简体中文(如果原文是英文,请翻译成简体中文)
 - 摘要信息务必忠于原文'''

         self.get_info_suffix = '''请对关注点逐一生成摘要,不要遗漏任何关注点,如果网页文本与关注点无关,可以对应输出"NA"。输出结果整体用三引号包裹,三引号内不要有其他内容。如下是输出格式示例:
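The only functional change in this hunk is one clause added to the extraction guidelines: summaries must now be detailed and written in Simplified Chinese, translating the source if it is English. The unchanged suffix below it tells the model to produce one summary per focus point, output "NA" for irrelevant pages, and wrap the whole result in triple quotes. A minimal sketch of how a prompt template of this shape could be assembled; `focus_statement` is the only name taken from the diff, and the English guideline text is a translation of the Chinese original, so treat everything else as hypothetical:

```python
# Hypothetical prompt assembly; focus_statement comes from the diff,
# the guideline text is an English rendering of the Chinese original.
focus_statement = "focus: AI funding news\nexplanation: rounds, amounts, investors"

get_info_pre = f'''You will summarize a web page against these focus points:
{focus_statement}

When writing each summary, follow these principles:
- stay within the scope of the focus point and its explanation (if any)
- be detailed and thorough, and write in Simplified Chinese (translate English sources)
- stay faithful to the original text'''

print(get_info_pre)
```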
@@ -176,7 +176,8 @@ When performing the association analysis, please follow these principles:
                                              model=self.model, temperature=0.1)
         # self.logger.debug(f"llm output: {result}")
         result = re.findall(r'\"\"\"(.*?)\"\"\"', result, re.DOTALL)
-        cache.add(result[-1])
+        if result:
+            cache.add(result[-1])
         text_batch = ''

         if text_batch:
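This is the core bug fix of v0.3.6fix: `re.findall` returns an empty list when the model reply contains no triple-quoted block, so the unguarded `result[-1]` raised `IndexError`. A standalone sketch of the failure mode and the guard, with a `cache` set standing in for the method's local variable:

```python
import re

cache = set()

for llm_output in ('no quoted block here', '"""focus//summary//"""'):
    result = re.findall(r'\"\"\"(.*?)\"\"\"', llm_output, re.DOTALL)
    # result == [] for the first reply, so result[-1] would raise IndexError
    if result:
        cache.add(result[-1])

print(cache)  # {'focus//summary//'}
```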
@@ -186,7 +187,8 @@ When performing the association analysis, please follow these principles:
                                              model=self.model, temperature=0.1)
         # self.logger.debug(f"llm output: {result}")
         result = re.findall(r'\"\"\"(.*?)\"\"\"', result, re.DOTALL)
-        cache.add(result[-1])
+        if result:
+            cache.add(result[-1])
         return cache

     async def get_more_related_urls(self, link_dict: dict) -> set:
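The identical guard now appears at both call sites. A natural follow-up, not part of this commit and purely a hypothetical refactor, is to pull the extraction into one helper so the empty-list case is handled in a single place:

```python
import re

def last_quoted_block(raw: str) -> str | None:
    """Return the last \"\"\"...\"\"\" block in an LLM reply, or None if absent."""
    blocks = re.findall(r'\"\"\"(.*?)\"\"\"', raw, re.DOTALL)
    return blocks[-1] if blocks else None

block = last_quoted_block('"""a//b//"""')
if block is not None:
    print(block)  # a//b//
```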
@@ -215,7 +217,7 @@ When performing the association analysis, please follow these principles:
             if focus not in self.focus_dict or _index not in link_map:
                 self.logger.debug(f"bad generate result: {item}")
                 continue
-            self.logger.debug(f"{link_map[_index]} selected")
+            # self.logger.debug(f"{link_map[_index]} selected")
             final_result.add(link_map[_index])
         return final_result

@@ -223,6 +225,7 @@ When performing the association analysis, please follow these principles:
         raw_result = await self._generate_results(text.split('\n'), 'get_info')
         final = []
         for item in raw_result:
+            self.logger.debug(f"llm output:\n{item}")
             segs = item.split('//')
             i = 0
             while i < len(segs) - 1:
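The commit only adds a debug log of each raw item before parsing; the surrounding context lines show the parse itself, which walks the reply as `//`-separated segments and pairs them up two at a time. A sketch of that pairing, under the assumption (suggested by the prompt's format example) that replies look like `focus//summary//focus//summary//`:

```python
item = "focus_a//summary for a//focus_b//NA//"
segs = item.split('//')

pairs = []
i = 0
while i < len(segs) - 1:
    # consume segments in (focus, content) pairs
    focus, content = segs[i].strip(), segs[i + 1].strip()
    if focus and content:
        pairs.append((focus, content))
    i += 2

print(pairs)  # [('focus_a', 'summary for a'), ('focus_b', 'NA')]
```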
@@ -241,7 +244,6 @@ When performing the association analysis, please follow these principles:
                 """
                 maybe can use embedding retrieval to judge
                 """
-                self.logger.debug(f"get info: {focus}: {content}")

                 url_tags = re.findall(r'\[(Ref_\d+)]', content)
                 refences = {url_tag: text_links[url_tag] for url_tag in url_tags if url_tag in text_links}
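The `maybe can use embedding retrieval to judge` comment left in context hints at a relevance check the author is considering for extracted summaries. A self-contained sketch of that idea, cosine similarity between a focus-point embedding and a candidate-summary embedding; the vectors and threshold are made up, since the diff names no embedding model:

```python
import math

def cosine(a: list[float], b: list[float]) -> float:
    dot = sum(x * y for x, y in zip(a, b))
    return dot / (math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(x * x for x in b)))

# Hypothetical embeddings; in practice these would come from an embedding model.
focus_vec = [0.9, 0.1, 0.3]
content_vec = [0.8, 0.2, 0.4]

# Keep the summary only if it is close enough to the focus point.
if cosine(focus_vec, content_vec) >= 0.75:
    print("summary judged relevant to focus")
```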
@@ -9,7 +9,9 @@ from custom_fetchings import *
 from urllib.parse import urlparse
 from crawl4ai import AsyncWebCrawler, CacheMode
 from datetime import datetime, timedelta
+import logging

+logging.getLogger("httpx").setLevel(logging.WARNING)

 project_dir = os.environ.get("PROJECT_DIR", "")
 if project_dir:
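`httpx`, likely the HTTP client behind the LLM SDK used here, logs one INFO line per request; raising its logger to WARNING silences that chatter without touching the rest of the logging tree. A quick standalone demonstration of the stdlib mechanism:

```python
import logging

logging.basicConfig(level=logging.INFO, format="%(name)s %(levelname)s %(message)s")

noisy = logging.getLogger("httpx")
noisy.info("HTTP Request: POST ... 200 OK")   # printed: INFO still enabled

logging.getLogger("httpx").setLevel(logging.WARNING)
noisy.info("HTTP Request: POST ... 200 OK")   # dropped: below WARNING now
noisy.warning("retrying request")             # still printed
```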
@@ -64,8 +66,8 @@ async def main_process(_sites: set | list):
             raw_markdown, metadata_dict, media_dict = custom_scrapers[domain](url)
         else:
             crawl4ai_cache_mode = CacheMode.WRITE_ONLY if url in _sites else CacheMode.ENABLED
-            result = await crawler.arun(url=url, delay_before_return_html=2.0, exclude_social_media_links=True,
-                                        magic=True, scan_full_page=True, remove_overlay_elements=True,
+            result = await crawler.arun(url=url, delay_before_return_html=2.0, wait_until='commit',
+                                        magic=True, scan_full_page=True,
                                         cache_mode=crawl4ai_cache_mode)
         if not result.success:
             wiseflow_logger.warning(f'{url} failed to crawl, destination web cannot reach, skip')
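The fetch call drops `exclude_social_media_links=True` and `remove_overlay_elements=True` in favor of `wait_until='commit'`, which, as in Playwright (the browser driver under crawl4ai), returns once navigation is committed instead of waiting for the full load event, so slow pages are less likely to time out. A minimal standalone sketch of the new call; every keyword is taken from the diff, only the URL is a placeholder:

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode

async def fetch(url: str):
    async with AsyncWebCrawler() as crawler:
        # Mirrors the diff: return on navigation commit, then still give the
        # page 2 seconds to settle before grabbing the HTML.
        result = await crawler.arun(url=url, delay_before_return_html=2.0, wait_until='commit',
                                    magic=True, scan_full_page=True,
                                    cache_mode=CacheMode.ENABLED)
        return result

result = asyncio.run(fetch("https://example.com"))  # placeholder URL
print(result.success)
```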
@@ -122,8 +124,9 @@ async def main_process(_sites: set | list):
                     to_be_replaces[img_url].append("content")
                 else:
                     to_be_replaces[img_url] = ["content"]
-        wiseflow_logger.debug(f'total {len(to_be_replaces)} images to be recognized')
         recognized_result = await extract_info_from_img(list(to_be_replaces.keys()), vl_model)
+        wiseflow_logger.debug(f'total {len(recognized_result)} imgs be recognized')
         recognized_img_cache.update({key: value for key, value in recognized_result.items() if value.strip()})
         for img_url, content in recognized_result.items():
             for u in to_be_replaces[img_url]:
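The debug line moves below the vision-model call and now reports how many images were actually recognized rather than how many were queued; the two counts differ whenever recognition fails for some URLs. A standalone sketch of the accumulate-then-replace pattern around it, with `extract_info_from_img` stubbed since only its shape (URL list in, `{url: caption}` out) is visible in the diff:

```python
import asyncio

# Hypothetical stub for the real vision-model call; shape inferred from the diff.
async def extract_info_from_img(urls: list[str], vl_model=None) -> dict[str, str]:
    return {u: f"caption for {u}" for u in urls if not u.endswith("broken.png")}

async def main():
    # Collect, per image URL, every field where it must be replaced.
    to_be_replaces: dict[str, list[str]] = {}
    for img_url, field in [("a.png", "content"), ("a.png", "title"), ("broken.png", "content")]:
        to_be_replaces.setdefault(img_url, []).append(field)

    recognized_result = await extract_info_from_img(list(to_be_replaces.keys()))
    # Recognized count (1 here) can be lower than queued count (2 here).
    print(f"total {len(recognized_result)} imgs be recognized")

    for img_url, content in recognized_result.items():
        for field in to_be_replaces[img_url]:
            print(f"replace {img_url} in {field} with: {content}")

asyncio.run(main())
```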