diff --git a/core/agents/get_info.py b/core/agents/get_info.py
index 9974749..1f5e033 100644
--- a/core/agents/get_info.py
+++ b/core/agents/get_info.py
@@ -265,9 +265,9 @@ async def get_more_related_urls(texts: list[str], link_dict: dict, prompts: list
for link in links:
if link not in text_batch:
if _logger:
- _logger.warning(f"model generating hallucination:\n{result[-1]}")
+                    _logger.warning(f"model hallucination:\n{link}\n{result[-1]}\n{text_batch}")
if test_mode:
- print(f"model hallucination:\n{result[-1]}")
+ print(f"model hallucination:\n{link}\n{result[-1]}\n{text_batch}")
continue
cache.add(link)
text_batch = ''
@@ -343,5 +343,5 @@ async def get_info(texts: list[str], link_dict: dict, prompts: list[str], focus_
url_tags = re.findall(r'\[\d+\]', content)
        references = {url_tag: link_dict[url_tag] for url_tag in url_tags if url_tag in link_dict}
        final.append({'tag': focus_dict[focus], 'content': f"{info_pre_fix}{content}", 'references': references})
-
+
return final
diff --git a/core/agents/get_info_prompts.py b/core/agents/get_info_prompts.py
index f4c917b..9e8f433 100644
--- a/core/agents/get_info_prompts.py
+++ b/core/agents/get_info_prompts.py
@@ -36,8 +36,8 @@ get_info_system = '''You will be given a webpage text wrapped in tags
{focus_statement}\n
 When producing the summaries, follow these principles:
 - Understand the meaning of each focus point and any further explanation of it (if provided); make sure the summary is strongly relevant to the focus point and stays within the scope of that explanation
-- The summary should be detailed and thorough, written in Simplified Chinese (if the original text is in English, translate it into Simplified Chinese)
-- The summary must be faithful to the original text'''
+- The summary should be detailed, thorough, and absolutely faithful to the original text
+- If the source passages covered by the summary contain citation markers like "[3]", be sure to keep those markers in the summary'''
get_info_suffix = '''Generate a summary for each focus point one by one; do not miss any focus point. If the webpage text is irrelevant to a focus point, output "NA" for that point. Wrap the entire output in triple quotes, with nothing else inside them. Example of the output format:
"""
@@ -55,8 +55,8 @@ get_info_system_en = '''You will be given a webpage text wrapped in tags
diff --git a/core/llms/openai_wrapper.py b/core/llms/openai_wrapper.py
--- a/core/llms/openai_wrapper.py
+++ b/core/llms/openai_wrapper.py
@@ ... @@ async def openai_llm(messages: list, model: str, logger=None, **kwargs) -> str:
resp = response.choices[0].message.content
except Exception as e:
if logger:
- logger.warning(f'{e}\nRetrying in 60 second...')
+ logger.warning(e)
else:
- print(f'{e}\nRetrying in 60 second...')
- await asyncio.sleep(60)
- response = await client.chat.completions.create(messages=messages, model=model, **kwargs)
- if response.status_code == 200 and response.choices:
- resp = response.choices[0].message.content
- else:
- if logger:
- logger.error(f'after many try, llm error: {response}')
- else:
- print(f'after many try, llm error: {response}')
+ print(e)
finally:
semaphore.release()
diff --git a/core/scrapers/mp_scraper.py b/core/scrapers/mp_scraper.py
index 0dfff67..649e69d 100644
--- a/core/scrapers/mp_scraper.py
+++ b/core/scrapers/mp_scraper.py
@@ -183,14 +183,16 @@ def mp_scraper(fetch_result: CrawlResult | dict) -> ScraperResultData:
if text:
content_parts.append(text)
                        # add a newline only after block-level elements
- if element.name in {'div', 'section', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}:
+ if element.name in {'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}:
content_parts.append('\n')
+ if element.name in {'div', 'section'}:
+                        content_parts.append('# ')  # mark div/section boundaries with a markdown heading marker
elif isinstance(element, str):
text = element.strip()
if text:
content_parts.append(text)
- return ' '.join(content_parts).strip()
+ return ''.join(content_parts).strip()
soup = BeautifulSoup(cleaned_html, 'html.parser')
diff --git a/core/tasks.py b/core/tasks.py
index f1338c7..573c882 100644
--- a/core/tasks.py
+++ b/core/tasks.py
@@ -15,7 +15,7 @@ async def schedule_pipeline(interval):
continue
if counter % site['per_hours'] == 0:
wiseflow_logger.info(f"applying {site['url']}")
- todo_urls.add(site['url'].rstrip('/'))
+ todo_urls.add(site['url'])
counter += 1
await main_process(todo_urls)
diff --git a/test/README.md b/test/README.md
index a6dcd26..975bbff 100644
--- a/test/README.md
+++ b/test/README.md
@@ -10,18 +10,10 @@ python craw4ai_fetching.py -S 'url1,url2...'
## HTML Content Parsing
-[deep_scraper_test.py](./deep_scraper_test.py)
+[pre_process_test.py](./pre_process_test.py)
```
-python deep_scraper_test.py -F 'json_file_path'
-```
-
-## Visual Large Model Information Extraction
-
-[get_visual_info_for_samples.py](./get_visual_info_for_samples.py)
-
-```
-python get_visual_info_for_samples.py -F 'json_file_path'
+python pre_process_test.py -F 'json_file_path' -R 'record save path'
```
## Large Model Information Extraction Testing
@@ -30,11 +22,11 @@ python get_visual_info_for_samples.py -F 'json_file_path'
- To create focus point descriptions for the test task, refer to [reports/wiseflow_report_v036_bigbrother666/task0/focus_point.json](./reports/wiseflow_report_v036_bigbrother666/task0/focus_point.json),
- - To modify the prompt for get_info, edit [prompts.py](./prompts.py)
+```
+python get_info_test.py -D 'sample dir' -I 'include ap'
+```
-```
-python get_info_test.py -D 'sample dir'
-```
+*-I: whether to also test LLM extraction of author and publish date*
# Result Submission and Sharing
diff --git a/test/README_EN.md b/test/README_EN.md
index c6e9675..d8dcabf 100644
--- a/test/README_EN.md
+++ b/test/README_EN.md
@@ -10,18 +10,10 @@ python craw4ai_fetching.py -S 'url1,url2...'
## HTML Content Parsing
-[deep_scraper_test.py](./deep_scraper_test.py)
+[pre_process_test.py](./pre_process_test.py)
```
-python deep_scraper_test.py -F 'json_file_path'
-```
-
-## Visual Large Model Information Extraction
-
-[get_visual_info_for_samples.py](./get_visual_info_for_samples.py)
-
-```
-python get_visual_info_for_samples.py -F 'json_file_path'
+python pre_process_test.py -F 'json_file_path' -R 'record save path'
```
## Large Model Information Extraction Testing
@@ -30,11 +22,11 @@ python get_visual_info_for_samples.py -F 'json_file_path'
- To create focus point descriptions for test tasks, refer to [reports/wiseflow_report_v036_bigbrother666/task0/focus_point.json](./reports/wiseflow_report_v036_bigbrother666/task0/focus_point.json)
- - To modify the prompt for get_info, edit [prompts.py](./prompts.py)
+```
+python get_info_test.py -D 'sample dir' -I 'include ap'
+```
-```
-python get_info_test.py -D 'sample dir'
-```
+*-I: whether to also test LLM extraction of author and publish date*
# Result Submission and Sharing
diff --git a/test/pre_process_test.py b/test/pre_process_test.py
index cb00473..bf969f8 100644
--- a/test/pre_process_test.py
+++ b/test/pre_process_test.py
@@ -89,8 +89,8 @@ async def main(html_sample, record_file):
base_url = html_sample.get('base', '')
if not base_url:
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
- if not base_url.endswith('/'):
- base_url = base_url.rsplit('/', 1)[0] + '/'
+
+ print('base_url:', base_url)
if not author:
author = html_sample.get('author', '')