llm wrapper and prompt opz, base url bug, mp scraper opz

bigbrother666sh 2025-01-18 13:47:47 +08:00
parent dd7d92476e
commit 3e07d63757
9 changed files with 38 additions and 60 deletions

View File

@@ -265,9 +265,9 @@ async def get_more_related_urls(texts: list[str], link_dict: dict, prompts: list
             for link in links:
                 if link not in text_batch:
                     if _logger:
-                        _logger.warning(f"model generating hallucination:\n{result[-1]}")
+                        _logger.warning(f"model generating hallucination:\n{link}\n{result[-1]}\n{text_batch}")
                     if test_mode:
-                        print(f"model hallucination:\n{result[-1]}")
+                        print(f"model hallucination:\n{link}\n{result[-1]}\n{text_batch}")
                     continue
                 cache.add(link)
         text_batch = ''
@@ -343,5 +343,5 @@ async def get_info(texts: list[str], link_dict: dict, prompts: list[str], focus_
         url_tags = re.findall(r'\[\d+\]', content)
         refences = {url_tag: link_dict[url_tag] for url_tag in url_tags if url_tag in link_dict}
         final.append({'tag': focus_dict[focus], 'content': f"{info_pre_fix}{content}", 'references': refences})
     return final

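For context, the check being logged here drops any link the model returned that never appeared in the text it was shown. A minimal stand-alone sketch of that guard, with illustrative names rather than the repo's actual code:

```python
# Sketch of the hallucination guard from the hunk above (illustrative, not the repo's code):
# keep only links the model returned that actually occur in the prompt text it was given.
def filter_hallucinated_links(returned_links: list[str], text_batch: str, logger=None) -> set[str]:
    kept = set()
    for link in returned_links:
        if link not in text_batch:
            # the model invented a link/tag that was never in its input
            if logger:
                logger.warning(f"model generating hallucination:\n{link}\n{text_batch}")
            continue
        kept.add(link)
    return kept

# e.g. filter_hallucinated_links(['[12]', '[99]'], 'see [12] and [13]') -> {'[12]'}
```
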
View File

@@ -36,8 +36,8 @@ get_info_system = '''你将被给到一段使用<text></text>标签包裹的网
 {focus_statement}\n
 在提炼摘要时,请遵循以下原则:
 - 理解每个关注点的含义以及进一步的解释(如有),确保摘要与关注点强相关,并符合解释(如有)的范围
-- 摘要应当详实、充分,使用简体中文(如果原文是英文,请翻译成简体中文)
-- 摘要信息务必忠于原文'''
+- 摘要应当详实、充分且绝对忠于原文
+- 如果摘要涉及的原文片段中包含类似"[3]"这样的引用标记,务必在摘要中保留相关标记'''
 
 get_info_suffix = '''请对关注点逐一生成摘要,不要遗漏任何关注点,如果网页文本与关注点无关,可以对应输出"NA"。输出结果整体用三引号包裹,三引号内不要有其他内容。如下是输出格式示例:
 """
@@ -55,8 +55,8 @@ get_info_system_en = '''You will be given a webpage text wrapped in <text></text
 {focus_statement}\n
 When extracting summaries, please follow these principles:
 - Understand the meaning of each focus point and its explanation (if any), ensure the summary strongly relates to the focus point and aligns with the explanation (if any)
-- The summary should be detailed and comprehensive
-- The summary should be faithful to the original text'''
+- The summary should be detailed and comprehensive and absolutely faithful to the original text
+- If the summary involves a reference marker like "[3]", it must be retained in the summary'''
 
 get_info_suffix_en = '''Please generate summaries for each focus point, don't miss any focus points. If the webpage text is not related to a focus point, output "NA" for that point. The entire output should be wrapped in triple quotes with no other content inside. Here is an example of the output format:
 """

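The extra rule about "[3]"-style markers matters because get_info later maps those tags back to URLs; if the model drops a marker, its reference is silently lost. A small sketch of that downstream lookup, using the same regex as the hunk above (the content and URLs are made up):

```python
import re

# Content as a summarizing model might return it, with the "[n]" markers preserved.
content = "公司发布了新产品[3],详情见官网[7]。"
# link_dict maps markers back to URLs found during pre-processing (hypothetical values).
link_dict = {"[3]": "https://example.com/news/3", "[7]": "https://example.com/about"}

url_tags = re.findall(r'\[\d+\]', content)                                 # ['[3]', '[7]']
references = {tag: link_dict[tag] for tag in url_tags if tag in link_dict}
print(references)  # any marker the model dropped would simply disappear from here
```
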
View File

@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+import os
 from utils.pb_api import PbTalker
 from utils.general_utils import get_logger, extract_and_convert_dates, is_chinese
 from agents.get_info import *
@@ -87,9 +88,10 @@ async def main_process(_sites: set | list):
     while working_list:
         url = working_list.pop()
         existing_urls.add(url)
+        wiseflow_logger.debug(f'process new url, still {len(working_list)} urls in working list')
         has_common_ext = any(url.lower().endswith(ext) for ext in common_file_exts)
         if has_common_ext:
-            wiseflow_logger.info(f'{url} is a common file, skip')
+            wiseflow_logger.debug(f'{url} is a common file, skip')
             continue
         parsed_url = urlparse(url)
@@ -125,7 +127,6 @@ async def main_process(_sites: set | list):
         base_url = ''
         author = ''
         publish_date = ''
         if not raw_markdown:
             wiseflow_logger.warning(f'{url} no content, something during fetching failed, skip')
             continue
@@ -136,17 +137,14 @@ async def main_process(_sites: set | list):
             base_url = metadata_dict.get('base', '')
             if not base_url:
                 base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
-                if not base_url.endswith('/'):
-                    # 如果路径不以 / 结尾,则去掉最后一个路径段
-                    base_url = base_url.rsplit('/', 1)[0] + '/'
             if not author:
                 author = metadata_dict.get('author', '')
             if not publish_date:
                 publish_date = metadata_dict.get('publish_date', '')
 
         link_dict, links_parts, contents, recognized_img_cache = await pre_process(raw_markdown, base_url, used_img, recognized_img_cache, existing_urls)
 
         if link_dict and links_parts:
             prompts = [get_link_sys_prompt, get_link_suffix_prompt, secondary_model]
             links_texts = []
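The deleted branch rewrote base_url to its parent directory whenever the path did not end in '/'. How pre_process actually resolves relative links is not shown in this diff, but standard urljoin semantics illustrate where the two base forms diverge (the URL below is made up):

```python
from urllib.parse import urljoin

page = "https://example.com/s/abc123"            # hypothetical page URL
stripped = page.rsplit('/', 1)[0] + '/'          # what the removed code turned it into

print(urljoin(page, "detail.html"))       # https://example.com/s/detail.html
print(urljoin(stripped, "detail.html"))   # https://example.com/s/detail.html (same)
print(urljoin(page, "?p=2"))              # https://example.com/s/abc123?p=2
print(urljoin(stripped, "?p=2"))          # https://example.com/s/?p=2  <- the page segment is lost
```
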
@@ -154,6 +152,7 @@ async def main_process(_sites: set | list):
                 links_texts.extend(_parts.split('\n\n'))
             more_url = await get_more_related_urls(links_texts, link_dict, prompts, _logger=wiseflow_logger)
             if more_url:
+                wiseflow_logger.debug(f'get {len(more_url)} more related urls, will add to working list')
                 working_list.update(more_url - existing_urls)
 
         if not contents:
@@ -173,10 +172,12 @@ async def main_process(_sites: set | list):
         prompts = [get_info_sys_prompt, get_info_suffix_prompt, model]
         infos = await get_info(contents, link_dict, prompts, focus_dict, author, publish_date, _logger=wiseflow_logger)
         if infos:
+            wiseflow_logger.debug(f'get {len(infos)} infos, will save to pb')
             await save_to_pb(url, title, infos)
     await crawler.close()
 
 if __name__ == '__main__':
     sites = pb.read('sites', filter='activated=True')
     wiseflow_logger.info('execute all sites one time')
-    asyncio.run(main_process([site['url'].rstrip('/') for site in sites]))
+    asyncio.run(main_process([site['url'] for site in sites]))

View File

@@ -32,18 +32,9 @@ async def openai_llm(messages: list, model: str, logger=None, **kwargs) -> str:
         resp = response.choices[0].message.content
     except Exception as e:
         if logger:
-            logger.warning(f'{e}\nRetrying in 60 second...')
+            logger.warning(e)
         else:
-            print(f'{e}\nRetrying in 60 second...')
-        await asyncio.sleep(60)
-        response = await client.chat.completions.create(messages=messages, model=model, **kwargs)
-        if response.status_code == 200 and response.choices:
-            resp = response.choices[0].message.content
-        else:
-            if logger:
-                logger.error(f'after many try, llm error: {response}')
-            else:
-                print(f'after many try, llm error: {response}')
+            print(e)
     finally:
         semaphore.release()

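With the sleep-and-resend block gone, the wrapper now just logs the exception and releases the semaphore, so any retry policy has to live with the caller. A rough caller-side sketch, assuming the wrapper returns an empty string when the request failed (that detail is not visible in this hunk):

```python
import asyncio
from typing import Awaitable, Callable

# Hypothetical caller-side retry around an llm coroutine such as openai_llm above.
async def call_with_retry(llm_call: Callable[..., Awaitable[str]], messages: list, model: str,
                          retries: int = 3, delay: float = 60.0, **kwargs) -> str:
    for attempt in range(1, retries + 1):
        resp = await llm_call(messages, model, **kwargs)
        if resp:               # assumed: a falsy result means the request failed
            return resp
        if attempt < retries:
            await asyncio.sleep(delay)  # back off before trying again
    return ''
```
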
View File

@@ -183,14 +183,16 @@ def mp_scraper(fetch_result: CrawlResult | dict) -> ScraperResultData:
                 if text:
                     content_parts.append(text)
                     # 只在块级元素后添加换行符
-                    if element.name in {'div', 'section', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}:
+                    if element.name in {'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}:
                         content_parts.append('\n')
+                    if element.name in {'div', 'section'}:
+                        content_parts.append('# ')
             elif isinstance(element, str):
                 text = element.strip()
                 if text:
                     content_parts.append(text)
-        return ' '.join(content_parts).strip()
+        return ''.join(content_parts).strip()
 
     soup = BeautifulSoup(cleaned_html, 'html.parser')

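After this change, only real text blocks get a trailing newline, a div or section boundary contributes a '# ' marker instead, and the collected parts are concatenated without injected spaces. A simplified, single-level re-creation of those rules (not the actual mp_scraper helper, which walks the full element tree):

```python
from bs4 import BeautifulSoup, NavigableString, Tag

def extract_text(node) -> str:
    content_parts = []
    for element in node.children:
        if isinstance(element, Tag):
            text = element.get_text(strip=True)
            if text:
                content_parts.append(text)
                if element.name in {'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}:
                    content_parts.append('\n')        # newline only after true text blocks
                if element.name in {'div', 'section'}:
                    content_parts.append('# ')        # container boundary becomes a heading marker
        elif isinstance(element, NavigableString):
            text = element.strip()
            if text:
                content_parts.append(text)
    return ''.join(content_parts).strip()             # no spaces injected between parts

soup = BeautifulSoup('<h2>标题</h2><div>正文一段</div>tail text', 'html.parser')
print(extract_text(soup))   # -> '标题\n正文一段# tail text'
```
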
View File

@@ -15,7 +15,7 @@ async def schedule_pipeline(interval):
             continue
         if counter % site['per_hours'] == 0:
             wiseflow_logger.info(f"applying {site['url']}")
-            todo_urls.add(site['url'].rstrip('/'))
+            todo_urls.add(site['url'])
     counter += 1
     await main_process(todo_urls)

View File

@@ -10,18 +10,10 @@ python craw4ai_fetching.py -S 'url1,url2...'
 ## html 内容解析
-[deep_scraper_test.py](./deep_scraper_test.py)
+[pre_process_test.py](./pre_process_test.py)
 ```
-python deep_scraper_test.py -F 'json_file_path'
-```
-
-## 视觉大模型信息提取
-[get_visual_info_for_samples.py](./get_visual_info_for_samples.py)
-```
-python get_visual_info_for_samples.py -F 'json_file_path'
+python pre_process_test.py -F 'json_file_path' -R 'record save path'
 ```
 
 ## 大模型信息提取测试
@@ -30,11 +22,11 @@ python get_visual_info_for_samples.py -F 'json_file_path'
 - 为测试任务创建 关注点说明,可以参考 [reports/wiseflow_report_v036_bigbrother666/task0/focus_point.json](./reports/wiseflow_report_v036_bigbrother666/task0/focus_point.json),
-- 要更改 get_info 的 prompt,请编辑 [prompts.py](./prompts.py)
 
 ```
-python get_info_test.py -D 'sample dir'
+python get_info_test.py -D 'sample dir' -I 'include ap'
 ```
 
+*-I 是否需要同时测试 llm提取作者和发布时间*
+
 # 结果提交与共享

View File

@@ -10,18 +10,10 @@ python craw4ai_fetching.py -S 'url1,url2...'
 ## HTML Content Parsing
-[deep_scraper_test.py](./deep_scraper_test.py)
+[pre_process_test.py](./pre_process_test.py)
 ```
-python deep_scraper_test.py -F 'json_file_path'
-```
-
-## Visual Large Model Information Extraction
-[get_visual_info_for_samples.py](./get_visual_info_for_samples.py)
-```
-python get_visual_info_for_samples.py -F 'json_file_path'
+python pre_process_test.py -F 'json_file_path' -R 'record save path'
 ```
 
 ## Large Model Information Extraction Testing
@@ -30,11 +22,11 @@ python get_visual_info_for_samples.py -F 'json_file_path'
 - To create focus point descriptions for test tasks, refer to [reports/wiseflow_report_v036_bigbrother666/task0/focus_point.json](./reports/wiseflow_report_v036_bigbrother666/task0/focus_point.json)
-- To modify the prompt for get_info, edit [prompts.py](./prompts.py)
 
 ```
-python get_info_test.py -D 'sample dir'
+python get_info_test.py -D 'sample dir' -I 'include ap'
 ```
 
+*-I whether to test LLM extraction of author and publish date*
+
 # Result Submission and Sharing

View File

@@ -89,8 +89,8 @@ async def main(html_sample, record_file):
     base_url = html_sample.get('base', '')
     if not base_url:
         base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
-        if not base_url.endswith('/'):
-            base_url = base_url.rsplit('/', 1)[0] + '/'
+    print('base_url:', base_url)
     if not author:
         author = html_sample.get('author', '')