0.3.7 release

This commit is contained in:
bigbrother666sh 2025-01-17 23:28:22 +08:00
parent e2f3903bb8
commit dd7d92476e
18 changed files with 764 additions and 839 deletions

View File

@ -115,6 +115,7 @@ siliconflow硅基流动提供大部分主流开源模型的在线 MaaS 服
export LLM_API_KEY=Your_API_KEY
export LLM_API_BASE="https://api.siliconflow.cn/v1"
export PRIMARY_MODEL="Qwen/Qwen2.5-32B-Instruct"
export SECONDARY_MODEL="Qwen/Qwen2.5-7B-Instruct"
export VL_MODEL="OpenGVLab/InternVL2-26B"
```
@ -129,6 +130,7 @@ export VL_MODEL="OpenGVLab/InternVL2-26B"
export LLM_API_KEY=Your_API_KEY
export LLM_API_BASE="https://aihubmix.com/v1" # 具体参考 https://doc.aihubmix.com/
export PRIMARY_MODEL="gpt-4o"
export SECONDARY_MODEL="gpt-4o-mini"
export VL_MODEL="gpt-4o"
```

View File

@ -114,6 +114,7 @@ Siliconflow provides online MaaS services for most mainstream open-source models
export LLM_API_KEY=Your_API_KEY
export LLM_API_BASE="https://api.siliconflow.cn/v1"
export PRIMARY_MODEL="Qwen/Qwen2.5-32B-Instruct"
export SECONDARY_MODEL="Qwen/Qwen2.5-7B-Instruct"
export VL_MODEL="OpenGVLab/InternVL2-26B"
```
@ -129,6 +130,7 @@ When using AiHubMix models, the .env configuration can refer to the following:
export LLM_API_KEY=Your_API_KEY
export LLM_API_BASE="https://aihubmix.com/v1" # refer to https://doc.aihubmix.com/
export PRIMARY_MODEL="gpt-4o"
export SECONDARY_MODEL="gpt-4o-mini"
export VL_MODEL="gpt-4o"
```

View File

@ -114,6 +114,7 @@ Siliconflowは、主流のオープンソースモデルのほとんどにオン
export LLM_API_KEY=Your_API_KEY
export LLM_API_BASE="https://api.siliconflow.cn/v1"
export PRIMARY_MODEL="Qwen/Qwen2.5-32B-Instruct"
export SECONDARY_MODEL="Qwen/Qwen2.5-7B-Instruct"
export VL_MODEL="OpenGVLab/InternVL2-26B"
```
@ -129,6 +130,7 @@ AiHubMixモデルを使用する場合、.envの設定は以下を参考にし
export LLM_API_KEY=Your_API_KEY
export LLM_API_BASE="https://aihubmix.com/v1" # refer to https://doc.aihubmix.com/
export PRIMARY_MODEL="gpt-4o"
export SECONDARY_MODEL="gpt-4o-mini"
export VL_MODEL="gpt-4o"
```
😄 [AiHubMixの紹介リンク](https://aihubmix.com?aff=Gp54)からご登録いただけますと幸いです 🌹

View File

@ -114,6 +114,7 @@ Siliconflow는 대부분의 주류 오픈소스 모델에 대한 온라인 MaaS
export LLM_API_KEY=Your_API_KEY
export LLM_API_BASE="https://api.siliconflow.cn/v1"
export PRIMARY_MODEL="Qwen/Qwen2.5-32B-Instruct"
export SECONDARY_MODEL="Qwen/Qwen2.5-7B-Instruct"
export VL_MODEL="OpenGVLab/InternVL2-26B"
```
@ -129,6 +130,7 @@ AiHubMix 모델을 사용할 때 .env 구성은 다음을 참조할 수 있습
export LLM_API_KEY=Your_API_KEY
export LLM_API_BASE="https://aihubmix.com/v1" # refer to https://doc.aihubmix.com/
export PRIMARY_MODEL="gpt-4o"
export SECONDARY_MODEL="gpt-4o-mini"
export VL_MODEL="gpt-4o"
```

View File

@ -1,15 +1,222 @@
# -*- coding: utf-8 -*-
import asyncio
from loguru import logger
import os, re
from utils.pb_api import PbTalker
from llms.openai_wrapper import openai_llm as llm
# from core.llms.siliconflow_wrapper import sfa_llm # or other llm wrapper
from utils.general_utils import is_chinese, extract_and_convert_dates
from utils.general_utils import is_chinese, extract_and_convert_dates, normalize_url
from .get_info_prompts import *
async def get_author_and_publish_date(text: str, model: str) -> tuple[str, str]:
# File extensions, written WITHOUT a leading dot, that are matched against the
# tail of a URL with str.endswith() to recognise file-like links.
# NOTE(review): because there is no leading dot, a URL that merely ends in
# these letters (e.g. ".../doc") also matches.
common_file_exts = [
    "jpg", "jpeg", "png", "gif", "pdf", "doc", "docx", "svg", "m3u8",
    "mp4", "mp3", "wav", "avi", "mov", "wmv", "flv", "webp", "webm",
    "zip", "rar", "7z", "tar", "gz", "bz2",
    "txt", "csv", "xls", "xlsx", "ppt", "pptx",
    "json", "xml", "yaml", "yml", "css", "js", "php", "asp", "jsp",
]

# Top-level domains, written WITH the leading dot; a URL ending in one of
# these (optionally followed by "/") is treated as a bare site root.
common_tlds = [
    ".com", ".cn", ".net", ".org", ".edu", ".gov", ".io", ".co",
    ".info", ".biz", ".me", ".tv", ".cc", ".xyz", ".app", ".dev",
    ".cloud", ".ai", ".tech", ".online", ".store", ".shop", ".site",
    ".top", ".vip", ".pro", ".ltd", ".group", ".team", ".work",
]
# Pre-process crawled markdown before it is handed to the LLM.
#
# The function rewrites every markdown link / image / bare URL in
# `raw_markdown` into a short reference key ("[N]" for links, "[imgN]" for
# images), records key -> absolute URL in `link_dict`, splits the text on
# '# ' into sections, and classifies each section as a "links part" or a
# "content part" from the ratio computed by check_url_text().
#
# Parameters (as used below):
#   raw_markdown         - markdown text to process.
#   base_url             - passed to normalize_url() to resolve each URL.
#   used_img             - stand-alone images whose raw src is not in this list
#                          are replaced by their alt text only.
#   recognized_img_cache - dict img_src -> text returned by
#                          extract_info_from_img(); read, filled and returned.
#   existing_urls        - only read here (membership test on a link's URL).
#                          NOTE(review): the default is a mutable `set()` shared
#                          between calls; harmless while it is only read.
#   test_mode            - when true, prints diagnostics for each decision.
#
# Returns (link_dict, links_parts, contents, recognized_img_cache).
#
# NOTE(review): indentation is not visible in this view, so the comments below
# describe statements in order and avoid asserting how they are nested.
async def pre_process(raw_markdown: str, base_url: str, used_img: list[str],
recognized_img_cache: dict, existing_urls: set = set(),
test_mode: bool = False) -> tuple[dict, list[str], list[str], dict]:
# Maps reference keys ("[N]" / "[imgN]") to URLs; both kinds of key share the
# same counter, len(link_dict)+1.
link_dict = {}
# for special url format from crawl4ai 0.4.247
# (collapses any <javascript:...> target to the literal <javascript:>)
raw_markdown = re.sub(r'<javascript:.*?>', '<javascript:>', raw_markdown).strip()
# Handle image markers ![alt](src)
i_pattern = r'(!\[(.*?)\]\((.*?)\))'
matches = re.findall(i_pattern, raw_markdown, re.DOTALL)
for _sec, alt, src in matches:
# Replace with the new format §alt||src§
raw_markdown = raw_markdown.replace(_sec, f'§{alt}||{src}§', 1)
# Rewrites the links/images of one section and scores it.
# Returns (ratio, rewritten_text) where ratio = remaining text length / score
# (999 when score is 0).
# NOTE(review): the annotation says tuple[int, str] but ratio comes from a
# true division, so it is normally a float.
async def check_url_text(text) -> tuple[int, str]:
# score counts link-like items; _valid_len is the text length left after
# subtracting the length of those items.
score = 0
_valid_len = len(text.strip())
# Find every fragment of the form [part0](part1)
link_pattern = r'(\[(.*?)\]\((.*?)\))'
matches = re.findall(link_pattern, text, re.DOTALL)
for _sec, link_text, link_url in matches:
# Handle fragments of the form \"***\"
quote_pattern = r'\"(.*?)\"'
# Extract everything wrapped in quotes
_title = ''.join(re.findall(quote_pattern, link_url, re.DOTALL))
_title = _title.strip()
link_text = link_text.strip()
# Prefix the quoted title to the link text when the text does not already contain it.
if _title and _title not in link_text:
link_text = f"{_title} - {link_text}"
# Prefer a URL wrapped in <...>; otherwise use the target with the quoted title removed.
real_url_pattern = r'<(.*?)>'
real_url = re.search(real_url_pattern, link_url, re.DOTALL)
if real_url:
_url = real_url.group(1).strip()
else:
# NOTE(review): re.sub's 4th positional parameter is `count`, not `flags`,
# so re.DOTALL here limits the number of substitutions instead of setting
# the flag; `flags=re.DOTALL` was probably intended.
_url = re.sub(quote_pattern, '', link_url, re.DOTALL).strip()
# Empty, in-page anchor and javascript: targets are not real links: keep the text only.
if not _url or _url.startswith(('#', 'javascript:')):
text = text.replace(_sec, link_text, 1)
continue
score += 1
_valid_len = _valid_len - len(_sec)
url = normalize_url(_url, base_url)
# Separate the content inside §§ from the content after it
img_marker_pattern = r'§(.*?)\|\|(.*?)§'
inner_matches = re.findall(img_marker_pattern, link_text, re.DOTALL)
for alt, src in inner_matches:
link_text = link_text.replace(f'§{alt}||{src}§', '')
# The link text consisted only of image marker(s): derive the text from the first image.
if not link_text and inner_matches:
img_alt = inner_matches[0][0].strip()
img_src = inner_matches[0][1].strip()
if img_src and not img_src.startswith('#'):
img_src = normalize_url(img_src, base_url)
if not img_src:
link_text = img_alt
# Alt text longer than 2 characters, or a link URL found in existing_urls:
# keep the alt text and register the image.
elif len(img_alt) > 2 or url in existing_urls:
_key = f"[img{len(link_dict)+1}]"
link_dict[_key] = img_src
link_text = img_alt + _key
# Image src that ends in a common TLD (optionally followed by '/').
elif any(img_src.endswith(tld) or img_src.endswith(tld + '/') for tld in common_tlds):
_key = f"[img{len(link_dict)+1}]"
link_dict[_key] = img_src
link_text = img_alt + _key
# Image src ending in a listed extension other than jpg/jpeg/png.
elif any(img_src.endswith(ext) for ext in common_file_exts if ext not in ['jpg', 'jpeg', 'png']):
_key = f"[img{len(link_dict)+1}]"
link_dict[_key] = img_src
link_text = img_alt + _key
else:
# Remaining case: ask extract_info_from_img() once per src and reuse the cached result.
if img_src not in recognized_img_cache:
recognized_img_cache[img_src] = await extract_info_from_img(img_src)
_key = f"[img{len(link_dict)+1}]"
link_dict[_key] = img_src
link_text = recognized_img_cache[img_src] + _key
# NOTE(review): presumably pairs with the `if img_src and not img_src.startswith('#')`
# check above - confirm against the indented source.
else:
link_text = img_alt
# Register the link itself and replace the markdown link with "<text>[N]".
_key = f"[{len(link_dict)+1}]"
link_dict[_key] = url
text = text.replace(_sec, link_text + _key, 1)
# Handle the other image markers in the text
img_pattern = r'(§(.*?)\|\|(.*?)§)'
matches = re.findall(img_pattern, text, re.DOTALL)
# NOTE(review): re.DOTALL is passed positionally as `count` here as well.
remained_text = re.sub(img_pattern, '', text, re.DOTALL).strip()
remained_text_len = len(remained_text)
for _sec, alt, src in matches:
# Images with no src, an anchor src, or a src not listed in used_img keep only their alt text.
if not src or src.startswith('#') or src not in used_img:
text = text.replace(_sec, alt, 1)
continue
img_src = normalize_url(src, base_url)
if not img_src:
text = text.replace(_sec, alt, 1)
# More than 5 characters of non-image text remain, or the alt text is longer than 2 characters.
elif remained_text_len > 5 or len(alt) > 2:
_key = f"[img{len(link_dict)+1}]"
link_dict[_key] = img_src
text = text.replace(_sec, alt + _key, 1)
elif any(img_src.endswith(tld) or img_src.endswith(tld + '/') for tld in common_tlds):
_key = f"[img{len(link_dict)+1}]"
link_dict[_key] = img_src
text = text.replace(_sec, alt + _key, 1)
elif any(img_src.endswith(ext) for ext in common_file_exts if ext not in ['jpg', 'jpeg', 'png']):
_key = f"[img{len(link_dict)+1}]"
link_dict[_key] = img_src
text = text.replace(_sec, alt + _key, 1)
else:
if img_src not in recognized_img_cache:
recognized_img_cache[img_src] = await extract_info_from_img(img_src)
_key = f"[img{len(link_dict)+1}]"
link_dict[_key] = img_src
text = text.replace(_sec, recognized_img_cache[img_src] + _key, 1)
# Handle "wild" (bare) URLs in the text
url_pattern = r'((?:https?://|www\.)[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[-A-Za-z0-9+&@#/%=~_|])'
matches = re.findall(url_pattern, text)
for url in matches:
# NOTE(review): `url` is rebound to the normalized form before the replace
# below, so if normalize_url() returns a string different from the matched
# text, text.replace() finds nothing and the bare URL stays in the text.
url = normalize_url(url, base_url)
_key = f"[{len(link_dict)+1}]"
link_dict[_key] = url
text = text.replace(url, _key, 1)
score += 1
_valid_len = _valid_len - len(url)
# Count the number of line breaks
# NOTE(review): the code actually counts occurrences of ' * ', not newline characters.
newline_count = text.count(' * ')
score += newline_count
ratio = _valid_len/score if score != 0 else 999
return ratio, text
sections = raw_markdown.split('# ') # use '# ' to avoid # in url
if len(sections) > 2:
# First section: links per remaining character above 0.05 marks it as navigation.
_sec = sections[0]
# NOTE(review): re.DOTALL is passed positionally as `count` in this re.sub call.
section_remain = re.sub(r'\[.*?]\(.*?\)', '', _sec, re.DOTALL).strip()
section_remain_len = len(section_remain)
total_links = len(re.findall(r'\[.*?]\(.*?\)', _sec, re.DOTALL))
ratio = total_links / section_remain_len if section_remain_len != 0 else 1
if ratio > 0.05:
if test_mode:
print('this is a navigation section, will be removed')
print(ratio)
print(section_remain)
print('-' * 50)
sections = sections[1:]
# Last section: fewer than 198 characters left once links are removed marks it as a footer.
_sec = sections[-1]
# NOTE(review): re.DOTALL is passed positionally as `count` in this re.sub call.
section_remain = re.sub(r'\[.*?]\(.*?\)', '', _sec, re.DOTALL).strip()
section_remain_len = len(section_remain)
if section_remain_len < 198:
if test_mode:
print('this is a footer section, will be removed')
print(section_remain_len)
print(section_remain)
print('-' * 50)
sections = sections[:-1]
links_parts = []
contents = []
for section in sections:
ratio, text = await check_url_text(section)
# A ratio below 70 (text length per link-like item) classifies the section as a list of links.
if ratio < 70:
if test_mode:
print('this is a links part')
print(ratio)
print(text)
print('-' * 50)
links_parts.append(text)
else:
if test_mode:
print('this is a content part')
print(ratio)
print(text)
print('-' * 50)
contents.append(text)
return link_dict, links_parts, contents, recognized_img_cache
# Name of the vision-language model, read once from the environment when the
# module is imported; an empty string means image recognition is skipped.
vl_model = os.getenv("VL_MODEL", "")
if not vl_model:
    print("VL_MODEL not set, will skip extracting info from img, some info may be lost!")
async def extract_info_from_img(url: str) -> str:
    """Ask the vision-language model to transcribe the text in the image at *url*.

    When the module-level ``vl_model`` is empty no request is made and the
    placeholder ``'§to_be_recognized_by_visual_llm§'`` is returned instead.
    Otherwise the raw model output is returned unchanged.
    """
    if vl_model:
        image_part = {"type": "image_url", "image_url": {"url": url, "detail": "high"}}
        # The instruction (in Chinese) asks for all text in the image, or "NA"
        # when the image has little/no text or is just a logo, trademark or icon.
        text_part = {"type": "text", "text": "提取图片中的所有文字如果图片不包含文字或者文字很少或者你判断图片仅是网站logo、商标、图标等则输出NA。注意请仅输出提取出的文字不要输出别的任何内容。"}
        return await llm([{"role": "user", "content": [image_part, text_part]}],
                         model=vl_model)
    return '§to_be_recognized_by_visual_llm§'
async def get_author_and_publish_date(text: str, model: str, test_mode: bool = False, _logger: logger = None) -> tuple[str, str]:
if not text:
return "", ""
@ -19,245 +226,122 @@ async def get_author_and_publish_date(text: str, model: str) -> tuple[str, str]:
if len(text) > 2048:
text = f'{text[:2048]}......'
system_prompt = "As an information extraction assistant, your task is to accurately extract the source (or author) and publication date from the given webpage text. It is important to adhere to extracting the information directly from the original text. If the original text does not contain a particular piece of information, please replace it with NA"
suffix = '''Please output the extracted information in the following format(output only the result, no other content):
"""source or article author (use "NA" if this information cannot be extracted)//extracted publication date (keep only the year, month, and day; use "NA" if this information cannot be extracted)"""'''
content = f'<text>\n{text}\n</text>\n\n{suffix}'
llm_output = await llm([{'role': 'system', 'content': system_prompt}, {'role': 'user', 'content': content}],
model=model, max_tokens=50, temperature=0.1)
content = f'<text>\n{text}\n</text>\n\n{get_ap_suffix}'
llm_output = await llm([{'role': 'system', 'content': get_ap_system}, {'role': 'user', 'content': content}],
model=model, max_tokens=50, temperature=0.1)
if test_mode:
print(f"llm output:\n {llm_output}")
ap_ = llm_output.strip().strip('"').strip('//')
if '//' not in ap_:
print(f"failed to parse from llm output: {ap_}")
if _logger:
_logger.warning(f"failed to parse from llm output: {ap_}")
return '', ''
ap = ap_.split('//')
return ap[0], extract_and_convert_dates(ap[1])
# Batch form of the image recogniser: for each URL in `task`, sends the image
# to the model named by `vl_model` (one awaited call per URL) and stores the
# raw model output in a dict keyed by that URL, which is what gets returned.
# NOTE(review): indentation is not visible in this view; `return cache` is
# presumably after the loop, not inside it - confirm.
async def extract_info_from_img(task: list, vl_model: str) -> dict:
cache = {}
for url in task:
# The Chinese instruction asks the model to output all text found in the
# image, or "NA" when the image has little/no text or is just a logo,
# trademark or icon, and to output nothing else.
llm_output = await llm([{"role": "user",
"content": [{"type": "image_url", "image_url": {"url": url, "detail": "high"}},
{"type": "text", "text": "提取图片中的所有文字如果图片不包含文字或者文字很少或者你判断图片仅是网站logo、商标、图标等则输出NA。注意请仅输出提取出的文字不要输出别的任何内容。"}]}],
model=vl_model)
cache[url] = llm_output
return cache
class GeneralInfoExtractor:
def __init__(self, pb: PbTalker, _logger: logger) -> None:
self.pb = pb
self.logger = _logger
self.model = os.environ.get("PRIMARY_MODEL", "")
if not self.model:
self.logger.error("PRIMARY_MODEL not set, can't continue")
raise ValueError("PRIMARY_MODEL not set, please set it in environment variables or edit core/.env")
# collect tags user set in pb database and determine the system prompt language based on tags
focus_data = pb.read(collection_name='focus_points', filter=f'activated=True')
if not focus_data:
self.logger.info('no activated tag found, will ask user to create one')
focus = input('It seems you have not set any focus point, WiseFlow need the specific focus point to guide the following info extract job.'
'so please input one now. describe what info you care about shortly: ')
explanation = input('Please provide more explanation for the focus point (if not necessary, pls just type enter: ')
focus_data.append({"focuspoint": focus, "explanation": explanation,
"id": pb.add('focus_points', {"focuspoint": focus, "explanation": explanation})})
# self.focus_list = [item["focuspoint"] for item in focus_data]
self.focus_dict = {item["focuspoint"]: item["id"] for item in focus_data}
focus_statement = ''
for item in focus_data:
tag = item["focuspoint"]
expl = item["explanation"]
focus_statement = f"{focus_statement}//{tag}//\n"
if expl:
if is_chinese(expl):
focus_statement = f"{focus_statement}解释:{expl}\n"
else:
focus_statement = f"{focus_statement}Explanation: {expl}\n"
if is_chinese(focus_statement):
self.get_info_prompt = f'''你将被给到一段使用<text></text>标签包裹的网页文本,请分别按如下关注点对网页文本提炼摘要。关注点列表及其解释如下:
{focus_statement}\n
在提炼摘要时请遵循以下原则
- 理解每个关注点的含义以及进一步的解释如有确保摘要与关注点强相关并符合解释如有的范围
- 摘要应当详实充分使用简体中文如果原文是英文请翻译成简体中文
- 摘要信息务必忠于原文'''
self.get_info_suffix = '''请对关注点逐一生成摘要,不要遗漏任何关注点,如果网页文本与关注点无关,可以对应输出"NA"。输出结果整体用三引号包裹,三引号内不要有其他内容。如下是输出格式示例:
"""
//关注点1//
摘要1
//关注点2//
摘要2
//关注点3//
NA
...
"""'''
self.get_more_link_prompt = f'''你将被给到数行格式为"<编号>//内容//"的文本,你的任务是逐条分析这些文本,并分别与如下关注点之一相关联。关注点列表及其解释如下:
{focus_statement}\n
在进行关联分析时请遵循以下原则
- 理解每个关注点的含义
- 如果关注点有进一步的解释确保提取的内容符合这些解释的范围'''
self.get_more_link_suffix = '''请分行逐条输出结果,每一条的输出格式为"<编号>//关注点名称//",如果某条内容不与任何关注点相关,请输出"<编号>//NA//"。输出结果整体用三引号包裹,三引号内不要有其他内容。如下是输出格式示例:
"""
<t1>//关注点1名称//
<t2>//关注点2名称//
<t3>//NA//
...
"""'''
else:
self.get_info_prompt = f'''You will be given a webpage text wrapped in <text></text> tags. Please extract summaries from the text according to the following focus points. The list of focus points and their explanations are as follows:
{focus_statement}\n
When extracting summaries, please follow these principles:
- Understand the meaning of each focus point and its explanation (if any), ensure the summary strongly relates to the focus point and aligns with the explanation (if any)
- The summary should be detailed and comprehensive
- The summary should be faithful to the original text'''
self.get_info_suffix = '''Please generate summaries for each focus point, don't miss any focus points. If the webpage text is not related to a focus point, output "NA" for that point. The entire output should be wrapped in triple quotes with no other content inside. Here is an example of the output format:
"""
//Focus Point 1//
Summary 1
//Focus Point 2//
Summary 2
//Focus Point 3//
NA
...
"""'''
self.get_more_link_prompt = f'''You will be given several lines of text in the format "<index>//content//". Your task is to analyze each line and associate it with one of the following focus points. The list of focus points and their explanations are as follows:
{focus_statement}\n
When performing the association analysis, please follow these principles:
- Understand the meaning of each focus point
- If a focus point has further explanation, ensure the extracted content aligns with the scope of these explanations'''
self.get_more_link_suffix = '''Please output the results line by line. Each line should be in the format "<index>//focus point name//". If a line is not related to any focus point, output "<index>//NA//". The entire output should be wrapped in triple quotes with no other content inside. Here is an example of the output format:
"""
<t1>//Focus Point 1//
<t2>//Focus Point 2//
<t3>//NA//
...
"""'''
async def _generate_results(self, lines: list, mode: str) -> set:
if mode == 'get_info':
system_prompt = self.get_info_prompt
suffix = self.get_info_suffix
batch_size = 5000
elif mode == 'get_link':
system_prompt = self.get_more_link_prompt
suffix = self.get_more_link_suffix
batch_size = 2048
else:
self.logger.error(f"unknown mode: {mode}")
return set()
cache = set()
batches = []
text_batch = ''
for line in lines:
text_batch += f'{line}\n'
if len(text_batch) > batch_size:
content = f'<text>\n{text_batch}</text>\n\n{suffix}'
batches.append({'system_prompt': system_prompt, 'content': content})
text_batch = ''
if text_batch:
async def get_more_related_urls(texts: list[str], link_dict: dict, prompts: list[str], test_mode: bool = False,
_logger: logger = None) -> set:
sys_prompt, suffix, model = prompts
text_batch = ''
cache = set()
while texts:
t = texts.pop(0)
text_batch = f'{text_batch}{t}\n\n'
if len(text_batch) > 2048 or len(texts) == 0:
content = f'<text>\n{text_batch}</text>\n\n{suffix}'
batches.append({'system_prompt': system_prompt, 'content': content})
result = await llm(
[{'role': 'system', 'content': sys_prompt}, {'role': 'user', 'content': content}],
model=model, temperature=0.1)
self.logger.info(f"LLM tasks size: {len(batches)}")
tasks = [
llm(
[{'role': 'system', 'content': batch['system_prompt']}, {'role': 'user', 'content': batch['content']}],
model=self.model, temperature=0.1
)
for batch in batches]
results = await asyncio.gather(*tasks)
for res in results:
if res:
extracted_result = re.findall(r'\"\"\"(.*?)\"\"\"', res, re.DOTALL)
if extracted_result:
cache.add(extracted_result[-1])
result = re.findall(r'\"\"\"(.*?)\"\"\"', result, re.DOTALL)
if test_mode:
print(f"llm output:\n {result}")
if result:
links = re.findall(r'\[\d+\]', result[-1])
for link in links:
if link not in text_batch:
if _logger:
_logger.warning(f"model generating hallucination:\n{result[-1]}")
if test_mode:
print(f"model hallucination:\n{result[-1]}")
continue
cache.add(link)
text_batch = ''
return cache
more_urls = set()
for mark in cache:
url = link_dict[mark]
has_common_ext = any(url.endswith(ext) for ext in common_file_exts)
has_common_tld = any(url.endswith(tld) or url.endswith(tld + '/') for tld in common_tlds)
if has_common_ext or has_common_tld:
continue
more_urls.add(url)
return more_urls
async def get_more_related_urls(self, link_dict: dict) -> set:
_to_be_processed = []
link_map = {}
for i, (url, des) in enumerate(link_dict.items()):
des = des.replace('\n', ' ')
_to_be_processed.append(f'<t{i+1}>//{des}//')
link_map[f'<t{i+1}'] = url
async def get_info(texts: list[str], link_dict: dict, prompts: list[str], focus_dict: dict, author: str, publish_date: str,
test_mode: bool = False, _logger: logger = None) -> list[dict]:
raw_result = await self._generate_results(_to_be_processed, 'get_link')
final_result = set()
for result in raw_result:
for item in result.split('\n'):
if not item:
continue
segs = item.split('>')
if len(segs) != 2:
self.logger.debug(f"bad generate result: {item}")
continue
_index, focus = segs
_index = _index.strip()
focus = focus.strip().strip('//')
if focus == 'NA':
continue
if focus not in self.focus_dict or _index not in link_map:
self.logger.debug(f"bad generate result: {item}")
continue
# self.logger.debug(f"{link_map[_index]} selected")
final_result.add(link_map[_index])
return final_result
sys_prompt, suffix, model = prompts
async def get_info(self, text: str, text_links: dict, info_pre_fix: str) -> list[dict]:
raw_result = await self._generate_results(text.split('\n'), 'get_info')
final = []
for item in raw_result:
self.logger.debug(f"llm output:\n{item}")
segs = item.split('//')
i = 0
while i < len(segs) - 1:
focus = segs[i].strip()
if not focus:
i += 1
continue
if focus not in self.focus_dict:
self.logger.debug(f"bad generate result: {item}")
i += 1
continue
content = segs[i+1].strip().strip('摘要').strip(':').strip('')
i += 2
if not content or content == 'NA':
continue
"""
maybe can use embedding retrieval to judge
"""
if test_mode:
info_pre_fix = ''
else:
info_pre_fix = f"//{author} {publish_date}//"
url_tags = re.findall(r'\[(Ref_\d+)]', content)
refences = {url_tag: text_links[url_tag] for url_tag in url_tags if url_tag in text_links}
cache = set()
batches = []
text_batch = ''
while texts:
t = texts.pop(0)
text_batch = f'{text_batch}{t}# '
if len(text_batch) > 9999 or len(texts) == 0:
content = f'<text>\n{text_batch}</text>\n\n{suffix}'
batches.append(content)
text_batch = ''
final.append({'tag': self.focus_dict[focus], 'content': f"{info_pre_fix}{content}", 'references': refences})
tasks = [
llm([{'role': 'system', 'content': sys_prompt}, {'role': 'user', 'content': content}], model=model, temperature=0.1)
for content in batches]
results = await asyncio.gather(*tasks)
for res in results:
if test_mode:
print(f"llm output:\n {res}")
extracted_result = re.findall(r'\"\"\"(.*?)\"\"\"', res, re.DOTALL)
if extracted_result:
cache.add(extracted_result[-1])
final = []
for item in cache:
segs = item.split('//')
i = 0
while i < len(segs) - 1:
focus = segs[i].strip()
if not focus:
i += 1
continue
if focus not in focus_dict:
if _logger:
_logger.info(f"llm hallucination: {item}")
if test_mode:
print(f"llm hallucination: {item}")
i += 1
continue
content = segs[i+1].strip().strip('摘要').strip(':').strip('')
i += 2
if not content or content == 'NA':
continue
"""
maybe can use embedding retrieval to judge
"""
url_tags = re.findall(r'\[\d+\]', content)
refences = {url_tag: link_dict[url_tag] for url_tag in url_tags if url_tag in link_dict}
final.append({'tag': focus_dict[focus], 'content': f"{info_pre_fix}{content}", 'references': refences})
return final
async def __call__(self, link_dict: dict, text: str, text_links: dict, author: str, publish_date: str) -> tuple[set, list]:
info_prefix = f"//{author} {publish_date}//"
return await self.get_more_related_urls(link_dict), await self.get_info(text, text_links, info_prefix)
return final

View File

@ -0,0 +1,74 @@
# Prompt templates. These are plain (non-f) strings: the literal text
# "{focus_statement}" is a placeholder that callers fill in with str.replace().

# Chinese system prompt: extract the original-text fragments of the page that
# relate to any focus point, keeping reference markers such as "[3]".
get_link_system = '''你将被给到一段使用<text></text>标签包裹的网页文本,你的任务是从前到后仔细阅读文本,提取出与如下任一关注点相关的原文片段。关注点及其解释如下:
{focus_statement}\n
在进行提取时请遵循以下原则
- 理解关注点的含义以及进一步的解释如有确保提取的内容与关注点强相关并符合解释如有的范围
- 在满足上面原则的前提下提取出全部可能相关的片段
- 提取出的原文片段务必保留类似"[3]"这样的引用标记后续的处理需要用到这些引用标记'''
# Chinese output-format suffix for the fragment extraction: fragments listed
# one per line, the whole output wrapped in triple double-quotes.
get_link_suffix = '''请逐条输出提取的原文片段,并整体用三引号包裹。三引号内除了提取出的原文片段外不要有其他内容,如果文本中不包含任何与关注点相关的内容则保持三引号内为空。
如下是输出格式示例
"""
原文片段1
原文片段2
...
"""'''
# English counterpart of get_link_system.
get_link_system_en = '''You will be given a webpage text wrapped in <text></text> tags. Your task is to carefully read the text from beginning to end, extracting fragments related to any of the following focus points. The focus points and their explanations are as follows:
{focus_statement}\n
When extracting fragments, please follow these principles:
- Understand the meaning of each focus point and its explanation (if any), ensure the extracted content strongly relates to the focus point and aligns with the explanation (if any)
- Extract all possible related fragments
- Ensure the extracted fragments retain the reference markers like "[3]", as these will be used in subsequent processing'''
# English counterpart of get_link_suffix.
get_link_suffix_en = '''Please output each extracted fragment one by one, and wrap the entire output in triple quotes. The triple quotes should contain only the extracted fragments, with no other content. If the text does not contain any content related to the focus points, keep the triple quotes empty.
Here is an example of the output format:
"""
Fragment 1
Fragment 2
...
"""'''
# Chinese system prompt: summarise the page separately for each focus point,
# in Simplified Chinese.
get_info_system = '''你将被给到一段使用<text></text>标签包裹的网页文本,请分别按如下关注点对网页文本提炼摘要。关注点列表及其解释如下:
{focus_statement}\n
在提炼摘要时请遵循以下原则
- 理解每个关注点的含义以及进一步的解释如有确保摘要与关注点强相关并符合解释如有的范围
- 摘要应当详实充分使用简体中文如果原文是英文请翻译成简体中文
- 摘要信息务必忠于原文'''
# Chinese output-format suffix for the summaries: "//focus point//" header
# followed by its summary (or "NA"), all wrapped in triple double-quotes.
get_info_suffix = '''请对关注点逐一生成摘要,不要遗漏任何关注点,如果网页文本与关注点无关,可以对应输出"NA"。输出结果整体用三引号包裹,三引号内不要有其他内容。如下是输出格式示例:
"""
//关注点1//
摘要1
//关注点2//
摘要2
//关注点3//
NA
...
"""'''
# English counterpart of get_info_system (no target-language instruction).
get_info_system_en = '''You will be given a webpage text wrapped in <text></text> tags. Please extract summaries from the text according to the following focus points. The list of focus points and their explanations are as follows:
{focus_statement}\n
When extracting summaries, please follow these principles:
- Understand the meaning of each focus point and its explanation (if any), ensure the summary strongly relates to the focus point and aligns with the explanation (if any)
- The summary should be detailed and comprehensive
- The summary should be faithful to the original text'''
# English counterpart of get_info_suffix.
get_info_suffix_en = '''Please generate summaries for each focus point, don't miss any focus points. If the webpage text is not related to a focus point, output "NA" for that point. The entire output should be wrapped in triple quotes with no other content inside. Here is an example of the output format:
"""
//Focus Point 1//
Summary 1
//Focus Point 2//
Summary 2
//Focus Point 3//
NA
...
"""'''
# System prompt for extracting the source/author and the publication date.
get_ap_system = "As an information extraction assistant, your task is to accurately extract the source (or author) and publication date from the given webpage text. It is important to adhere to extracting the information directly from the original text. If the original text does not contain a particular piece of information, please replace it with NA"
# Output-format suffix for the author/date extraction: "author//date", with
# "NA" standing in for whatever cannot be extracted.
get_ap_suffix = '''Please output the extracted information in the following format(output only the result, no other content):
"""source or article author (use "NA" if this information cannot be extracted)//extracted publication date (keep only the year, month, and day; use "NA" if this information cannot be extracted)"""'''

View File

@ -1,11 +1,10 @@
# -*- coding: utf-8 -*-
from utils.pb_api import PbTalker
from utils.general_utils import get_logger, extract_and_convert_dates
from utils.deep_scraper import *
from utils.general_utils import get_logger, extract_and_convert_dates, is_chinese
from agents.get_info import *
import json
import asyncio
from custom_fetchings import *
from scrapers import *
from urllib.parse import urlparse
from crawl4ai import AsyncWebCrawler, CacheMode
from datetime import datetime, timedelta
@ -19,18 +18,14 @@ if project_dir:
wiseflow_logger = get_logger('general_process', project_dir)
pb = PbTalker(wiseflow_logger)
gie = GeneralInfoExtractor(pb, wiseflow_logger)
one_month_ago = (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')
existing_urls = {url['url'] for url in pb.read(collection_name='infos', fields=['url'], filter=f"created>='{one_month_ago}'")}
llm_model = os.environ.get("PRIMARY_MODEL", "")
vl_model = os.environ.get("VL_MODEL", "")
if not vl_model:
wiseflow_logger.warning("VL_MODEL not set, will skip extracting info from img, some info may be lost!")
img_to_be_recognized_pattern = r'§to_be_recognized_by_visual_llm_(.*?)§'
recognized_img_cache = {}
crawler = AsyncWebCrawler(verbose=False)
# Primary LLM name, read from the environment; the module refuses to load
# (ValueError) when it is unset or empty.
model = os.environ.get("PRIMARY_MODEL", "")
if not model:
raise ValueError("PRIMARY_MODEL not set, please set it in environment variables or edit core/.env")
# Secondary LLM name; falls back to the primary model when SECONDARY_MODEL is not set.
secondary_model = os.environ.get("SECONDARY_MODEL", model)
async def save_to_pb(url: str, url_title: str, infos: list):
# saving to pb process
@ -46,112 +41,142 @@ async def save_to_pb(url: str, url_title: str, infos: list):
async def main_process(_sites: set | list):
# collect tags user set in pb database and determine the system prompt language based on tags
focus_data = pb.read(collection_name='focus_points', filter=f'activated=True')
if not focus_data:
wiseflow_logger.info('no activated tag found, will ask user to create one')
focus = input('It seems you have not set any focus point, WiseFlow need the specific focus point to guide the following info extract job.'
'so please input one now. describe what info you care about shortly: ')
explanation = input('Please provide more explanation for the focus point (if not necessary, pls just press enter: ')
focus_data.append({"focuspoint": focus, "explanation": explanation,
"id": pb.add('focus_points', {"focuspoint": focus, "explanation": explanation})})
focus_dict = {item["focuspoint"]: item["id"] for item in focus_data}
focus_statement = ''
for item in focus_data:
tag = item["focuspoint"]
expl = item["explanation"]
focus_statement = f"{focus_statement}//{tag}//\n"
if expl:
if is_chinese(expl):
focus_statement = f"{focus_statement}解释:{expl}\n"
else:
focus_statement = f"{focus_statement}Explanation: {expl}\n"
date_stamp = datetime.now().strftime('%Y-%m-%d')
if is_chinese(focus_statement):
get_link_sys_prompt = get_link_system.replace('{focus_statement}', focus_statement)
get_link_sys_prompt = f"今天的日期是{date_stamp}{get_link_sys_prompt}"
get_link_suffix_prompt = get_link_suffix
get_info_sys_prompt = get_info_system.replace('{focus_statement}', focus_statement)
get_info_sys_prompt = f"今天的日期是{date_stamp}{get_info_sys_prompt}"
get_info_suffix_prompt = get_info_suffix
else:
get_link_sys_prompt = get_link_system_en.replace('{focus_statement}', focus_statement)
get_link_sys_prompt = f"today is {date_stamp}, {get_link_sys_prompt}"
get_link_suffix_prompt = get_link_suffix_en
get_info_sys_prompt = get_info_system_en.replace('{focus_statement}', focus_statement)
get_info_sys_prompt = f"today is {date_stamp}, {get_info_sys_prompt}"
get_info_suffix_prompt = get_info_suffix_en
recognized_img_cache = {}
working_list = set()
working_list.update(_sites)
async with AsyncWebCrawler(headless=True, verbose=False) as crawler:
while working_list:
url = working_list.pop()
existing_urls.add(url)
has_common_ext = any(url.lower().endswith(ext) for ext in common_file_exts)
if has_common_ext:
wiseflow_logger.info(f'{url} is a common file, skip')
continue
await crawler.start()
while working_list:
url = working_list.pop()
existing_urls.add(url)
has_common_ext = any(url.lower().endswith(ext) for ext in common_file_exts)
if has_common_ext:
wiseflow_logger.info(f'{url} is a common file, skip')
continue
parsed_url = urlparse(url)
existing_urls.add(f"{parsed_url.scheme}://{parsed_url.netloc}")
existing_urls.add(f"{parsed_url.scheme}://{parsed_url.netloc}/")
domain = parsed_url.netloc
if domain in custom_scrapers:
wiseflow_logger.debug(f'{url} is a custom scraper, use custom scraper')
raw_markdown, metadata_dict, media_dict = custom_scrapers[domain](url)
else:
crawl4ai_cache_mode = CacheMode.WRITE_ONLY if url in _sites else CacheMode.ENABLED
result = await crawler.arun(url=url, delay_before_return_html=2.0, wait_until='commit',
magic=True, scan_full_page=True,
cache_mode=crawl4ai_cache_mode)
if not result.success:
wiseflow_logger.warning(f'{url} failed to crawl, destination web cannot reach, skip')
continue
parsed_url = urlparse(url)
existing_urls.add(f"{parsed_url.scheme}://{parsed_url.netloc}")
existing_urls.add(f"{parsed_url.scheme}://{parsed_url.netloc}/")
domain = parsed_url.netloc
if domain in custom_fetching_configs:
wiseflow_logger.debug(f'{url} will using custom crawl4ai run config')
run_config = custom_fetching_configs[domain]
else:
run_config = crawler_config
run_config.cache_mode = CacheMode.WRITE_ONLY if url in _sites else CacheMode.ENABLED
result = await crawler.arun(url=url, config=run_config)
if not result.success:
wiseflow_logger.warning(f'{url} failed to crawl, destination web cannot reach, skip')
continue
metadata_dict = result.metadata if result.metadata else {}
raw_markdown = result.markdown
if not raw_markdown:
wiseflow_logger.warning(f'{url} no content, something during fetching failed, skip')
continue
metadata_dict = result.metadata if result.metadata else {}
media_dict = result.media if result.media else {}
if domain in custom_scrapers:
result = custom_scrapers[domain](result)
raw_markdown = result.content
used_img = result.images
title = result.title
base_url = result.base
author = result.author
publish_date = result.publish_date
else:
raw_markdown = result.markdown
media_dict = result.media if result.media else {}
used_img = [d['src'] for d in media_dict.get('images', [])]
title = ''
base_url = ''
author = ''
publish_date = ''
web_title = metadata_dict.get('title', '')
if not raw_markdown:
wiseflow_logger.warning(f'{url} no content, something during fetching failed, skip')
continue
if not title:
title = metadata_dict.get('title', '')
if not base_url:
base_url = metadata_dict.get('base', '')
if not base_url:
base_url = url
if not base_url:
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
if not base_url.endswith('/'):
# 如果路径不以 / 结尾,则去掉最后一个路径段
base_url = base_url.rsplit('/', 1)[0] + '/'
if not author:
author = metadata_dict.get('author', '')
publish_date = extract_and_convert_dates(metadata_dict.get('publish_date', ''))
if not publish_date:
publish_date = metadata_dict.get('publish_date', '')
img_dict = media_dict.get('images', [])
if not img_dict or not isinstance(img_dict, list):
used_img = []
else:
used_img = [d['src'] for d in img_dict]
link_dict, links_parts, contents, recognized_img_cache = await pre_process(raw_markdown, base_url, used_img, recognized_img_cache, existing_urls)
link_dict, (text, reference_map) = deep_scraper(raw_markdown, base_url, used_img)
_duplicate_url = set(link_dict.keys()) & existing_urls
for _d in _duplicate_url:
del link_dict[_d]
if link_dict and links_parts:
prompts = [get_link_sys_prompt, get_link_suffix_prompt, secondary_model]
links_texts = []
for _parts in links_parts:
links_texts.extend(_parts.split('\n\n'))
more_url = await get_more_related_urls(links_texts, link_dict, prompts, _logger=wiseflow_logger)
if more_url:
working_list.update(more_url - existing_urls)
if not contents:
continue
to_be_replaces = {}
for u, des in link_dict.items():
matches = re.findall(img_to_be_recognized_pattern, des)
if matches:
for img_url in matches:
if img_url in recognized_img_cache:
link_dict[u] = des.replace(f'§to_be_recognized_by_visual_llm_{img_url}§', recognized_img_cache[img_url])
continue
link_dict[u] = des.replace(f'§to_be_recognized_by_visual_llm_{img_url}§', img_url)
if img_url in to_be_replaces:
to_be_replaces[img_url].append(u)
else:
to_be_replaces[img_url] = [u]
matches = re.findall(img_to_be_recognized_pattern, text)
if matches:
for img_url in matches:
if f'h{img_url}' in recognized_img_cache:
text = text.replace(f'§to_be_recognized_by_visual_llm_{img_url}§', recognized_img_cache[f'h{img_url}'])
continue
text = text.replace(f'§to_be_recognized_by_visual_llm_{img_url}§', f'h{img_url}')
img_url = f'h{img_url}'
if img_url in to_be_replaces:
to_be_replaces[img_url].append("content")
else:
to_be_replaces[img_url] = ["content"]
if not author or author.lower() == 'na' or not publish_date or publish_date.lower() == 'na':
author, publish_date = await get_author_and_publish_date(raw_markdown, model, _logger=wiseflow_logger)
recognized_result = await extract_info_from_img(list(to_be_replaces.keys()), vl_model)
wiseflow_logger.debug(f'total {len(recognized_result)} imgs be recognized')
recognized_img_cache.update({key: value for key, value in recognized_result.items() if value.strip()})
for img_url, content in recognized_result.items():
for u in to_be_replaces[img_url]:
if u == "content":
text = text.replace(img_url, content)
else:
link_dict[u] = link_dict[u].replace(img_url, content)
if not author or author.lower() == 'na' or not publish_date or publish_date.lower() == 'na':
author, publish_date = await get_author_and_publish_date(text, llm_model)
wiseflow_logger.debug(f'get author and publish date by llm: {author}, {publish_date}')
if not author or author.lower() == 'na':
author = parsed_url.netloc
if not publish_date:
publish_date = datetime.now().strftime('%Y-%m-%d')
more_urls, infos = await gie(link_dict, text, reference_map, author, publish_date)
wiseflow_logger.debug(f'get {len(more_urls)} more urls and {len(infos)} infos')
if more_urls:
working_list.update(more_urls - existing_urls)
if infos:
await save_to_pb(url, web_title, infos)
if not author or author.lower() == 'na':
author = parsed_url.netloc
if publish_date:
publish_date = extract_and_convert_dates(publish_date)
else:
publish_date = date_stamp
prompts = [get_info_sys_prompt, get_info_suffix_prompt, model]
infos = await get_info(contents, link_dict, prompts, focus_dict, author, publish_date, _logger=wiseflow_logger)
if infos:
await save_to_pb(url, title, infos)
await crawler.close()
if __name__ == '__main__':
sites = pb.read('sites', filter='activated=True')
wiseflow_logger.info('execute all sites one time')
asyncio.run(main_process([site['url'] for site in sites]))
asyncio.run(main_process([site['url'].rstrip('/') for site in sites]))

View File

@ -1,6 +1,6 @@
import os
from openai import AsyncOpenAI as OpenAI
from openai import RateLimitError
# from openai import RateLimitError
import asyncio
base_url = os.environ.get('LLM_API_BASE', "")
@ -30,7 +30,7 @@ async def openai_llm(messages: list, model: str, logger=None, **kwargs) -> str:
try:
response = await client.chat.completions.create(messages=messages, model=model, **kwargs)
resp = response.choices[0].message.content
except RateLimitError as e:
except Exception as e:
if logger:
logger.warning(f'{e}\nRetrying in 60 second...')
else:
@ -44,13 +44,6 @@ async def openai_llm(messages: list, model: str, logger=None, **kwargs) -> str:
logger.error(f'after many try, llm error: {response}')
else:
print(f'after many try, llm error: {response}')
except Exception as e:
if logger:
logger.error(f'openai_llm error: {e}')
else:
print(f'openai_llm error: {e}')
finally:
semaphore.release()

View File

@ -1,10 +1,10 @@
## 配置自定义 Crawl4ai 抓取 config
# 配置自定义 Crawl4ai 抓取 config
如果信源需要对应特殊的抓取配置,可以在 `core/scrapers/__init__.py` 中编辑对应的 crawler_config并在 `custom_fetching_configs` 中注册。
## 解析器Scraper
# 解析器Scraper
对于从网页内容中提取关注信息这一任务而言,直接把 html 编码送给 llm 并不是一个好主意。在该类型任务中,我们期待 llm 表现的类似人类,侧重点在于内容的理解,而不是 html 的解析。且不说直接送入 html 编码还会造成额外(非常大量)的 token 消耗和处理效率的降低。
对于从网页内容中提取关注信息这一任务而言,直接把 html 编码送给 llm 并不是一个好主意,这会极大的增加提取任务的复杂度,引入更多干扰,并且产生额外(非常大量)的 token 消耗和处理效率的降低。
将 html 转为易于意思理解的 markdown 是目前领域内比较通用的做法,这方面 Crawl4ai 提供了比较成熟的解决方案。
@ -12,50 +12,24 @@
简单的说,解析器的作用就是将 html 编码转为 markdown 文本,并在这个过程中尽量过滤不必要信息(因为后一步是通过 llm 进行提炼,所以这一步要求不高),但也尽可能的保留 html 版面布局信息(这很重要)。
### deep_scraper
你并不需要通过解析器完成最终的信息提取,这个工作最终还是会使用 llm 完成——甚至在这之前我们还有一个被称为pre-process的步骤它的主要功能是将待处理的文章 markdown 合理切块并将 url 和图片等进行合理的转化,事实上,这个模块是本项目的一大创新点——解析器只需要提供适合 pre-process 的 markdown(我们称为 raw_markdown)和有价值的图片列表即可。
我们进一步发现,直接将 markdown 全文送入 llm 解析也存在缺陷。
## 自定义解析器
我在这里仅举一个例子
scraper 输入的 fetch_result 为一个 dict 或者是 crawl4ai 的 CrawlResult 对象,它包含如下字段
*很多网站喜欢在文章页面底部或者侧边栏加入推荐阅读板块,如果说这些推荐阅读只是链接列表还好,但事实上,很多时候他们还包括内容简介,这些简介的长度并不短,甚至有可能跟页面主体正文长度相当。这个时候如果我们将 markdown 整体扔给 llm就会发现很难为llm 指定明确的工作策略——如果直接舍弃这些推荐阅读内容(先不说很难指定清晰的舍弃策略),但我们不能保证这里面不包含关注点内容;而如果保留这些内容,那么很可能 llm 就无法聚焦该页面的核心内容。或者 llm 会从这些简介中进行信息提取,但是这些简介对应额外的链接,这些后续的链接也会在后面进行爬取,这就可能带来提取出大量重复信息的情况。*
- url: str, 网页的 url
- html: str, 网页的 html 编码
- cleaned_html: str, 经过清洗的 html 编码
- markdown: str, 经过清洗的 markdown 编码
- media: dict, 包含图片、视频、音频等媒体信息
- metadata: dict, 包含网页的元数据,如标题、作者、发布时间等
事实上,这里我们需要做的工作是分块,这有点类似 RAG 系统中的 chunk ,但不同的是,这里我们不需要考虑 chunk 的粒度,而是需要考虑页面布局的粒度。因为我们面对的是 html 页面,而不是 pdf、word……
scraper 的输出为 ScraperResultData具体见 `core/scrapers/scraper_data.py`
这一点很重要,我们需要按 html 的页面布局进行分块,而不是按语义逻辑分块!因为这影响了后续我们如何判断对不同的块采用合同提取策略。这也就是 wiseflow 为何不使用已有的文档智能工具,而是自写了 deep_scraper 的原因。
## 注册自定义解析器
当然,另一个选择是直接使用视觉大模型进行 layout 的识别,但实践中我们也发现,这需要能够获取不受干扰的网页截图,但这个操作会极大增加系统复杂度以及降低处理速度,且效果并不稳定(比如对于页面弹窗的处理……)。
另一个不使用文档智能和视觉大模型的原因,是因为相比于 pdf、word 这种完全的非结构数据, html 编码本身就已经包含了全部 layout 信息,转化为 markdown 的过程实际上也保留了这些信息(通过\n # 这些符号),所以直接通过一定的规则对 markdown 进行分块并分别处理是可行的。
这就是 wiseflow deep_scraper 的主要功能归纳起来1、按布局信息对markdown进行分块2、分析每个块的类型并按不同策略进行预处理便于最终 llm 的提取。
### 注册自定义解析器
wiseflow 的默认工作流程是:
*crawl4ai 获取 html并初步转化为raw_markdown此过程应用默认的 config --> deep_scraper 进行分块处理 --> 分块后的内容 送入 llm 进行信息提取。*
如前所言,如果需要为特定信源配置特殊的 crawl4ai 获取策略(包括 raw_markdown 的转化策略),可以在 `core/scrapers/__init__.py` 中注册自定义的crawler_config
同时也可以为特定信源配置自定义的 scraper自定义 scraper 的输入为crawl4ai的fetching_result输出为将要被送入 llm 进行分析的链接字典和文本块列表。使用自定义 scraper 时wiseflow 的处理流程为:
*crawl4ai 获取 html并初步转化为raw_markdown此过程应用默认的 config或指定 config --> 自定义 scraper 进行分块处理 --> 分块后的内容 送入 llm 进行信息提取。*
自定义 scraper 可以内部调用deep_scraper作为后处理流程如mp_scraper也可以完全自定义全部流程。
scraper 输入的 fetch_result 为一个 dict格式如下
输出为 ScraperResultData包含 url、content、links、images 四个字段。
`core/scrapers/__init__.py` 中注册,参考:
编写好 scraper 后,在 `core/scrapers/__init__.py` 中注册,参考:
```python
from .mp import mp_scraper

View File

@ -1,15 +1,43 @@
## Custom Scraper Registration
# Configure Custom Crawl4ai Fetching Config
Register in `core/scrapers/__init__.py`, for example:
If a source requires special fetching configuration, you can edit the corresponding crawler_config in `core/scrapers/__init__.py` and register it in `custom_fetching_configs`.
# Scraper
For the task of extracting focused information from web content, directly feeding HTML code to LLM is not a good idea. This would greatly increase the complexity of extraction, introduce more interference, and result in additional (very large) token consumption and reduced processing efficiency.
Converting HTML to markdown that is easy to understand semantically is a common practice in the field, and Crawl4ai provides a relatively mature solution for this.
However, this refers to general cases. There is no one-size-fits-all solution. For certain specific sources, Crawl4ai's default parser may not work well, such as WeChat public account articles. In these cases, we need to customize scrapers for the sources.
Simply put, the scraper's role is to convert HTML code to markdown text, filtering out unnecessary information during this process (since the next step is refinement through LLM, requirements here are not high), while preserving HTML layout information as much as possible (this is important).
You don't need to complete the final information extraction through the scraper. This work will ultimately be done using LLM - in fact, before that we have a step called pre-process, whose main function is to reasonably segment the article markdown and properly transform URLs and images. In fact, this module is a major innovation point of this project - the scraper only needs to provide raw_markdown suitable for pre-process and a list of valuable images.
## Custom Scraper
The fetch_result input to the scraper is either a dict or a Crawl4ai CrawlResult object containing the following fields:
- url: str, the webpage URL
- html: str, the webpage HTML code
- cleaned_html: str, cleaned HTML code
- markdown: str, cleaned markdown code
- media: dict, contains media information like images, videos, audio etc.
- metadata: dict, contains webpage metadata like title, author, publish time etc.
The scraper output is ScraperResultData, see details in `core/scrapers/scraper_data.py`.
## Register Custom Scraper
After writing the scraper, register it in `core/scrapers/__init__.py`, for example:
```python
from .mp import mp_scarper
from .mp import mp_scraper
customer_scrapers = {'mp.weixin.qq.com': mp_scarper}
custom_scrapers = {'mp.weixin.qq.com': mp_scraper}
```
Note that the key should use the domain name, which can be obtained using `urllib.parse`:
Note that the key uses the domain name, which can be obtained using `urllib.parse`:
```python
from urllib.parse import urlparse

View File

@ -1,7 +1,8 @@
from crawl4ai import CrawlerRunConfig
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from .mp_scraper import mp_scraper
custom_scrapers = {}
custom_scrapers = {'mp.weixin.qq.com': mp_scraper}
custom_fetching_configs = {}
md_generator = DefaultMarkdownGenerator(

View File

@ -1,225 +0,0 @@
# -*- coding: utf-8 -*-
# This program requires HTML to be first converted to properly formatted text while preserving link positions and structural information (like crawl4ai's html2text work);
# The complete media list from the webpage needs to be extracted beforehand
# Currently this script only handles images and links, other elements like downloads and videos are not processed yet, todo: process according to media list
# action_dict needs to be extracted from raw html, which is not covered by this script
import re
from urllib.parse import urljoin
common_file_exts = [
'jpg', 'jpeg', 'png', 'gif', 'pdf', 'doc', 'docx', 'svg', 'm3u8',
'mp4', 'mp3', 'wav', 'avi', 'mov', 'wmv', 'flv', 'webp', 'webm',
'zip', 'rar', '7z', 'tar', 'gz', 'bz2',
'txt', 'csv', 'xls', 'xlsx', 'ppt', 'pptx',
'json', 'xml', 'yaml', 'yml', 'css', 'js', 'php', 'asp', 'jsp'
]
common_tlds = [
'.com', '.cn', '.net', '.org', '.edu', '.gov', '.io', '.co',
'.info', '.biz', '.me', '.tv', '.cc', '.xyz', '.app', '.dev',
'.cloud', '.ai', '.tech', '.online', '.store', '.shop', '.site',
'.top', '.vip', '.pro', '.ltd', '.group', '.team', '.work'
]
common_chars = ',.!;:,;:、一二三四五六七八九十#*@% \t\n\r|*-_…>#'
def normalize_url(url: str, base_url: str) -> str:
    """Turn a possibly malformed or relative URL into a cleaned-up absolute one.

    Handles the sloppy forms commonly found in scraped markdown:
    bare ``www.`` hosts, protocol-relative ``//host`` URLs, and schemes
    written with a single slash (``http:/host``). Anything else is resolved
    against ``base_url`` with ``urljoin``. Finally, every ``//`` after the
    scheme separator is collapsed to a single ``/``.

    Args:
        url: The raw URL text as found in the page.
        base_url: The URL that relative references are resolved against.

    Returns:
        The normalized URL. If the result contains no ``//`` at all
        (e.g. ``mailto:`` links, or a relative path with an empty base),
        it is returned unchanged instead of having ``//`` appended.
    """
    url = url.strip()
    if url.startswith(('www.', 'WWW.')):
        _url = f"https://{url}"
    elif url.startswith('/www.'):
        _url = f"https:/{url}"
    elif url.startswith("//"):
        _url = f"https:{url}"
    elif url.startswith(('http://', 'https://')):
        _url = url
    elif url.startswith('http:/'):
        # scheme written with a single slash: drop "http:/" and rebuild it
        _url = f"http://{url[6:]}"
    elif url.startswith('https:/'):
        _url = f"https://{url[7:]}"
    else:
        _url = urljoin(base_url, url)

    parts = _url.split('//')
    if len(parts) < 2:
        # No scheme separator present; there is nothing to collapse and
        # appending '//' would corrupt the value.
        return _url
    # Keep the first '//' (scheme separator), collapse any later ones to '/'.
    return parts[0] + '//' + '/'.join(parts[1:])
def deep_scraper(raw_markdown: str, base_url: str, used_img: list[str]) -> tuple[dict, list[str], list[str]]:
    """Split page markdown into link-heavy parts and content parts.

    Every markdown link, image and bare URL is replaced in the text by a
    short key (``[N]`` for links, ``[imgN]`` for images) and the key -> URL
    mapping is collected in ``link_dict``. The markdown is split on ``'# '``
    and each section is classified by the ratio of remaining text length to
    the number of links it contains.

    Args:
        raw_markdown: Markdown text of the page.
        base_url: URL used to resolve relative links and image sources.
        used_img: Image sources considered valid; stand-alone images whose
            ``src`` is not in this list are reduced to their alt text.

    Returns:
        A tuple ``(link_dict, links_parts, contents)``: the key -> URL map,
        the sections classified as link lists, and the sections classified
        as content.
    """
    link_dict = {}
    to_be_recognized_by_visual_llm = {}

    def _register(url: str, is_img: bool = False) -> str:
        # Store url under the next sequential key and return that key.
        key = f"[img{len(link_dict)+1}]" if is_img else f"[{len(link_dict)+1}]"
        link_dict[key] = url
        return key

    def _visual_marker(img_src: str) -> str:
        # Return the §N§ placeholder for an image left to the visual LLM.
        if img_src not in to_be_recognized_by_visual_llm:
            to_be_recognized_by_visual_llm[img_src] = f"§{len(to_be_recognized_by_visual_llm)+1}§"
        return to_be_recognized_by_visual_llm[img_src]

    def _keeps_alt(img_src: str) -> bool:
        # True when the image source ends like a site root or a non-photo file.
        if any(img_src.endswith(tld) or img_src.endswith(tld + '/') for tld in common_tlds):
            return True
        return any(img_src.endswith(ext) for ext in common_file_exts if ext not in ['jpg', 'jpeg', 'png'])

    # for special url format from crawl4ai 0.4.247
    raw_markdown = re.sub(r'<javascript:.*?>', '<javascript:>', raw_markdown).strip()

    # Rewrite image markup ![alt](src) into the internal form §alt||src§ so
    # that it is not mistaken for a link by the link pattern below.
    i_pattern = r'(!\[(.*?)\]\((.*?)\))'
    for _sec, alt, src in re.findall(i_pattern, raw_markdown, re.DOTALL):
        raw_markdown = raw_markdown.replace(_sec, f'§{alt}||{src}§', 1)

    def check_url_text(text) -> tuple[float, str]:
        """Replace links/images in one section and return (ratio, new_text)."""
        score = 0
        _valid_len = len(text.strip())

        link_pattern = r'(\[(.*?)\]\((.*?)\))'
        quote_pattern = r'\"(.*?)\"'
        img_marker_pattern = r'§(.*?)\|\|(.*?)§'
        real_url_pattern = r'<(.*?)>'

        # Find every [part0](part1) fragment.
        for _sec, link_text, link_url in re.findall(link_pattern, text, re.DOTALL):
            # The quoted part of the link target is its title.
            _title = ''.join(re.findall(quote_pattern, link_url, re.DOTALL))

            # Separate image markers inside the link text from the rest.
            inner_matches = re.findall(img_marker_pattern, link_text, re.DOTALL)
            for alt, src in inner_matches:
                link_text = link_text.replace(f'§{alt}||{src}§', '')
            link_text = link_text.strip()
            if _title not in link_text:
                link_text = f"{_title} - {link_text}"

            link_text = link_text.strip()
            if not link_text and inner_matches:
                # The link consists only of an image: describe it by the image.
                img_alt = inner_matches[0][0].strip()
                img_src = inner_matches[0][1].strip()
                if img_src and not img_src.startswith('#'):
                    img_src = normalize_url(img_src, base_url)
                    if not img_src:
                        link_text = img_alt
                    elif len(img_alt) > 2 or _keeps_alt(img_src):
                        link_text = img_alt + _register(img_src, is_img=True)
                    else:
                        marker = _visual_marker(img_src)
                        link_text = marker + _register(img_src, is_img=True)
                else:
                    link_text = img_alt

            real_url = re.search(real_url_pattern, link_url, re.DOTALL)
            if real_url:
                _url = real_url.group(1).strip()
            else:
                # flags must be passed by keyword: the 4th positional
                # argument of re.sub is `count`, not `flags`.
                _url = re.sub(quote_pattern, '', link_url, flags=re.DOTALL).strip()

            if not _url or _url.startswith(('#', 'javascript:')):
                text = text.replace(_sec, link_text, 1)
                continue
            score += 1
            _valid_len = _valid_len - len(_sec)
            url = normalize_url(_url, base_url)
            text = text.replace(_sec, link_text + _register(url), 1)
            # TODO: filtering of common file types / bare top-level domains is
            # deferred to the more-links extraction or the main process.

        # Handle the remaining (stand-alone) image markers in the text.
        img_pattern = r'(§(.*?)\|\|(.*?)§)'
        matches = re.findall(img_pattern, text, re.DOTALL)
        remained_text = re.sub(img_pattern, '', text, flags=re.DOTALL).strip()
        remained_text_len = len(remained_text)
        for _sec, alt, src in matches:
            if not src or src.startswith('#') or src not in used_img:
                text = text.replace(_sec, alt, 1)
                continue
            img_src = normalize_url(src, base_url)
            if not img_src:
                text = text.replace(_sec, alt, 1)
            elif remained_text_len > 5 or len(alt) > 2 or _keeps_alt(img_src):
                text = text.replace(_sec, alt + _register(img_src, is_img=True), 1)
            else:
                marker = _visual_marker(img_src)
                text = text.replace(_sec, marker + _register(img_src, is_img=True), 1)

        # Handle bare ("wild") URLs left in the text.
        url_pattern = r'((?:https?://|www\.)[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[-A-Za-z0-9+&@#/%=~_|])'
        for raw_url in re.findall(url_pattern, text):
            url = normalize_url(raw_url, base_url)
            # Replace the text as it appears on the page; the normalized form
            # may differ (e.g. a 'www.' URL gains a scheme) and would not match.
            text = text.replace(raw_url, _register(url), 1)
            score += 1
            _valid_len = _valid_len - len(raw_url)

        # Each ' * ' list marker also counts towards the link score.
        newline_count = text.count(' * ')
        score += newline_count
        ratio = _valid_len/score if score != 0 else 999

        return ratio, text

    md_link = r'\[.*?]\(.*?\)'
    sections = raw_markdown.split('# ')  # use '# ' to avoid # in url
    if len(sections) > 2:
        # Leading section: drop it when it is dominated by links (navigation).
        _sec = sections[0]
        section_remain = re.sub(md_link, '', _sec, flags=re.DOTALL).strip()
        section_remain_len = len(section_remain)
        total_links = len(re.findall(md_link, _sec, re.DOTALL))
        ratio = total_links / section_remain_len if section_remain_len != 0 else 1
        if ratio > 0.05:
            print('this is a navigation section, will be removed')
            print(ratio)
            print(section_remain)
            print('-' * 50)
            sections = sections[1:]

        # Trailing section: drop it when little text remains without links (footer).
        _sec = sections[-1]
        section_remain = re.sub(md_link, '', _sec, flags=re.DOTALL).strip()
        section_remain_len = len(section_remain)
        if section_remain_len < 198:
            print('this is a footer section, will be removed')
            print(section_remain_len)
            print(section_remain)
            print('-' * 50)
            sections = sections[:-1]

    links_parts = []
    contents = []
    for section in sections:
        ratio, text = check_url_text(section)
        if ratio < 70:
            print('this is a links part')
            print(ratio)
            print(text)
            print('-' * 50)
            links_parts.append(text)
        else:
            print('this is a content part')
            print(ratio)
            print(text)
            print('-' * 50)
            contents.append(text)
    return link_dict, links_parts, contents

View File

@ -1,10 +1,34 @@
from urllib.parse import urlparse
from urllib.parse import urlparse, urljoin
import os
import re
# import jieba
from loguru import logger
def normalize_url(url: str, base_url: str) -> str:
    """Turn a possibly malformed or relative URL into a cleaned-up absolute one.

    Handles the sloppy forms commonly found in scraped markdown:
    bare ``www.`` hosts, protocol-relative ``//host`` URLs, and schemes
    written with a single slash (``http:/host``). Anything else is resolved
    against ``base_url`` with ``urljoin``. Finally, every ``//`` after the
    scheme separator is collapsed to a single ``/``.

    Args:
        url: The raw URL text as found in the page.
        base_url: The URL that relative references are resolved against.

    Returns:
        The normalized URL. If the result contains no ``//`` at all
        (e.g. ``mailto:`` links, or a relative path with an empty base),
        it is returned unchanged instead of having ``//`` appended.
    """
    url = url.strip()
    if url.startswith(('www.', 'WWW.')):
        _url = f"https://{url}"
    elif url.startswith('/www.'):
        _url = f"https:/{url}"
    elif url.startswith("//"):
        _url = f"https:{url}"
    elif url.startswith(('http://', 'https://')):
        _url = url
    elif url.startswith('http:/'):
        # scheme written with a single slash: drop "http:/" and rebuild it
        _url = f"http://{url[6:]}"
    elif url.startswith('https:/'):
        _url = f"https://{url[7:]}"
    else:
        _url = urljoin(base_url, url)

    parts = _url.split('//')
    if len(parts) < 2:
        # No scheme separator present; there is nothing to collapse and
        # appending '//' would corrupt the value.
        return _url
    # Keep the first '//' (scheme separator), collapse any later ones to '/'.
    return parts[0] + '//' + '/'.join(parts[1:])
def isURL(string):
if string.startswith("www."):
string = f"https://{string}"

View File

@ -1,8 +1,9 @@
export LLM_API_KEY=""
export LLM_API_BASE="https://api.siliconflow.cn/v1"
export PRIMARY_MODEL="Qwen/Qwen2.5-32B-Instruct"
#If your source pages are relatively simple with small amounts of information per page, considering cost and time (mainly time), Qwen2.5-32B-Instruct is recommended
#If your source pages contain more links, have complex layouts, and you don't want to miss any information, DeepSeek-V2.5 is recommended
export SECONDARY_MODEL="Qwen/Qwen2.5-7B-Instruct"
#use a secondary model to execute the filtering task to save cost
#if not set, the primary model will be used to execute the filtering task
export VL_MODEL="OpenGVLab/InternVL2-26B"
export PB_API_AUTH="test@example.com|1234567890" ##your pb superuser account and password

View File

@ -1,92 +1,90 @@
# -*- coding: utf-8 -*-
import os, re, sys
import os, sys
import json
import asyncio
import time
from prompts import *
from datetime import datetime
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir) # get parent dir
sys.path.append(project_root)
# 将core目录添加到Python路径
core_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'core')
sys.path.append(core_path)
from core.llms.openai_wrapper import openai_llm as llm
# 现在可以直接导入模块因为core目录已经在Python路径中
from scrapers import *
from agents.get_info import pre_process
from utils.general_utils import is_chinese
from agents.get_info import get_author_and_publish_date, get_info, get_more_related_urls
from agents.get_info_prompts import *
benchmark_model = 'Qwen/Qwen2.5-72B-Instruct'
models = ['Qwen/Qwen2.5-7B-Instruct', 'Qwen/Qwen2.5-14B-Instruct', 'Qwen/Qwen2.5-32B-Instruct', 'deepseek-ai/DeepSeek-V2.5', 'internlm/internlm2_5-20b-chat']
async def main(texts: list[str], link_dict: dict, record_file: str, sys_prompt: str, focus_points: list):
# first get more links
print(f'sys_prompt: \n{sys_prompt}')
benchmark_result = None
models = ['Qwen/Qwen2.5-7B-Instruct', 'Qwen/Qwen2.5-14B-Instruct', 'Qwen/Qwen2.5-32B-Instruct', 'deepseek-ai/DeepSeek-V2.5']
async def main(sample: dict, include_ap: bool, prompts: list, focus_dict: dict, record_file: str):
link_dict, links_parts, contents = sample['link_dict'], sample['links_part'], sample['contents']
get_link_sys_prompt, get_link_suffix_prompt, get_info_sys_prompt, get_info_suffix_prompt = prompts
for model in [benchmark_model] + models:
_texts = []
for text in texts:
_texts.extend(text.split('\n\n'))
links_texts = []
for _parts in links_parts:
links_texts.extend(_parts.split('\n\n'))
contents = sample['contents'].copy()
print(f"running {model} ...")
start_time = time.time()
hallucination_times = 0
text_batch = ''
cache = set()
while _texts:
t = _texts.pop(0)
text_batch = f'{text_batch}{t}\n\n'
if len(text_batch) > 512 or len(_texts) == 0:
content = f'<text>\n{text_batch}</text>\n\n{get_info_suffix}'
result = await llm(
[{'role': 'system', 'content': sys_prompt}, {'role': 'user', 'content': content}],
model=model, temperature=0.1)
print(f"llm output\n{result}\n")
result = re.findall(r'\"\"\"(.*?)\"\"\"', result, re.DOTALL)
if result:
# 在result[-1]中找到所有类似[4]这样的片段
links = re.findall(r'\[\d+\]', result[-1])
for link in links:
if link not in text_batch:
hallucination_times += 1
print(f'\n**not in text_batch: {link}**\n')
continue
cache.add(link)
text_batch = ''
t1 = time.time()
get_infos_time = t1 - start_time
print(f"get more infos time: {get_infos_time}")
print("*" * 12)
print('\n\n')
for link in cache:
if link not in link_dict:
print(f'\n**not in link_dict: {link}**\n')
if model == benchmark_model:
benchmark_result = cache.copy()
diff = 'benchmark'
if include_ap:
author, publish_date = await get_author_and_publish_date(contents[0], model, test_mode=True)
get_ap_time = time.time() - start_time
print(f"get author and publish date time: {get_ap_time}")
else:
# 计算当前cache与benchmark的差异
missing_in_cache = len(benchmark_result - cache) # benchmark中有但cache中没有的
extra_in_cache = len(cache - benchmark_result) # cache中有但benchmark中没有的
author, publish_date = '', ''
get_ap_time = 0
start_time = time.time()
more_url = await get_more_related_urls(links_texts, link_dict, [get_link_sys_prompt, get_link_suffix_prompt, model], test_mode=True)
get_more_url_time = time.time() - start_time
print(f"get more related urls time: {get_more_url_time}")
start_time = time.time()
infos = await get_info(contents, link_dict, [get_info_sys_prompt, get_info_suffix_prompt, model], focus_dict, author, publish_date, test_mode=True)
get_info_time = time.time() - start_time
print(f"get info time: {get_info_time}")
if model == benchmark_model:
benchmark_result = more_url.copy()
diff = f'benchmark: {len(benchmark_result)} results'
else:
missing_in_cache = len(benchmark_result - more_url) # benchmark中有但cache中没有的
extra_in_cache = len(more_url - benchmark_result) # cache中有但benchmark中没有的
total_diff = missing_in_cache + extra_in_cache
diff = f'差异{total_diff}个(遗漏{missing_in_cache}个,多出{extra_in_cache}个)'
infos_to_record = '\n'.join(list(set(link_dict[link] for link in cache)))
related_urls_to_record = '\n'.join(more_url)
infos_to_record = [f"{fi['tag']}: {fi['content']}" for fi in infos]
infos_to_record = '\n'.join(infos_to_record)
with open(record_file, 'a') as f:
f.write(f"llm model: {model}\n")
f.write(f"process time: {get_infos_time} s\n")
f.write(f"bad generate times: {hallucination_times}\n")
f.write(f"model: {model}\n")
if include_ap:
f.write(f"get author and publish date time: {get_ap_time}\n")
f.write(f"author: {author}\n")
f.write(f"publish date: {publish_date}\n")
f.write(f"get more related urls time: {get_more_url_time}\n")
f.write(f"diff from benchmark: {diff}\n")
f.write(f"segments: \n{infos_to_record}\n")
f.write("*" * 12)
f.write(f"get info time: {get_info_time}\n")
f.write(f"related urls: \n{related_urls_to_record}\n")
f.write(f"final result: \n{infos_to_record}\n")
f.write('\n\n')
print('\n\n')
if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--sample_dir', '-D', type=str, default='')
parser.add_argument('--include_ap', '-I', type=bool, default=False)
args = parser.parse_args()
sample_dir = args.sample_dir
include_ap = args.include_ap
if not os.path.exists(os.path.join(sample_dir, 'focus_point.json')):
raise ValueError(f'{sample_dir} focus_point.json not found')
@ -97,27 +95,43 @@ if __name__ == '__main__':
expl = item["explanation"]
focus_statement = f"{focus_statement}//{tag}//\n"
if expl:
focus_statement = f"{focus_statement}解释:{expl}\n"
if is_chinese(expl):
focus_statement = f"{focus_statement}解释:{expl}\n"
else:
focus_statement = f"{focus_statement}Explanation: {expl}\n"
focus_dict = {item["focuspoint"]: item["focuspoint"] for item in focus_points}
date_stamp = datetime.now().strftime('%Y-%m-%d')
if is_chinese(focus_statement):
get_link_sys_prompt = get_link_system.replace('{focus_statement}', focus_statement)
get_link_sys_prompt = f"今天的日期是{date_stamp}{get_link_sys_prompt}"
get_link_suffix_prompt = get_link_suffix
get_info_sys_prompt = get_info_system.replace('{focus_statement}', focus_statement)
get_info_sys_prompt = f"今天的日期是{date_stamp}{get_info_sys_prompt}"
get_info_suffix_prompt = get_info_suffix
else:
get_link_sys_prompt = get_link_system_en.replace('{focus_statement}', focus_statement)
get_link_sys_prompt = f"today is {date_stamp}, {get_link_sys_prompt}"
get_link_suffix_prompt = get_link_suffix_en
get_info_sys_prompt = get_info_system_en.replace('{focus_statement}', focus_statement)
get_info_sys_prompt = f"today is {date_stamp}, {get_info_sys_prompt}"
get_info_suffix_prompt = get_info_suffix_en
get_info_system = get_info_system.replace('{focus_statement}', focus_statement)
system_prompt = f"今天的日期是{datetime.now().strftime('%Y-%m-%d')}{get_info_system}"
focus_points = [item["focuspoint"] for item in focus_points]
prompts = [get_link_sys_prompt, get_link_suffix_prompt, get_info_sys_prompt, get_info_suffix_prompt]
time_stamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())
record_file = os.path.join(sample_dir, f'record-{time_stamp}.txt')
with open(record_file, 'w') as f:
f.write(f"focus statement: \n{focus_statement}\n\n")
for dirs in os.listdir(sample_dir):
if not os.path.isdir(os.path.join(sample_dir, dirs)):
for file in os.listdir(sample_dir):
if not file.endswith('_processed.json'):
continue
_path = os.path.join(sample_dir, dirs)
print(f'start testing {_path}')
if 'sample.json' not in os.listdir(_path):
print(f'{dirs} sample.json not found, skip')
sample = json.load(open(os.path.join(sample_dir, file), 'r'))
if 'links_part' not in sample or 'link_dict' not in sample or 'contents' not in sample:
print(f'{file} not valid sample, skip')
continue
sample = json.load(open(os.path.join(_path, 'sample.json'), 'r'))
with open(record_file, 'a') as f:
f.write(f"raw materials in: {dirs}\n\n")
asyncio.run(main(sample['links_part'], sample['link_dict'], record_file, system_prompt, focus_points))
f.write(f"raw materials: {file}\n\n")
print(f'start testing {file}')
asyncio.run(main(sample, include_ap, prompts, focus_dict, record_file))

View File

@ -2,14 +2,16 @@ import os
import sys
import re
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir) # 获取父目录
sys.path.append(project_root)
# 将core目录添加到Python路径
core_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'core')
sys.path.append(core_path)
from core.scrapers.deep_scraper import deep_scraper, common_chars
from core.scrapers.mp_scraper import mp_scraper
# Modules can now be imported directly, because the core directory is already on the Python path
from scrapers import *
from agents.get_info import pre_process
def check_url_text(text):
common_chars = ',.!;:,;:、一二三四五六七八九十#*@% \t\n\r|*-_…>#'
print(f"processing: {text}")
left_bracket = text.find('[')
right_paren = text.rfind(')')
@ -56,25 +58,75 @@ def check_url_text(text):
for match in matches:
print(match)
async def main(html_sample, record_file):
    """Run ``pre_process`` on one fetched-page sample and save the result as JSON.

    Args:
        html_sample: dict describing a fetched page. ``url`` is required.
            ``markdown`` is required when no entry of ``custom_scrapers``
            matches the URL's domain. ``media`` and ``base`` are optional.
        record_file: path of the JSON file to write; it is overwritten.

    Returns:
        None. When the sample yields no markdown a message is printed and
        nothing is written.

    Raises:
        KeyError: if ``url`` is missing, or ``markdown`` is missing for a
            domain without a custom scraper.
    """
    # Imported locally so the function does not rely on imports that are only
    # executed inside the script's ``if __name__ == '__main__'`` section.
    import json
    from urllib.parse import urlparse

    recognized_img_cache = {}
    parsed_url = urlparse(html_sample['url'])
    domain = parsed_url.netloc

    if domain in custom_scrapers:
        # A domain-specific scraper supplies markdown, images and base URL.
        result = custom_scrapers[domain](html_sample)
        raw_markdown = result.content
        used_img = result.images
        base_url = result.base
    else:
        raw_markdown = html_sample['markdown']
        # 'media' may be absent, None or empty; all mean "no images".
        media_dict = html_sample.get('media') or {}
        used_img = [d['src'] for d in media_dict.get('images', [])]
        base_url = ''

    if not raw_markdown:
        # Report the sample's own URL instead of depending on a loop variable
        # that only exists when this file is run as a script.
        print(f"no raw_markdown for {html_sample['url']}")
        return

    # Base URL fallback chain: scraper result -> sample's 'base' -> page URL.
    if not base_url:
        base_url = html_sample.get('base', '')
    if not base_url:
        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
    if not base_url.endswith('/'):
        # Drop the last path segment so the base always ends with '/'.
        base_url = base_url.rsplit('/', 1)[0] + '/'

    link_dict, links_parts, contents, recognized_img_cache = await pre_process(
        raw_markdown, base_url, used_img, recognized_img_cache, test_mode=True
    )

    result = {
        "link_dict": link_dict,
        "links_part": links_parts,
        "contents": contents,
    }
    with open(record_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)
    print(f"pre process done, saved to {record_file}")
if __name__ == '__main__':
import argparse
import time
import json
from urllib.parse import urlparse
import asyncio
parser = argparse.ArgumentParser()
parser.add_argument('--test_file', '-F', type=str, default='')
parser.add_argument('--sample_dir', '-D', type=str, default='')
parser.add_argument('--test_string', '-T', type=str, default='')
parser.add_argument('--record_folder', '-R', type=str, default='')
args = parser.parse_args()
if args.test_string:
check_url_text(args.test_string)
exit()
test_file = args.test_file
sample_dir = args.sample_dir
record_folder = args.record_folder
if record_folder:
os.makedirs(record_folder, exist_ok=True)
files = []
if test_file:
files.append(test_file)
@ -84,43 +136,9 @@ if __name__ == '__main__':
for file in files:
if not file.endswith('.json'): continue
print(f"processing {file} ...")
try:
with open(file, 'r') as f:
html_sample = json.load(f)
_url = html_sample['url']
if _url.startswith('https://mp.weixin.qq.com'):
result = mp_scraper(html_sample)
raw_markdown = result.content
used_img = result.images
else:
raw_markdown = html_sample['markdown']
used_img = [d['src'] for d in html_sample['media']['images']]
except Exception as e:
print('sample format error, try to use craw4ai_fething.py to get sample')
print(f"error: {e}")
continue
parsed_url = urlparse(_url)
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
if not base_url.endswith('/'):
# 如果路径不以 / 结尾,则去掉最后一个路径段
base_url = base_url.rsplit('/', 1)[0] + '/'
time_start = time.time()
link_dict, links_part, contents = deep_scraper(raw_markdown, base_url, used_img)
time_end = time.time()
#print(f"time cost for html: {time_end - time_start}s")
result = {
"link_dict": link_dict,
"links_part": links_part,
"contents": contents,
}
record_folder = file.replace('.json', '')
os.makedirs(record_folder, exist_ok=True)
with open(os.path.join(record_folder, 'sample.json'), 'w', encoding='utf-8') as f:
json.dump(result, f, indent=4, ensure_ascii=False)
#print("done")
#print("*" * 12)
with open(file, 'r') as f:
html_sample = json.load(f)
record_file = os.path.join(record_folder, f'{os.path.basename(file)}_processed.json')
asyncio.run(main(html_sample, record_file))

View File

@ -1,94 +0,0 @@
# System prompt (Chinese) for fragment extraction: the model is given web page
# text wrapped in <text></text> and must extract the original-text fragments
# related to any of the focus points, keeping citation markers such as "[3]".
# '{focus_statement}' is a literal placeholder (this is not an f-string); the
# test script in this change set fills it with str.replace().
get_info_system = '''你将被给到一段使用<text></text>标签包裹的网页文本,你的任务是从前到后仔细阅读文本,提取出与如下任一关注点相关的原文片段。关注点及其解释如下:
{focus_statement}\n
在进行提取时请遵循以下原则
- 理解关注点的含义以及进一步的解释如有确保提取的内容与关注点强相关并符合解释如有的范围
- 在满足上面原则的前提下提取出全部可能相关的片段
- 提取出的原文片段务必保留类似"[3]"这样的引用标记后续的处理需要用到这些引用标记'''
# Output-format suffix for the prompt above: fragments are listed one per line
# inside a triple-quoted block, which stays empty when nothing is relevant.
get_info_suffix = '''请逐条输出提取的原文片段,并整体用三引号包裹。三引号内除了提取出的原文片段外不要有其他内容,如果文本中不包含任何与关注点相关的内容则保持三引号内为空。
如下是输出格式示例
"""
原文片段1
原文片段2
...
"""'''
# System prompt (Chinese) for per-focus-point summarisation of web page text
# wrapped in <text></text>. Contains the literal '{focus_statement}'
# placeholder -- presumably substituted by the caller; verify at the call site.
text_info_system = '''你将被给到一段使用<text></text>标签包裹的网页文本,请分别按如下关注点对网页文本提炼摘要。关注点列表及其解释如下:
{focus_statement}\n
在提炼摘要时请遵循以下原则
- 理解每个关注点的含义以及进一步的解释如有确保摘要与关注点强相关并符合解释如有的范围
- 摘要应当详实充分
- 摘要信息务必忠于原文'''
# Output-format suffix: one "//focus point//" header followed by its summary
# (or "NA" when the text is unrelated), all inside a triple-quoted block.
text_info_suffix = '''请对关注点逐一生成摘要,不要遗漏任何关注点,如果网页文本与关注点无关,可以对应输出"NA"。输出结果整体用三引号包裹,三引号内不要有其他内容。如下是输出格式示例:
"""
//关注点1//
摘要1
//关注点2//
摘要2
//关注点3//
NA
...
"""'''
# System prompt (Chinese) for link classification: the input is several lines
# of the form "<id>//content//" and each line is to be associated with one of
# the focus points. Contains the literal '{focus_statement}' placeholder --
# presumably substituted by the caller; verify at the call site.
text_link_system = '''你将被给到数行格式为"<编号>//内容//"的文本,你的任务是逐条分析这些文本,并分别与如下关注点之一相关联。关注点列表及其解释如下:
{focus_statement}\n
在进行关联分析时请遵循以下原则
- 理解每个关注点的含义
- 如果关注点有进一步的解释确保提取的内容符合这些解释的范围'''
# Output-format suffix: one "<id>//focus point name//" line per input line, or
# "<id>//NA//" when no focus point applies, all inside a triple-quoted block.
text_link_suffix = '''请分行逐条输出结果,每一条的输出格式为"<编号>//关注点名称//",如果某条内容不与任何关注点相关,请输出"<编号>//NA//"。输出结果整体用三引号包裹,三引号内不要有其他内容。如下是输出格式示例:
"""
<t1>//关注点1名称//
<t2>//关注点2名称//
<t3>//NA//
...
"""'''
# System prompt (English) for extracting the source/author and publication
# date from web page text; missing values are to be reported as NA.
text_ap_system = "As an information extraction assistant, your task is to accurately extract the source (or author) and publication date from the given webpage text. It is important to adhere to extracting the information directly from the original text. If the original text does not contain a particular piece of information, please replace it with NA"
# Output-format suffix: a single triple-quoted "source//date" line.
text_ap_suffix = '''Please output the extracted information in the following format(output only the result, no other content):
"""source or article author (use "NA" if this information cannot be extracted)//extracted publication date (keep only the year, month, and day; use "NA" if this information cannot be extracted)"""'''
# System prompt (Chinese) for verification: decide whether the information in
# <info></info> matches the web page text in <text></text>, by first locating
# every corresponding text fragment and then giving a final conclusion.
# NOTE(review): the last sentence ends at "最终结论仅为" with no object, so the
# literal looks truncated -- confirm against the intended wording.
verified_system = '''判断给定的信息是否与网页文本相符。信息将用标签<info></info>包裹,网页文本则用<text></text>包裹。请遵循如下工作流程:
1尝试找出网页文本中所有与信息对应的文本片段可能有多处
2基于这些片段给出是否相符的最终结论最终结论仅为'''
# Output-format suffix: list the fragments found, then the final yes/no answer.
verified_suffix = '先输出找到的所有文本片段,再输出最终结论(仅为是或否)'
# System prompt (Chinese) for extracting content related to the user's interest
# points from a web page screenshot; it asks for extraction faithful to the
# original rather than summarisation. Contains the literal '{focus_statement}'
# placeholder -- presumably substituted by the caller; verify at the call site.
image_info_system = '''作为信息提取助手,你的任务是从给定的网页截屏中提取与以下用户兴趣点相关的内容。兴趣点列表及其解释如下:
{focus_statement}\n
在进行信息提取时请遵循以下原则
- 理解每个兴趣点的含义确保提取的内容与之相关
- 如果兴趣点有进一步的解释确保提取的内容符合这些解释的范围
- 忠于原文你的任务是从网页截屏中识别和提取与各个兴趣点相关的信息并不是总结和提炼'''
# Output-format suffix: a JSON list of {"focus": ..., "content": ...} objects,
# or just [] when nothing is relevant or the page is judged to be an article
# list page.
image_info_suffix = '''如果网页截屏中包含兴趣点相关的内容请按照以下json格式输出提取的信息文本中可能包含多条有用信息请不要遗漏
[{"focus": 兴趣点名称, "content": 提取的内容}]
示例
[{"focus": "旅游景点", "content": "北京故宫地址北京市东城区景山前街4号开放时间8:30-17:00"}, {"focus": "美食推荐", "content": "来王府井小吃街必吃北京烤鸭、炸酱面"}]
如果截屏中不包含任何与兴趣点相关的信息或者你判断这是一个文章列表页面请仅输出[]'''
# System prompt (Chinese) for picking, from a web page screenshot, the links
# most worth following for the given interest points. Ends with the literal
# '{focus_statement}' placeholder -- presumably substituted by the caller;
# verify at the call site.
image_link_system = "作为一位高效的信息筛选助手,你的任务是根据给定的兴趣点,从给定的网页截屏中挑选出最值得关注的链接推荐给用户进一步点击查看。兴趣点及其解释如下:\n\n{focus_statement}"
# Output-format suffix: the link texts, one per line, inside a triple-quoted
# block.
image_link_suffix = '''只要输出值得关注的链接对应的文本文字即可。按一行一条的格式输出,最终输出的列表整体用三引号包裹,三引号内不要有其他内容,如下是输出格式示例:
"""
链接文字1
链接文字2
...
"""'''
# System prompt (English) for extracting the source/author and publication
# date from a web page screenshot; missing values are to be reported as NA.
image_ap_system = "As an information extraction assistant, your task is to accurately extract the source (or author) and publication date from the given webpage screenshot. If the screenshot does not contain a particular piece of information, please replace it with NA"
# Output-format suffix: a JSON object with "source" and "publish_date" keys.
image_ap_suffix = '''Please output the extracted information in the following JSON format:
{"source": source or article author (use "NA" if this information cannot be found), "publish_date": publication date (keep only the year, month, and day; use "NA" if this information cannot be found)}'''
# Prompt (Chinese) asking for all text in an image; the model is to answer NA
# when the image has no or very little text, or is only a logo, trademark or
# icon, and must output nothing but the extracted text.
image_system = "提取图片中的所有文字如果图片不包含文字或者文字很少或者你判断图片仅是网站logo、商标、图标等则输出NA。注意请仅输出提取出的文字不要输出别的任何内容。"
# English version of the prompt above.
image_system_en = "Extract all text from the image. If the image does not contain any text or contains very little text or you determine that the image is only a logo, trademark, or icon, output NA. Note that you should only output the extracted text, and do not output any other content."