mirror of https://github.com/TeamWiseFlow/wiseflow.git (synced 2025-01-23 02:20:20 +08:00)
v0.3.6
commit ae7b5d7f65 (parent 8a6d1ed7da)
@@ -9,12 +9,59 @@ from urllib.parse import urlparse
|
||||
import json_repair
|
||||
|
||||
|
||||
list_judge_threshold = 0.007
|
||||
valid_list_min_length = 10
|
||||
min_content_length = 420
|
||||
|
||||
common_file_exts = [
|
||||
'jpg', 'jpeg', 'png', 'gif', 'pdf', 'doc', 'docx', 'svg', 'm3u8',
|
||||
'mp4', 'mp3', 'wav', 'avi', 'mov', 'wmv', 'flv', 'webp', 'webm',
|
||||
'zip', 'rar', '7z', 'tar', 'gz', 'bz2',
|
||||
'txt', 'csv', 'xls', 'xlsx', 'ppt', 'pptx',
|
||||
'json', 'xml', 'yaml', 'yml', 'css', 'js', 'php', 'asp', 'jsp'
|
||||
]
|
||||
common_tlds = [
|
||||
'.com', '.cn', '.net', '.org', '.edu', '.gov', '.io', '.co',
|
||||
'.info', '.biz', '.me', '.tv', '.cc', '.xyz', '.app', '.dev',
|
||||
'.cloud', '.ai', '.tech', '.online', '.store', '.shop', '.site',
|
||||
'.top', '.vip', '.pro', '.ltd', '.group', '.team', '.work'
|
||||
]
|
||||
|
||||
def find_article_or_list(link_dict: dict, text: str) -> (bool, bool, dict, str):
|
||||
lines = [l.strip() for l in text.split('\n') if l.strip()]
|
||||
text = '\n'.join(lines)
|
||||
for key, value in link_dict.items():
|
||||
link_dict[key] = value.lower()
|
||||
|
||||
text_no_tags = re.sub(r'<\w{1,5}>', '', text)
|
||||
text_no_urls = re.sub(r'\[url\d+]', '', text_no_tags)
|
||||
content_length = len(text_no_urls)
|
||||
|
||||
valid_url = set()
|
||||
for url in link_dict.values():
|
||||
has_common_ext = any(url.endswith(ext) for ext in common_file_exts)
|
||||
has_common_tld = any(url.endswith(tld) or url.endswith(tld + '/') for tld in common_tlds)
|
||||
if not has_common_ext and not has_common_tld:
|
||||
valid_url.add(url)
|
||||
|
||||
valid_url_rate = len(valid_url) / content_length
|
||||
is_list = valid_url_rate > list_judge_threshold and len(valid_url) > valid_list_min_length
|
||||
need_more_info = content_length < min_content_length
|
||||
return is_list, need_more_info, link_dict, text
|
||||
|
||||
|
||||
class GeneralInfoExtractor:
|
||||
def __init__(self, pb: PbTalker, _logger: logger) -> None:
|
||||
self.pb = pb
|
||||
self.logger = _logger
|
||||
self.model = os.environ.get("PRIMARY_MODEL", "Qwen/Qwen2.5-7B-Instruct") # better to use "Qwen/Qwen2.5-14B-Instruct"
|
||||
self.secondary_model = os.environ.get("SECONDARY_MODEL", 'Qwen/Qwen2.5-7B-Instruct') # better to use ''
|
||||
self.model = os.environ.get("PRIMARY_MODEL", "")
|
||||
self.secondary_model = os.environ.get("SECONDARY_MODEL", "")
|
||||
|
||||
if not self.model or not self.secondary_model:
|
||||
self.logger.error("PRIMARY_MODEL or SECONDARY_MODEL not set, can't continue")
|
||||
raise ValueError("PRIMARY_MODEL or SECONDARY_MODEL not set, please set it in environment variables or edit core/.env")
|
||||
|
||||
self.vl_model = os.environ.get("VL_MODEL", "")
|
||||
|
||||
# collect tags the user set in the pb database and determine the system prompt language based on them
|
||||
focus_data = pb.read(collection_name='focus_points', filter=f'activated=True')
|
||||
@@ -37,59 +84,103 @@ class GeneralInfoExtractor:
|
||||
focus_statement = f"{focus_statement}解释:{expl}\n"
|
||||
|
||||
if is_chinese(focus_statement):
|
||||
self.get_info_prompt = f'''作为信息提取助手,你的任务是从给定的网页文本中提取与以下用户兴趣点相关的内容。兴趣点列表及其解释如下:
|
||||
self.get_info_prompt = f'''作为信息提取助手,你的任务是从给定的网页文本中抽取任何与下列关注点之一相关的信息。关注点列表及其解释如下:
|
||||
|
||||
{focus_statement}\n
|
||||
在进行信息提取时,请遵循以下原则:
|
||||
|
||||
- 理解每个兴趣点的含义,确保提取的内容与之相关。
|
||||
- 如果兴趣点有进一步的解释,确保提取的内容符合这些解释的范围。
|
||||
- 忠于原文,你的任务是从网页文本中识别和提取与各个兴趣点相关的信息,并不是总结和提炼。
|
||||
- 理解每个关注点的含义,确保提取的内容至少与其中之一相关
|
||||
- 如果关注点有进一步的解释,确保提取的内容符合这些解释的范围
|
||||
- 忠于原文,你的任务是从网页文本中抽取相关信息,而不是提炼、总结和改写
|
||||
- 对于最终输出的信息,请保证主体、时间、地点等关键要素的清晰明确,为此可能需要综合上下文进行提取
|
||||
- 如果提取的内容中包括类似“<mp4>”、“[url1]”这样的片段,务必原样保留'''
|
||||
|
||||
另外请注意给定的网页文本是通过爬虫程序从html代码中提取出来的,所以请忽略里面不必要的空格、换行符等。'''
|
||||
self.get_info_suffix = '''如果上述网页文本中包含兴趣点相关的内容,请按照以下json格式输出提取的信息(文本中可能包含多条有用信息,请不要遗漏):
|
||||
[{"focus": 兴趣点名称, "content": 提取的内容}]
|
||||
self.get_info_suffix = '''请先复述一遍关注点及其解释,再对原文进行分析。如果网页文本中包含关注点相关的内容,请按照以下json格式输出提取的信息:
|
||||
{"focus": 关注点名称, "content": 提取的内容}
|
||||
|
||||
示例:
|
||||
[{"focus": "旅游景点", "content": "北京故宫,地址:北京市东城区景山前街4号,开放时间:8:30-17:00"}, {"focus": "美食推荐", "content": "来王府井小吃街必吃北京烤鸭、炸酱面"}]
|
||||
|
||||
如果网页文本中不包含任何与兴趣点相关的信息,请仅输出:[]。'''
|
||||
self.get_more_link_prompt = f"作为一位高效的信息筛选助手,你的任务是根据给定的兴趣点,从给定的文本及其对应的URL中挑选出最值得关注的URL。兴趣点及其解释如下:\n\n{focus_statement}"
|
||||
self.get_more_link_suffix = '''请逐条分析,先逐一给出分析依据,最终将挑选出的 url 按一行一条的格式输出,最终输出的 url 列表整体用三引号包裹,三引号内不要有其他内容,如下是输出格式示例:
|
||||
如果有多条相关信息,请按一行一条的格式输出,最终输出的结果整体用三引号包裹,三引号内不要有其他内容,如下是输出格式示例:
|
||||
"""
|
||||
url1
|
||||
url2
|
||||
{"focus": 关注点1名称, "content": 提取内容1}
|
||||
{"focus": 关注点2名称, "content": 提取内容2}
|
||||
...
|
||||
"""'''
|
||||
else:
|
||||
self.get_info_prompt = f'''As an information extraction assistant, your task is to extract content related to the following user focus points from the given web page text. The list of focus points and their explanations is as follows:
|
||||
"""
|
||||
|
||||
如果网页文本中不包含任何相关的信息,请保证三引号内为空。'''
|
||||
|
||||
self.get_more_link_prompt = f'''你将被给到一段处理过的网页文本,在这些文本中所有的url链接都已经被替换为类似"[url120]"这样的标签,并置于与其关联的文本后面。
|
||||
你的任务是从网页文本中抽取任何与下列关注点之一相关的文本片段。关注点列表及其解释如下:
|
||||
|
||||
{focus_statement}\n
|
||||
When extracting information, please follow the principles below:
|
||||
在进行抽取时,请遵循以下原则:
|
||||
|
||||
- Understand the meaning of each focus point and ensure that the extracted content is relevant to it.
|
||||
- If a focus point has further explanations, ensure that the extracted content conforms to the scope of these explanations.
|
||||
- Stay true to the original text; your task is to identify and extract information related to each focus point from the web page text, not to summarize or refine it.
|
||||
- 理解每个关注点的含义,确保提取的内容至少与其中之一相关
|
||||
- 如果关注点有进一步的解释,确保提取的内容符合这些解释的范围
|
||||
- 只抽取以标签(类似"[url120]"这样)结尾的文本片段
|
||||
- 维持抽取出的文本片段的原样,尤其不要遗漏其后的标签'''
|
||||
|
||||
Please note that the given web page text is extracted from HTML code via a crawler, so please ignore any unnecessary spaces, line breaks, etc.'''
|
||||
self.get_info_suffix = '''If the above webpage text contains content related to points of interest, please output the extracted information in the following JSON format (the text may contain multiple useful pieces of information, do not miss any):
|
||||
[{"focus": "Point of Interest Name", "content": "Extracted Content"}]
|
||||
|
||||
Example:
|
||||
[{"focus": "Tourist Attraction", "content": "The Forbidden City, Beijing, Address: No. 4 Jingshan Front Street, Dongcheng District, Opening Hours: 8:30-17:00"}, {"focus": "Food Recommendation", "content": "Must-try at Wangfujing Snack Street: Beijing Roast Duck, Noodles with Soybean Paste"}]
|
||||
|
||||
If the webpage text does not contain any information related to points of interest, please output only: []'''
|
||||
self.get_more_link_prompt = f"As an efficient information filtering assistant, your task is to select the most noteworthy URLs from a set of texts and their corresponding URLs based on the given focus points. The focus points and their explanations are as follows:\n\n{focus_statement}"
|
||||
self.get_more_link_suffix = '''Please analyze one by one, first give the analysis basis one by one, and finally output the selected URLs in a row-by-row format. The final output URL list is wrapped in three quotes as a whole, and there should be no other content in the three quotes. Here is an example of the output format:
|
||||
self.get_more_link_suffix = '''请先复述一遍关注点及其解释,再对原文逐行进行抽取,最终将挑选出的文本片段按一行一条的格式输出,并整体用三引号包裹,三引号内不要有其他内容,如下是输出格式示例:
|
||||
"""
|
||||
url1
|
||||
url2
|
||||
文本1
|
||||
文本2
|
||||
...
|
||||
"""'''
|
||||
|
||||
self.info_judge_prompt = '''判断给定的信息是否与网页文本相符。信息将用标签<info></info>包裹,网页文本则用<text></text>包裹。请遵循如下工作流程:
|
||||
1、尝试找出网页文本中所有与信息相关的片段(有多少找多少,没有的话则跳过);
|
||||
2、判断信息是否与这些片段在关键要素上一致,请特别注意主语、日期、地点以及数字这些。'''
|
||||
|
||||
self.info_judge_suffix = '先输出找到的所有文本片段,再输出最终结论(仅为“是”或“否”)'
|
||||
else:
|
||||
self.get_info_prompt = f'''As an information extraction assistant, your task is to extract any information from the given webpage text that relates to at least one of the following focus points. The list of focus points and their explanations are as follows:
|
||||
|
||||
{focus_statement}\n
|
||||
When extracting information, please follow these principles:
|
||||
|
||||
- Understand the meaning of each focus point and ensure the extracted content relates to at least one of them
|
||||
- If a focus point has further explanations, ensure the extracted content aligns with those explanations
|
||||
- Stay faithful to the original text - your task is to extract relevant information, not to refine, summarize or rewrite
|
||||
- For the final output, ensure key elements like subject, time, location etc. are clearly specified, which may require synthesizing context
|
||||
- If the extracted content includes fragments like "<mp4>" or "[url1]", make sure to preserve them exactly as they appear'''
|
||||
|
||||
self.get_info_suffix = '''First, please restate the focus points and their explanations, then analyze the original text. If the webpage text contains content related to the focus points, please output the extracted information in the following JSON format:
|
||||
{"focus": focus point name, "content": extracted content}
|
||||
|
||||
If there are multiple relevant pieces of information, output them one per line, with the entire output wrapped in triple quotes. There should be no other content within the triple quotes. Here is an example of the output format:
|
||||
"""
|
||||
{"focus": focus point 1 name, "content": extracted content 1}
|
||||
{"focus": focus point 2 name, "content": extracted content 2}
|
||||
...
|
||||
"""
|
||||
|
||||
If the webpage text does not contain any relevant information, ensure the content within the triple quotes is empty.'''
|
||||
|
||||
self.get_more_link_prompt = f'''You will be given a processed webpage text where all URL links have been replaced with tags like "[url120]" and placed after their associated text.
|
||||
Your task is to extract any text fragments from the webpage text that relate to any of the following focus points. The list of focus points and their explanations are as follows:
|
||||
|
||||
{focus_statement}\n
|
||||
When extracting, please follow these principles:
|
||||
|
||||
- Understand the meaning of each focus point and ensure the extracted content relates to at least one of them
|
||||
- If a focus point has further explanations, ensure the extracted content aligns with those explanations
|
||||
- Only extract text fragments that end with tags (like "[url120]")
|
||||
- Maintain the text fragments exactly as they appear, especially don't omit their trailing tags'''
|
||||
|
||||
self.get_more_link_suffix = '''First, please restate the focus points and their explanations, then analyze the original text line by line. Finally, output the selected text fragments one per line, with the entire output wrapped in triple quotes. There should be no other content within the triple quotes. Here is an example of the output format:
|
||||
"""
|
||||
text1
|
||||
text2
|
||||
...
|
||||
"""'''
|
||||
|
||||
self.info_judge_prompt = '''Determine whether the given information matches the webpage text. The information will be wrapped in <info></info> tags, and the webpage text will be wrapped in <text></text> tags. Please follow this workflow:
|
||||
1. Try to find all text fragments in the webpage text that are related to the information (find as many as possible, skip if none);
|
||||
2. Determine whether the information is consistent with these fragments in key elements, paying special attention to subjects, dates, locations, and numbers.'''
|
||||
|
||||
self.info_judge_suffix = 'First, output all found text fragments, then output the final conclusion (only "Y" or "N").'
|
||||
|
||||
async def get_author_and_publish_date(self, text: str) -> tuple[str, str]:
|
||||
if not text:
|
||||
return "NA", "NA"
|
||||
return "", ""
|
||||
|
||||
if len(text) > 1024:
|
||||
text = f'{text[:500]}......{text[-500:]}'
|
||||
@@ -106,7 +197,7 @@ url2
|
||||
if not llm_output:
|
||||
return '', ''
|
||||
result = json_repair.repair_json(llm_output, return_objects=True)
|
||||
self.logger.debug(f"decoded_object: {result}")
|
||||
|
||||
if not isinstance(result, dict):
|
||||
self.logger.warning("failed to parse from llm output")
|
||||
return '', ''
|
||||
@@ -116,111 +207,188 @@ url2
|
||||
|
||||
return result['source'], extract_and_convert_dates(result['publish_date'])
|
||||
|
||||
async def get_more_related_urls(self, link_dict: dict, og_url: str) -> set[str]:
|
||||
if not link_dict:
|
||||
async def _generate_results(self, text: str, mode: str) -> set:
|
||||
if mode == 'get_info':
|
||||
system_prompt = self.get_info_prompt
|
||||
suffix = self.get_info_suffix
|
||||
batch_size = 2048
|
||||
elif mode == 'get_link':
|
||||
system_prompt = self.get_more_link_prompt
|
||||
suffix = self.get_more_link_suffix
|
||||
batch_size = 1024
|
||||
else:
|
||||
self.logger.error(f"unknown mode: {mode}")
|
||||
return set()
|
||||
self.logger.debug(f'{len(link_dict)} items to analyze')
|
||||
urls = set()
|
||||
content = ''
|
||||
for key, value in link_dict.items():
|
||||
content = f"{content}{key}: {value}\n"
|
||||
if len(content) > 512:
|
||||
result = await llm([{'role': 'system', 'content': self.get_more_link_prompt},
|
||||
{'role': 'user', 'content': f'{content}\n{self.get_more_link_suffix}'}],
|
||||
model=self.model, temperature=0.1)
|
||||
self.logger.debug(f'get_more_related_urls llm output:\n{result}')
|
||||
|
||||
lines = text.split('\n')
|
||||
cache = set()
|
||||
text_batch = ''
|
||||
for line in lines:
|
||||
text_batch = f'{text_batch}\n{line}'
|
||||
if len(text_batch) > batch_size:
|
||||
content = f'<text>\n{text_batch}\n</text>\n\n{suffix}'
|
||||
result = await llm(
|
||||
[{'role': 'system', 'content': system_prompt}, {'role': 'user', 'content': content}],
|
||||
model=self.model, temperature=0.1)
|
||||
self.logger.debug(f"llm output: {result}")
|
||||
result = re.findall(r'\"\"\"(.*?)\"\"\"', result, re.DOTALL)
|
||||
if result:
|
||||
result = result[0].strip()
|
||||
# self.logger.debug(f"cleaned output: {result}")
|
||||
urls.update(extract_urls(result))
|
||||
content = ''
|
||||
if not result:
|
||||
self.logger.warning("bad generate result")
|
||||
text_batch = ''
|
||||
continue
|
||||
for item in result:
|
||||
item = item.strip()
|
||||
if not item:
|
||||
continue
|
||||
item = item.split('\n')
|
||||
cache.update(item)
|
||||
text_batch = ''
|
||||
|
||||
if content:
|
||||
result = await llm([{'role': 'system', 'content': self.get_more_link_prompt},
|
||||
{'role': 'user', 'content': f'{content}\n{self.get_more_link_suffix}'}],
|
||||
model=self.model, temperature=0.1)
|
||||
self.logger.debug(f'get_more_related_urls llm output:\n{result}')
|
||||
if text_batch:
|
||||
content = f'<text>\n{text_batch}\n</text>\n\n{suffix}'
|
||||
result = await llm(
|
||||
[{'role': 'system', 'content': system_prompt}, {'role': 'user', 'content': content}],
|
||||
model=self.model, temperature=0.1)
|
||||
self.logger.debug(f"llm output: {result}")
|
||||
result = re.findall(r'\"\"\"(.*?)\"\"\"', result, re.DOTALL)
|
||||
if result:
|
||||
result = result[0].strip()
|
||||
# self.logger.debug(f"cleaned output: {result}")
|
||||
urls.update(extract_urls(result))
|
||||
|
||||
raw_urls = set(link_dict.values())
|
||||
urls.discard(og_url)
|
||||
hallucination_urls = urls - raw_urls
|
||||
if hallucination_urls:
|
||||
self.logger.warning(f"{hallucination_urls} not in link_dict, it's model's Hallucination")
|
||||
|
||||
return urls & raw_urls
|
||||
|
||||
async def get_info(self, text: str, info_pre_fix: str, link_dict: dict) -> list[dict]:
|
||||
if not text:
|
||||
return []
|
||||
|
||||
content = f'<text>\n{text}\n</text>\n\n{self.get_info_suffix}'
|
||||
result = await llm([{'role': 'system', 'content': self.get_info_prompt}, {'role': 'user', 'content': content}],
|
||||
model=self.model, temperature=0.1, response_format={"type": "json_object"})
|
||||
self.logger.debug(f'get_info llm output:\n{result}')
|
||||
if not result:
|
||||
return []
|
||||
|
||||
result = json_repair.repair_json(result, return_objects=True)
|
||||
if not isinstance(result, list):
|
||||
self.logger.warning("failed to parse from llm output")
|
||||
return []
|
||||
if not result:
|
||||
self.logger.debug("no info found")
|
||||
return []
|
||||
|
||||
system = '''判断给定的信息是否与网页文本相符。信息将用标签<info></info>包裹,网页文本则用<text></text>包裹。请遵循如下工作流程:
|
||||
1、尝试找出网页文本中所有与信息对应的文本片段(可能有多处);
|
||||
2、基于这些片段给出是否相符的最终结论,最终结论仅为“是”或“否”'''
|
||||
suffix = '先输出找到的所有文本片段,再输出最终结论(仅为是或否)'
|
||||
|
||||
final = []
|
||||
for item in result:
|
||||
if 'focus' not in item or 'content' not in item:
|
||||
self.logger.warning(f"not quality item: {item}, it's model's Hallucination")
|
||||
if not result:
|
||||
self.logger.warning("bad generate result")
|
||||
return cache
|
||||
for item in result:
|
||||
item = item.strip()
|
||||
if not item:
|
||||
continue
|
||||
item = item.split('\n')
|
||||
cache.update(item)
|
||||
return cache
|
||||
|
||||
async def _extract_info_from_img(self, text, link_dict) -> str:
|
||||
cache = {}
|
||||
pattern = r'<img>\[url\d+\]'
|
||||
matches = re.findall(pattern, text)
|
||||
for match in matches:
|
||||
key = match.split('[url')[1][:-1]
|
||||
url = link_dict.get(f'url{key}', '')
|
||||
if not url:
|
||||
continue
|
||||
if item['focus'] not in self.focus_dict:
|
||||
self.logger.warning(f"{item['focus']} not in focus_list, it's model's Hallucination")
|
||||
|
||||
if url in cache:
|
||||
replace_text = cache[url]
|
||||
else:
|
||||
if any(url.endswith(tld) or url.endswith(tld + '/') for tld in common_tlds):
|
||||
continue
|
||||
if any(url.endswith(ext) for ext in common_file_exts if ext not in ['jpg', 'jpeg', 'png']):
|
||||
continue
|
||||
llm_output = await llm([{"role": "user",
|
||||
"content": [{"type": "image_url", "image_url": {"url": url, "detail": "high"}},
|
||||
{"type": "text", "text": "提取图片中的所有文字,如果图片不包含文字或者文字很少或者你判断图片仅是网站logo、商标、图标等,则输出NA。注意请仅输出提取出的文字,不要输出别的任何内容。"}]}],
|
||||
model=self.vl_model)
|
||||
self.logger.debug(f"vl model output: \n{llm_output}\n")
|
||||
replace_text = llm_output
|
||||
cache[url] = replace_text
|
||||
text = text.replace(match, f'{replace_text}{match}', 1)
|
||||
return text
|
||||
|
||||
async def get_more_related_urls(self, link_dict: dict, text: str) -> list[str]:
|
||||
raw_result = await self._generate_results(text, 'get_link')
|
||||
final_result = set()
|
||||
for item in raw_result:
|
||||
if '[url' not in item:
|
||||
self.logger.warning(f"bad generate result: {item}")
|
||||
continue
|
||||
url_tags = re.findall(r'\[url\d+]', item)
|
||||
if not url_tags:
|
||||
self.logger.warning(f"bad generate result: {item}")
|
||||
continue
|
||||
for url_tag in url_tags:
|
||||
url_tag = url_tag[1:-1]
|
||||
if url_tag not in link_dict:
|
||||
self.logger.warning(f"bad generate result: {item}")
|
||||
continue
|
||||
result_url = link_dict[url_tag]
|
||||
if any(result_url.endswith(tld) or result_url.endswith(tld + '/') for tld in common_tlds):
|
||||
continue
|
||||
if any(result_url.endswith(ext) for ext in common_file_exts if ext not in ['jpg', 'jpeg', 'png']):
|
||||
continue
|
||||
final_result.add(result_url)
|
||||
return list(final_result)
|
||||
|
||||
async def get_info(self, link_dict: dict, text: str, info_pre_fix: str) -> list[dict]:
|
||||
raw_result = await self._generate_results(text, 'get_info')
|
||||
final = []
|
||||
for item in raw_result:
|
||||
result = json_repair.repair_json(item, return_objects=True)
|
||||
if not isinstance(result, dict):
|
||||
self.logger.warning(f"bad generate result: {item}")
|
||||
continue
|
||||
if not result:
|
||||
continue
|
||||
if 'focus' not in result or 'content' not in result:
|
||||
self.logger.warning(f"bad generate result: {item}")
|
||||
continue
|
||||
if not item['focus'] or item['focus'] not in self.focus_dict:
|
||||
self.logger.warning(f"bad generate result: {item}")
|
||||
continue
|
||||
if not item['content']:
|
||||
self.logger.warning(f"bad generate result: {item}")
|
||||
continue
|
||||
|
||||
if item['content'] in link_dict:
|
||||
self.logger.debug(f"{item['content']} in link_dict, aborting")
|
||||
continue
|
||||
|
||||
judge = await llm([{'role': 'system', 'content': system},
|
||||
{'role': 'user', 'content': f'<info>\n{item["content"]}\n</info>\n\n<text>\n{text}\n</text>\n\n{suffix}'}],
|
||||
judge = await llm([{'role': 'system', 'content': self.info_judge_prompt},
|
||||
{'role': 'user', 'content': f'<info>\n{item["content"]}\n</info>\n\n<text>\n{text}\n</text>\n\n{self.info_judge_suffix}'}],
|
||||
model=self.secondary_model, temperature=0.1)
|
||||
self.logger.debug(f'judge llm output:\n{judge}')
|
||||
if not judge:
|
||||
self.logger.warning("failed to parse from llm output, skip checking")
|
||||
final.append({'tag': self.focus_dict[item['focus']], 'content': f"{info_pre_fix}{item['content']}"})
|
||||
self.logger.info(f"<info>\n{item['content']}\n</info>\n\n<text>\n{text}\n</text>")
|
||||
self.logger.info(judge)
|
||||
content = item['content']
|
||||
url_tags = re.findall(r'\[url\d+]', content)
|
||||
for url_tag in url_tags:
|
||||
url_tag = url_tag[1:-1]
|
||||
_url = link_dict.get(url_tag, '')
|
||||
if _url:
|
||||
content = content.replace(url_tag, _url)
|
||||
final.append({'tag': self.focus_dict[item['focus']], 'content': f"{info_pre_fix}{content}"})
|
||||
continue
|
||||
|
||||
to_save = False
|
||||
for i in range(min(7, len(judge))):
|
||||
char = judge[-1 - i]
|
||||
if char == '是':
|
||||
if char == '是' or char == 'Y':
|
||||
to_save = True
|
||||
break
|
||||
elif char == '否':
|
||||
elif char == '否' or char == 'N':
|
||||
break
|
||||
if not to_save:
|
||||
self.logger.info(f"secondary model judge {item} not faithful to article text, aborting")
|
||||
self.logger.warning("secondary model judge not faithful to article text, aborting")
|
||||
self.logger.info(f"<info>\n{item['content']}\n</info>\n\n<text>\n{text}\n</text>")
|
||||
self.logger.info(judge)
|
||||
continue
|
||||
final.append({'tag': self.focus_dict[item['focus']], 'content': f"{info_pre_fix}{item['content']}"})
|
||||
|
||||
if not final:
|
||||
self.logger.info("no quality result from llm output")
|
||||
content = item['content']
|
||||
url_tags = re.findall(r'\[url\d+]', content)
|
||||
for url_tag in url_tags:
|
||||
url_tag = url_tag[1:-1]
|
||||
_url = link_dict.get(url_tag, '')
|
||||
if _url:
|
||||
content = content.replace(url_tag, _url)
|
||||
final.append({'tag': self.focus_dict[item['focus']], 'content': f"{info_pre_fix}{content}"})
|
||||
|
||||
return final
|
||||
|
||||
async def __call__(self, text: str, link_dict: dict, base_url: str, author: str = None, publish_date: str = None) -> tuple[list, set, str, str]:
|
||||
async def __call__(self, link_dict: dict, text: str, base_url: str, author: str = None, publish_date: str = None) -> tuple[bool, list]:
|
||||
is_list, need_more_info, link_dict, text = find_article_or_list(link_dict, text)
|
||||
if is_list:
|
||||
self.logger.info("may be a article list page, get more urls ...")
|
||||
return True, await self.get_more_related_urls(link_dict, text)
|
||||
|
||||
if need_more_info:
|
||||
self.logger.info("may be a article page need to get more text from images...")
|
||||
text = await self._extract_info_from_img(text, link_dict)
|
||||
self.logger.debug(f"extended text: \n{text}\n")
|
||||
|
||||
if not author and not publish_date and text:
|
||||
author, publish_date = await self.get_author_and_publish_date(text)
|
||||
|
||||
@@ -230,20 +398,6 @@ url2
|
||||
if not publish_date or publish_date.lower() == 'na':
|
||||
publish_date = datetime.now().strftime('%Y-%m-%d')
|
||||
|
||||
related_urls = await self.get_more_related_urls(link_dict, base_url)
|
||||
|
||||
info_prefix = f"//{author} {publish_date}//"
|
||||
lines = text.split('\n')
|
||||
text = ''
|
||||
infos = []
|
||||
for line in lines:
|
||||
text = f'{text}{line}'
|
||||
if len(text) > 2048:
|
||||
cache = await self.get_info(text, info_prefix, link_dict)
|
||||
infos.extend(cache)
|
||||
text = ''
|
||||
if text:
|
||||
cache = await self.get_info(text, info_prefix, link_dict)
|
||||
infos.extend(cache)
|
||||
|
||||
return infos, related_urls, author, publish_date
|
||||
return False, await self.get_info(link_dict, text, info_prefix)
|
||||
|
@@ -1,5 +0,0 @@
|
||||
# future plan
|
||||
# inspired by https://github.com/OSU-NLP-Group/SeeAct
|
||||
# use a visual-llm to extract the main content and determine next action
|
||||
|
||||
# input a playwright page object
|
@@ -1,83 +0,0 @@
|
||||
# wiseflow 自定义解析器说明
|
||||
|
||||
## 概述
|
||||
wiseflow 致力于通过一套通用流程(使用大模型驱动的可以自主使用爬虫工具的智能体)处理所有页面。
|
||||
|
||||
目前在页面获取方面我们使用流行的爬虫框架 Crawlee(playwright)进行统一管理,经过实测 Crawlee 在速度和兼容性方面都非常不错,且有着完善的任务队列管理模块,因此网页获取方面一般无需自定义。
|
||||
|
||||
对于页面信息的解析,wiseflow 默认使用大模型,但用户可以为特定域名配置自定义解析器。
|
||||
|
||||
## 自定义解析器配置说明
|
||||
|
||||
### 1. Scraper 函数定义
|
||||
Scraper 应该是一个函数(而不是类)。
|
||||
|
||||
### 2. 函数参数
|
||||
该函数接收两个入参(wiseflow 框架传入):
|
||||
- `html`:这是 wiseflow 通过 Crawlee 的 playwright_crawler 获取到的渲染后的页面 html 代码,类型为 `str`,scraper 可以直接使用 `bs` `parsel`等库进行解析;
|
||||
- `url`:当前页面的 url 地址,类型为 `str`(仅是为了特殊操作,用不到的话可以直接忽略)。
|
||||
|
||||
### 3. 函数返回值
|
||||
Scraper 出参限定为三个:
|
||||
|
||||
#### 3.1 `article`
|
||||
解析出的页面详情,类型为 `dict`,格式如下:
|
||||
|
||||
```python
|
||||
{
|
||||
'author': ...,
|
||||
'publish_date': ...,
|
||||
'content': ...
|
||||
}
|
||||
```
|
||||
|
||||
- 上述值的类型都要求为 `str`,日期格式为 `YYYY-MM-DD`。
|
||||
|
||||
**注意:**
|
||||
1. `'content'` 要有且不为空,不然无法触发后续的提取;
|
||||
2. `'author'` 和 `'publish_date'` 尽量提供,不然 wiseflow 会自动用站点域名和当日日期代替。
|
||||
|
||||
#### 3.2 `links`
|
||||
对应页面解析出的链接,类型可以是 `set`,也可以是 `dict`:
|
||||
|
||||
- 如果是 `set`,则会全部被加入任务队列。
|
||||
- 如果是 `dict`,则会调用 llm 从中挑取值得加入任务队列的 url(根据你的 focus point),`dict` 的格式如下:
|
||||
|
||||
```python
|
||||
{
|
||||
'text': 外链对应的文字信息,
|
||||
'url': 外链对应的 url
|
||||
}
|
||||
```
|
||||
|
||||
wiseflow 会以这个为输入,使用 llm 判断值得继续爬取的链接。
|
||||
|
||||
#### 3.3 `infos`
|
||||
对应页面抽取出的值得关注的信息列表,类型是 `list`,元素为 `dict`,格式为:
|
||||
|
||||
```python
|
||||
{
|
||||
'tag': focuspoint 的 id,
|
||||
'content': 具体 info 内容
|
||||
}
|
||||
```
|
||||
|
||||
**注意,focuspoint 的 id 要和 pb 中 focus_points 表一致**
|
||||
|
||||
### 4. 注册自定义解析器
|
||||
在 `core/custom_scraper/__init__.py` 中注册,参考:
|
||||
|
||||
```python
|
||||
from .mp import mp_scraper
|
||||
|
||||
custom_scraper_map = {'mp.weixin.qq.com': mp_scraper}
|
||||
```
|
||||
|
||||
注意键使用域名,可以使用 `urllib.parse` 获取:
|
||||
|
||||
```python
|
||||
from urllib.parse import urlparse
|
||||
|
||||
parsed_url = urlparse("site's url")
|
||||
domain = parsed_url.netloc
|
||||
```
|
@@ -1,83 +0,0 @@
|
||||
# wiseflow Custom Parser Instructions
|
||||
|
||||
## Overview
|
||||
wiseflow is committed to processing all pages through a universal process (an intelligent agent driven by large models that can autonomously use web scraping tools).
|
||||
|
||||
Currently, we use the popular web scraping framework Crawlee (playwright) for unified management in page acquisition. After practical testing, Crawlee performs well in terms of speed and compatibility, and has a robust task queue management module, so customizations are generally unnecessary for web page acquisition.
|
||||
|
||||
For page information parsing, wiseflow uses large models by default, but users can configure custom parsers for specific domains.
|
||||
|
||||
## Custom Parser Configuration Instructions
|
||||
|
||||
### 1. Scraper Function Definition
|
||||
The Scraper should be a function (not a class).
|
||||
|
||||
### 2. Function Parameters
|
||||
The function receives two input parameters (passed by the wiseflow framework):
|
||||
- `html`: This is the rendered page HTML code obtained by wiseflow through Crawlee's playwright_crawler, of type `str`. The scraper can directly use libraries like `bs` and `parsel` for parsing;
|
||||
- `url`: The URL address of the current page, of type `str` (only for special operations, can be ignored if not needed).
|
||||
|
||||
### 3. Function Return Values
|
||||
The Scraper output is limited to three:
|
||||
|
||||
#### 3.1 `article`
|
||||
The parsed page details, of type `dict`, with the following format:
|
||||
|
||||
```python
|
||||
{
|
||||
'author': ...,
|
||||
'publish_date': ...,
|
||||
'content': ...
|
||||
}
|
||||
```
|
||||
|
||||
- The types of the above values are all required to be `str`, and the date format must be `YYYY-MM-DD`.
|
||||
|
||||
**Note:**
|
||||
1. `'content'` must be present and not empty, otherwise subsequent extraction cannot be triggered;
|
||||
2. `'author'` and `'publish_date'` should be included if possible; otherwise wiseflow will automatically substitute the site's domain and the current date.
|
||||
|
||||
#### 3.2 `links`
|
||||
The links parsed from the corresponding page, the type can be `set` or `dict`:
|
||||
|
||||
- If it is a `set`, all will be added to the task queue.
|
||||
- If it is a `dict`, llm will be called to select URLs worth adding to the task queue (based on your focus point), with the format of the `dict` as follows:
|
||||
|
||||
```python
|
||||
{
|
||||
'text': text information corresponding to the external link,
|
||||
'url': url corresponding to the external link
|
||||
}
|
||||
```
|
||||
|
||||
wiseflow will use this as input to determine the links worth continuing to crawl using llm.
|
||||
|
||||
#### 3.3 `infos`
|
||||
The list of noteworthy information extracted from the corresponding page, of type `list`, with elements being `dict`, in the following format:
|
||||
|
||||
```python
|
||||
{
|
||||
'tag': id of the focuspoint,
|
||||
'content': specific info content
|
||||
}
|
||||
```
|
||||
|
||||
**Note that the id of the focuspoint must match the focus_points table in pb**
|
||||
|
||||
### 4. Register Custom Parser
|
||||
Register in `core/custom_scraper/__init__.py`, for reference:
|
||||
|
||||
```python
|
||||
from .mp import mp_scraper
|
||||
|
||||
custom_scraper_map = {'mp.weixin.qq.com': mp_scraper}
|
||||
```
|
||||
|
||||
Note that the key uses the domain name, which can be obtained using `urllib.parse`:
|
||||
|
||||
```python
|
||||
from urllib.parse import urlparse
|
||||
|
||||
parsed_url = urlparse("site's url")
|
||||
domain = parsed_url.netloc
|
||||
```
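
For concreteness, here is a minimal sketch of a scraper that satisfies the interface described above. Only the `(article, links, infos)` return contract, the async calling convention, and the registration pattern come from this document; the domain, the CSS selectors, and the `example_scraper` name are hypothetical.

```python
from datetime import datetime
from urllib.parse import urljoin

from bs4 import BeautifulSoup


async def example_scraper(html: str, url: str) -> tuple[dict, set, list]:
    # the built-in mp scraper is async and awaited by the main process, so we follow that
    soup = BeautifulSoup(html, 'html.parser')

    # article: 'author', 'publish_date' and 'content' are all str, date as YYYY-MM-DD
    author_tag = soup.find('span', class_='author')   # hypothetical selector
    body = soup.find('div', class_='article-body')    # hypothetical selector
    article = {
        'author': author_tag.get_text(strip=True) if author_tag else '',
        'publish_date': datetime.now().strftime('%Y-%m-%d'),  # fallback when no date is found
        'content': body.get_text('\n', strip=True) if body else '',
    }

    # links: a set of absolute urls is added to the task queue as-is
    links = {urljoin(url, a['href']) for a in soup.find_all('a', href=True)}

    # infos: leave empty here and let the LLM extraction produce them
    infos = []
    return article, links, infos
```

It would then be registered as `custom_scraper_map = {'news.example.com': example_scraper}` in `core/custom_scraper/__init__.py` (the domain is again hypothetical).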
|
@@ -1,3 +0,0 @@
|
||||
from .mp import mp_scraper
|
||||
|
||||
custom_scraper_map = {'mp.weixin.qq.com': mp_scraper}
|
@@ -1,107 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from datetime import datetime
|
||||
import os, re
|
||||
import logging
|
||||
|
||||
|
||||
project_dir = os.environ.get("PROJECT_DIR", "")
|
||||
if project_dir:
|
||||
os.makedirs(project_dir, exist_ok=True)
|
||||
|
||||
log_formatter = logging.Formatter(fmt='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
|
||||
# create logger and set level to debug
|
||||
logger = logging.getLogger('mp_scraper')
|
||||
logger.handlers = []
|
||||
logger.setLevel('DEBUG')
|
||||
logger.propagate = False
|
||||
|
||||
# create file handler and set level to debug
|
||||
file = os.path.join(project_dir, 'mp_scraper.log')
|
||||
file_handler = logging.FileHandler(file, 'a', encoding='utf-8')
|
||||
file_handler.setLevel('INFO')
|
||||
file_handler.setFormatter(log_formatter)
|
||||
logger.addHandler(file_handler)
|
||||
|
||||
# create console handler and set level to info
|
||||
console_handler = logging.StreamHandler()
|
||||
console_handler.setLevel('DEBUG')
|
||||
console_handler.setFormatter(log_formatter)
|
||||
logger.addHandler(console_handler)
|
||||
|
||||
async def mp_scraper(html: str, url: str) -> tuple[dict, set, list]:
|
||||
if not url.startswith('https://mp.weixin.qq.com') and not url.startswith('http://mp.weixin.qq.com'):
|
||||
logger.warning(f'{url} is not a mp url, you should not use this function')
|
||||
return {}, set(), []
|
||||
|
||||
url = url.replace("http://", "https://", 1)
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
|
||||
if url.startswith('https://mp.weixin.qq.com/mp/appmsgalbum'):
|
||||
# 文章目录
|
||||
urls = {li.attrs['data-link'].replace("http://", "https://", 1) for li in soup.find_all('li', class_='album__list-item')}
|
||||
simple_urls = set()
|
||||
for url in urls:
|
||||
cut_off_point = url.find('chksm=')
|
||||
if cut_off_point != -1:
|
||||
url = url[:cut_off_point - 1]
|
||||
simple_urls.add(url)
|
||||
return {}, simple_urls, []
|
||||
|
||||
# Get the original release date first
|
||||
pattern = r"var createTime = '(\d{4}-\d{2}-\d{2}) \d{2}:\d{2}'"
|
||||
match = re.search(pattern, html)
|
||||
if match:
|
||||
publish_time = match.group(1)
|
||||
else:
|
||||
publish_time = datetime.strftime(datetime.today(), "%Y-%m-%d")
|
||||
|
||||
# Get description content from < meta > tag
|
||||
try:
|
||||
meta_description = soup.find('meta', attrs={'name': 'description'})
|
||||
summary = meta_description['content'].strip() if meta_description else ''
|
||||
# card_info = soup.find('div', id='img-content')
|
||||
# Parse the required content from the < div > tag
|
||||
rich_media_title = soup.find('h1', id='activity-name').text.strip() \
|
||||
if soup.find('h1', id='activity-name') \
|
||||
else soup.find('h1', class_='rich_media_title').text.strip()
|
||||
profile_nickname = soup.find('div', class_='wx_follow_nickname').text.strip()
|
||||
except Exception as e:
|
||||
logger.warning(f"not mp format: {url}\n{e}")
|
||||
# For mp.weixin.qq.com types, mp_crawler won't work, and most likely neither will the other two
|
||||
return {}, set(), []
|
||||
|
||||
if not rich_media_title or not profile_nickname:
|
||||
logger.warning(f"failed to analysis {url}, no title or profile_nickname")
|
||||
return {}, set(), []
|
||||
|
||||
# Parse text and image links within the content interval
|
||||
# because the structure of this part is completely different, and a separate analysis scheme needs to be written
|
||||
# (but the proportion of this type of article is not high).
|
||||
texts = []
|
||||
content_area = soup.find('div', id='js_content')
|
||||
if content_area:
|
||||
# 提取文本
|
||||
for section in content_area.find_all(['section', 'p'], recursive=False): # 遍历顶级section
|
||||
text = section.get_text(separator=' ', strip=True)
|
||||
if text and text not in texts:
|
||||
texts.append(text)
|
||||
cleaned_texts = [t for t in texts if t.strip()]
|
||||
content = '\n'.join(cleaned_texts)
|
||||
else:
|
||||
logger.warning(f"failed to analysis contents {url}")
|
||||
return {}, set(), []
|
||||
if content:
|
||||
content = f"[from {profile_nickname}]{content}"
|
||||
else:
|
||||
# If the content does not have it, but the summary has it, it means that it is a mp of the picture sharing type.
|
||||
# At this time, you can use the summary as the content.
|
||||
content = f"[from {profile_nickname}]{summary}"
|
||||
|
||||
article = {'author': profile_nickname,
|
||||
'publish_date': publish_time,
|
||||
'content': content}
|
||||
|
||||
return article, set(), []
|
@@ -1,14 +1,13 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from utils.pb_api import PbTalker
|
||||
from utils.general_utils import get_logger, extract_and_convert_dates
|
||||
from utils.general_utils import get_logger
|
||||
from agents.get_info import GeneralInfoExtractor
|
||||
from bs4 import BeautifulSoup
|
||||
import os
|
||||
import json
|
||||
import asyncio
|
||||
from custom_scraper import custom_scraper_map
|
||||
from urllib.parse import urlparse, urljoin
|
||||
import hashlib
|
||||
from scrapers import *
|
||||
from urllib.parse import urlparse
|
||||
from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavigationContext
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
@@ -22,13 +21,15 @@ screenshot_dir = os.path.join(project_dir, 'crawlee_storage', 'screenshots')
|
||||
wiseflow_logger = get_logger('general_process', project_dir)
|
||||
pb = PbTalker(wiseflow_logger)
|
||||
gie = GeneralInfoExtractor(pb, wiseflow_logger)
|
||||
existing_urls = {url['url'] for url in pb.read(collection_name='infos', fields=['url'])}
|
||||
one_month_ago = (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')
|
||||
existing_urls = {url['url'] for url in pb.read(collection_name='infos', fields=['url'], filter=f"created>='{one_month_ago}'")}
|
||||
|
||||
|
||||
async def save_to_pb(url: str, infos: list):
|
||||
async def save_to_pb(url: str, url_title: str, infos: list):
|
||||
# saving to pb process
|
||||
for info in infos:
|
||||
info['url'] = url
|
||||
info['url_title'] = url_title
|
||||
_ = pb.add(collection_name='infos', body=info)
|
||||
if not _:
|
||||
wiseflow_logger.error('add info failed, writing to cache_file')
|
||||
@@ -50,86 +51,52 @@ async def log_navigation_url(context: PlaywrightPreNavigationContext) -> None:
|
||||
|
||||
@crawler.router.default_handler
|
||||
async def request_handler(context: PlaywrightCrawlingContext) -> None:
|
||||
# context.log.info(f'Processing {context.request.url} ...')
|
||||
await context.page.wait_for_load_state('networkidle')
|
||||
await context.page.wait_for_timeout(2000)
|
||||
# Handle dialogs (alerts, confirms, prompts)
|
||||
async def handle_dialog(dialog):
|
||||
context.log.info(f'Closing dialog: {dialog.message}')
|
||||
await dialog.accept()
|
||||
|
||||
context.page.on('dialog', handle_dialog)
|
||||
await context.page.wait_for_load_state('networkidle')
|
||||
html = await context.page.inner_html('body')
|
||||
|
||||
context.log.info('successfully finish fetching')
|
||||
|
||||
html = await context.page.inner_html('head')
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
web_title = soup.find('title')
|
||||
if web_title:
|
||||
web_title = web_title.get_text().strip()
|
||||
else:
|
||||
web_title = ''
|
||||
|
||||
parsed_url = urlparse(context.request.url)
|
||||
domain = parsed_url.netloc
|
||||
if domain in custom_scraper_map:
|
||||
context.log.info(f'routed to customer scraper for {domain}')
|
||||
try:
|
||||
article, more_urls, infos = await custom_scraper_map[domain](html, context.request.url)
|
||||
if not article and not infos and not more_urls:
|
||||
wiseflow_logger.warning(f'{parsed_url} handled by customer scraper, but got nothing')
|
||||
except Exception as e:
|
||||
context.log.error(f'error occurred: {e}')
|
||||
wiseflow_logger.warning(f'handle {parsed_url} failed by customer scraper, so no info can be found')
|
||||
article, infos, more_urls = {}, [], set()
|
||||
|
||||
link_dict = more_urls if isinstance(more_urls, dict) else {}
|
||||
related_urls = more_urls if isinstance(more_urls, set) else set()
|
||||
if not infos and not related_urls:
|
||||
try:
|
||||
text = article.get('content', '')
|
||||
except Exception as e:
|
||||
wiseflow_logger.warning(f'customer scraper output article is not valid dict: {e}')
|
||||
text = ''
|
||||
|
||||
if not text:
|
||||
wiseflow_logger.warning(f'no content found in {parsed_url} by customer scraper, cannot use llm GIE, aborting')
|
||||
infos, related_urls = [], set()
|
||||
else:
|
||||
author = article.get('author', '')
|
||||
publish_date = article.get('publish_date', '')
|
||||
# get infos by llm
|
||||
try:
|
||||
infos, related_urls, author, publish_date = await gie(text, link_dict, context.request.url, author, publish_date)
|
||||
except Exception as e:
|
||||
wiseflow_logger.error(f'gie error occurred in processing: {e}')
|
||||
infos, related_urls = [], set()
|
||||
base_tag = soup.find('base', href=True)
|
||||
if base_tag and base_tag.get('href'):
|
||||
base_url = base_tag['href']
|
||||
else:
|
||||
# Extract data from the page.
|
||||
# future work: try to use a visual-llm do all the job...
|
||||
text = await context.page.inner_text('body')
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
links = soup.find_all('a', href=True)
|
||||
# if no base tag, use the current url as base url
|
||||
base_url = f"{parsed_url.scheme}://{domain}"
|
||||
link_dict = {}
|
||||
for a in links:
|
||||
new_url = a.get('href')
|
||||
if new_url.startswith('javascript:') or new_url.startswith('#') or new_url.startswith('mailto:'):
|
||||
continue
|
||||
if new_url in [context.request.url, base_url]:
|
||||
continue
|
||||
if new_url in existing_urls:
|
||||
continue
|
||||
t = a.text.strip()
|
||||
if new_url and t:
|
||||
link_dict[t] = urljoin(base_url, new_url)
|
||||
existing_urls.add(new_url)
|
||||
|
||||
publish_date = soup.find('div', class_='date').get_text(strip=True) if soup.find('div', class_='date') else None
|
||||
if publish_date:
|
||||
publish_date = extract_and_convert_dates(publish_date)
|
||||
author = soup.find('div', class_='author').get_text(strip=True) if soup.find('div', class_='author') else None
|
||||
if not author:
|
||||
author = soup.find('div', class_='source').get_text(strip=True) if soup.find('div', class_='source') else None
|
||||
# get infos by llm
|
||||
infos, related_urls, author, publish_date = await gie(text, link_dict, base_url, author, publish_date)
|
||||
html = await context.page.inner_html('body')
|
||||
if domain in custom_scrapers:
|
||||
action_dict, link_dict, text = custom_scrapers[domain](html, base_url)
|
||||
else:
|
||||
action_dict, link_dict, text = general_scraper(html, base_url)
|
||||
|
||||
if infos:
|
||||
await save_to_pb(context.request.url, infos)
|
||||
is_list, results = await gie(link_dict, text, base_url)
|
||||
|
||||
if related_urls:
|
||||
await context.add_requests(list(related_urls))
|
||||
if is_list and results:
|
||||
new_urls = [url for url in results if url != base_url and
|
||||
url != context.request.url and
|
||||
url not in existing_urls]
|
||||
if new_urls:
|
||||
await context.add_requests(new_urls)
|
||||
existing_urls.update(new_urls)
|
||||
return
|
||||
|
||||
if results:
|
||||
await save_to_pb(context.request.url, web_title, results)
|
||||
|
||||
# todo: use llm to determine next action
|
||||
"""
|
||||
|
core/scrapers/README.md (new file, 98 lines)
@@ -0,0 +1,98 @@
# wiseflow 自定义解析器说明

## 概述

wiseflow 致力于利用大模型的理解和分析能力自动化爬虫与信息提取流程,即所谓的“爬查一体”架构,因此 scraper 在 wiseflow 产品语境内与传统定位稍有不同,在这里它仅是指将爬虫获取的已渲染的 html 转化为便于大模型“阅读和理解”的数据前处理过程。

它的输入是已渲染的 html 编码(配合网站的 base_url),输出是三个值:

- `action_dict`:页面中可执行的元素项目,类型为 `dict`,格式如下:

```python
{
    'item_name': {"type": 交互类型, "values": ["value1", "value2"]}
}
```

这个内容供 llm 判断是否需要采取行动以及采取什么行动,从而可以进一步获得需要的信息。

- `link_dict`:页面中所有链接的字典,类型为 `dict`,格式如下:

```python
{
    'link_key': "link_url"
}
```

其中 key 是页面链接的唯一编号,general_scraper 中使用顺序编号生成,比如 url1, url2, url3 等。

这个内容供 llm 从中提取可能包含用户关注点信息的链接,被提取的链接会被加入爬虫获取队列。

- `text`:页面中所有文字的拼接,类型为 `str`

llm 从中提取用户关注点信息。

*有关这个机制其实我做过很多实验,包括让 llm 直接分析 html 编码(去除不必要标签或者分段),以及使用视觉大模型结合网页截图进行分析,最终发现综合考虑实现复杂度、抗干扰和异常能力、时间成本以及效果,目前方案是最优的。其中视觉大模型方案虽然我一直很期待,但实践中发现获取无干扰的网页截图并不容易,需要考虑各种情况,比如 cookie 警告等(很多 cookie 警告并不是简单的弹窗)、窗口大小设定以及因此带来的可能的多轮处理等……最关键的是,最终我发现如果网页文本处理得当,纯文本分析的效果是优于视觉大模型的……当然这里面的关键就是对 html 的处理,你不能指望把整个 html 直接喂给 llm,那自然得不到好的结果,甚至粗糙的提取全部的文本和链接送给 llm 也很难得到好的结果,并且这样还会很费 token,且拉长单次处理时间……*

wiseflow 目前搭配的 general_scraper 包括了对诸多常见 html 元素的处理,包括表格、列表、图片、视频、音频、表单、按钮、链接、文本等,它能将这些元素统一转为适合 llm “阅读和理解”的文本。尤其是对于链接的处理,general_scraper 将链接替换为带编号的 tag,而不是直接在文字中混入 url,一方面避免了对 llm 的干扰,同时也降低了 token 的消耗,并且保证了 url 的准确。

general_scraper 目前被测试适合绝大部分常见的页面,并且通过分析得到的 link_dict 和 text 比例特征,很容易判断出页面是列表页面还是文章页面。

不过如果对于特定站点需要使用特殊的提取逻辑,您可以参考 general_scraper 配置自定义解析器。

## 自定义解析器的开发

自定义解析器需要满足如下规格,否则无法被主流程整合:

- Scraper 应该是一个函数(而不是类)。

- 入参:
  - `html`:渲染后的页面 html 编码,类型为 `str`,scraper 可以直接使用 `bs`、`parsel` 等库进行解析;
  - `base_url`:当前页面的 base url 地址,类型为 `str`(仅是为了特殊操作,用不到的话可以直接忽略)。

*目前 wiseflow 仅支持使用内置的爬虫(Crawlee['playwright_crawler'])进行页面获取,因此入参都是由主流程传递给 scraper 的。*

- 出参:

- `action_dict`:页面中可执行的元素项目,类型为 `dict`,格式如下:

```python
{
    'item_name': {"type": 交互类型, "values": ["value1", "value2"]}
}
```

这个内容供 llm 判断是否需要采取行动以及采取什么行动,从而可以进一步获得需要的信息。

- `link_dict`:页面中所有链接的字典,类型为 `dict`,格式如下:

```python
{
    'link_key': "link_url"
}
```

- `text`:页面中所有文字的拼接,类型为 `str`

llm 从中提取用户关注点信息。

## 自定义解析器的注册

在 `core/scrapers/__init__.py` 中注册,参考:

```python
from .mp import mp_scraper

custom_scrapers = {'mp.weixin.qq.com': mp_scraper}
```

注意键使用域名,可以使用 `urllib.parse` 获取:

```python
from urllib.parse import urlparse

parsed_url = urlparse("site's url")
domain = parsed_url.netloc
```
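
As a quick orientation, below is a small usage sketch (not part of the repo) showing how the three outputs look for a tiny HTML fragment. The example URLs are made up, and the exact layout of `text` depends on how general_scraper handles each element type.

```python
# run from the core directory; core/scrapers/__init__.py re-exports general_scraper
from scrapers import general_scraper

html = '''
<body>
  <a href="/news/1.html">第一篇文章</a>
  <img src="/img/cover.png" alt="封面">
</body>
'''

action_dict, link_dict, text = general_scraper(html, 'https://example.com')

# link_dict maps numbered keys to absolute urls, e.g.
# {'url0': 'https://example.com/img/cover.png', 'url1': 'https://example.com/news/1.html'}
print(link_dict)

# text keeps the page text, with links and images replaced by their numbered [urlN] tags
print(text)
```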
core/scrapers/README_EN.md (new file, 97 lines)
@@ -0,0 +1,97 @@
# Wiseflow Custom Scraper Documentation

## Overview

wiseflow aims to automate web crawling and information extraction using the understanding and analysis capabilities of large language models, the so-called "integrated crawling and analysis" architecture. Therefore, the scraper in wiseflow's product context differs slightly from its traditional positioning: here it only refers to the data preprocessing that converts the rendered HTML fetched by the crawler into a format suitable for large models to "read and understand".

Its input is the rendered HTML code (along with the website's base_url), and it outputs three values:

- `action_dict`: Executable elements on the page, type `dict`, formatted as follows:

```python
{
    'item_name': {"type": interaction type, "values": ["value1", "value2"]}
}
```

This content allows the LLM to determine whether actions need to be taken and what actions to take, in order to obtain further required information.

- `link_dict`: Dictionary of all links on the page, type `dict`, formatted as follows:

```python
{
    'link_key': "link_url"
}
```

where the key is a unique identifier for the page link. In general_scraper, sequential numbering is used to generate these, such as url1, url2, url3, etc.

This content allows the LLM to extract links that may contain information relevant to user interests. The extracted links are added to the crawler's queue.

- `text`: Concatenation of all text on the page, type `str`

The LLM extracts user-relevant information from this.

*I've actually done many experiments with this mechanism, including having the LLM directly analyze HTML code (removing unnecessary tags or segmenting), and using visual large models combined with webpage screenshots for analysis. After considering implementation complexity, interference and exception handling capabilities, time cost, and effectiveness, the current approach proved optimal. While I've always been excited about the visual large model approach, in practice I found that obtaining interference-free webpage screenshots isn't easy - you need to consider various scenarios like cookie warnings (many of which aren't simple popups), window size settings, and the resulting potential multi-round processing... Most crucially, I discovered that with proper webpage text processing, pure text analysis performs better than visual large models... Of course, the key here is HTML processing - you can't expect good results from feeding entire HTML directly to the LLM, and even rough extraction of all text and links sent to the LLM rarely yields good results, while also consuming more tokens and extending processing time...*

The general_scraper currently used by wiseflow includes processing for many common HTML elements, including tables, lists, images, videos, audio, forms, buttons, links, text, etc. It converts these elements uniformly into text suitable for LLM "reading and understanding". In particular, for link processing, general_scraper replaces links with numbered tags rather than mixing URLs directly into the text, which both avoids interference with the LLM and reduces token consumption while ensuring URL accuracy.

general_scraper has been tested and found suitable for most common pages, and from the ratio of link_dict entries to text length it produces, it is easy to determine whether a page is a list page or an article page.
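
For orientation, the ratio heuristic the main process applies to these two outputs looks roughly like the following. This is a simplified sketch of `find_article_or_list` from `core/agents/get_info.py` as shown in this diff; the thresholds are the ones defined there, but the real function additionally drops links that end in common file extensions or bare top-level domains before counting.

```python
import re

# thresholds as defined in core/agents/get_info.py in this commit
list_judge_threshold = 0.007
valid_list_min_length = 10
min_content_length = 420

def looks_like_list_page(link_dict: dict, text: str) -> tuple[bool, bool]:
    """Return (is_list, need_more_info) from a scraper's link_dict and text."""
    # measure the visible text length without the <tag> and [urlN] markers
    text_no_tags = re.sub(r'<\w{1,5}>', '', text)
    text_no_urls = re.sub(r'\[url\d+]', '', text_no_tags)
    content_length = len(text_no_urls)

    # simplified: treat every url as valid (the real function filters more)
    valid_urls = set(link_dict.values())

    valid_url_rate = len(valid_urls) / max(content_length, 1)  # guard against empty text
    is_list = valid_url_rate > list_judge_threshold and len(valid_urls) > valid_list_min_length
    need_more_info = content_length < min_content_length
    return is_list, need_more_info
```

A page with many distinct links relative to its text is treated as a list page and mined for more URLs, while a short article page triggers the image-text extraction step before LLM extraction.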
However, if special extraction logic is needed for specific sites, you can refer to general_scraper and configure a custom scraper.

## Custom Scraper Development

A custom scraper must meet the following specifications to be integrated into the main process:

- The scraper should be a function (not a class).

- Input parameters:
  - `html`: Rendered page HTML code, type `str`. The scraper can directly use libraries like `bs` or `parsel` for parsing;
  - `base_url`: The current page's base URL address, type `str` (only for special operations, can be ignored if not needed).

*Currently wiseflow only supports using the built-in crawler (Crawlee['playwright_crawler']) for page retrieval, so input parameters are all passed to the scraper by the main process.*

- Output parameters:

- `action_dict`: Executable elements on the page, type `dict`, formatted as follows:

```python
{
    'item_name': {"type": interaction type, "values": ["value1", "value2"]}
}
```

This content allows the LLM to determine whether actions need to be taken and what actions to take, in order to obtain further required information.

- `link_dict`: Dictionary of all links on the page, type `dict`, formatted as follows:

```python
{
    'link_key': "link_url"
}
```

- `text`: Concatenation of all text on the page, type `str`

The LLM extracts user-relevant information from this.

## Custom Scraper Registration

Register in `core/scrapers/__init__.py`, for example:

```python
from .mp import mp_scraper

custom_scrapers = {'mp.weixin.qq.com': mp_scraper}
```

Note that the key should use the domain name, which can be obtained using `urllib.parse`:

```python
from urllib.parse import urlparse

parsed_url = urlparse("site's url")
domain = parsed_url.netloc
```
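
To make the contract concrete, below is a minimal sketch of such a scraper for a hypothetical domain. Only the `(action_dict, link_dict, text)` return shape, the `urlN` key convention, and the `custom_scrapers` registry come from this document; the selectors, the domain, and the `example_scraper` name are assumptions.

```python
from urllib.parse import urljoin

from bs4 import BeautifulSoup


def example_scraper(html: str, base_url: str) -> tuple[dict, dict, str]:
    soup = BeautifulSoup(html, 'html.parser')

    # this hypothetical site has no forms or buttons worth exposing to the LLM
    action_dict: dict = {}

    # replace each link with a numbered tag, mirroring general_scraper's urlN convention,
    # so the LLM sees "text[urlN]" instead of raw urls mixed into the text
    link_dict: dict = {}
    for a in soup.find_all('a', href=True):
        key = f"url{len(link_dict)}"
        link_dict[key] = urljoin(base_url, a['href'])
        a.replace_with(f"{a.get_text(strip=True)}[{key}]")

    text = soup.get_text('\n', strip=True)
    return action_dict, link_dict, text


# registration in core/scrapers/__init__.py (the domain is hypothetical)
custom_scrapers = {'news.example.com': example_scraper}
```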
core/scrapers/__init__.py (new file, 4 lines)
@@ -0,0 +1,4 @@
from .general_scraper import general_scraper


custom_scrapers = {}
core/scrapers/general_scraper.py (new file, 290 lines)
@@ -0,0 +1,290 @@
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urljoin
|
||||
import os
|
||||
|
||||
def general_scraper(html: str, base_url: str) -> tuple[dict, dict, str]:
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
|
||||
# remove common elements
|
||||
for selector in ['div#nav', 'div.header', 'div#footer', 'nav', 'header', 'footer']:
|
||||
elements = soup.select(selector)
|
||||
for element in elements:
|
||||
element.decompose()
|
||||
|
||||
action_dict = {}
|
||||
# handle form elements
|
||||
for form in soup.find_all('form', recursive=True):
|
||||
form_dict = {}
|
||||
for input_elem in form.find_all('input'):
|
||||
input_type = input_elem.get('type', 'text')
|
||||
input_name = input_elem.get('name', f'input_{len(action_dict)}')
|
||||
input_value = ' '.join([f"{k}={v}" for k, v in input_elem.attrs.items() if k not in ['type', 'name', 'form']])
|
||||
input_dict = {
|
||||
"type": input_type,
|
||||
"values": [input_value] if input_value else []
|
||||
}
|
||||
|
||||
# handle datalist
|
||||
if input_elem.get('list'):
|
||||
datalist = soup.find('datalist', id=input_elem['list'])
|
||||
if datalist:
|
||||
options = [opt.get('value', opt.text.strip()) for opt in datalist.find_all('option')]
|
||||
input_dict = {
|
||||
"type": "text",
|
||||
"values": [f"one of followings: {options}"]
|
||||
}
|
||||
|
||||
form_dict[input_name] = input_dict
|
||||
|
||||
for select in form.find_all('select'):
|
||||
select_name = select.get('name', f'select_{len(form_dict)}')
|
||||
options = [opt.get('value', opt.text.strip()) for opt in select.find_all('option')]
|
||||
form_dict[select_name] = {
|
||||
"type": "select",
|
||||
"values": options
|
||||
}
|
||||
|
||||
for textarea in form.find_all('textarea'):
|
||||
textarea_name = textarea.get('name', f'textarea_{len(form_dict)}')
|
||||
form_dict[textarea_name] = {
|
||||
"type": "textarea",
|
||||
"values": [textarea.text.strip()]
|
||||
}
|
||||
|
||||
if form_dict:
|
||||
form_id = form.get('id', f'form_{len(action_dict)}')
|
||||
action_dict[form_id] = form_dict
|
||||
|
||||
form.decompose()
|
||||
|
||||
# handle input elements that are not in any form
|
||||
for input_elem in soup.find_all('input', recursive=True):
|
||||
if input_elem.find_parent('form') is None:
|
||||
# check if the input is associated with a form by form attribute
|
||||
form_ids = input_elem.get('form', '').split()
|
||||
|
||||
# handle input element
|
||||
input_type = input_elem.get('type', 'text')
|
||||
input_name = input_elem.get('name', f'input_{len(action_dict)}')
|
||||
input_value = ' '.join([f"{k}={v}" for k, v in input_elem.attrs.items() if k not in ['type', 'name', 'form']])
|
||||
input_dict = {
|
||||
"type": input_type,
|
||||
"values": [input_value] if input_value else []
|
||||
}
|
||||
|
||||
# handle datalist
|
||||
if input_elem.get('list'):
|
||||
datalist = soup.find('datalist', id=input_elem['list'])
|
||||
if datalist:
|
||||
options = [opt.get('value', opt.text.strip()) for opt in datalist.find_all('option')]
|
||||
input_dict = {
|
||||
"type": "text",
|
||||
"values": [f"one of followings: {options}"]
|
||||
}
|
||||
|
||||
# decide the placement of the input element based on form attribute
|
||||
if form_ids:
|
||||
for form_id in form_ids:
|
||||
if form_id in action_dict:
|
||||
action_dict[form_id][input_name] = input_dict
|
||||
else:
|
||||
action_dict[form_id] = {input_name: input_dict}
|
||||
else:
|
||||
action_dict[input_name] = {"input": input_dict}
|
||||
|
||||
input_elem.decompose()
|
||||
|
||||
for button in soup.find_all(['button', 'input[type="button"]', 'input[type="submit"]'], recursive=True):
|
||||
button_name = button.get('name', '') or button.get('id', '') or button.text.strip()
|
||||
if not button_name:
|
||||
button_name = f'button_{len(action_dict)}'
|
||||
|
||||
button_type = button.get('type', 'button')
|
||||
button_value = button.get('value', button.text.strip())
|
||||
|
||||
action_dict[button_name] = {
|
||||
"button": {
|
||||
"type": button_type,
|
||||
"values": [button_value] if button_value else []
|
||||
}
|
||||
}
|
||||
|
||||
button.decompose()
|
||||
|
||||
# handle command elements
|
||||
for command in soup.find_all('command', recursive=True):
|
||||
command_name = command.get('name', '') or command.get('id', '') or command.text.strip()
|
||||
if not command_name:
|
||||
command_name = f'command_{len(action_dict)}'
|
||||
|
||||
command_type = command.get('type', 'command')
|
||||
command_value = command.get('value', command.text.strip())
|
||||
|
||||
action_dict[command_name] = {
|
||||
"command": {
|
||||
"type": command_type,
|
||||
"values": [command_value] if command_value else []
|
||||
}
|
||||
}
|
||||
|
||||
command.decompose()
|
||||
|
||||
link_dict = {}
for img in soup.find_all('img', src=True, recursive=True):
src = img.get('src')
if src.startswith('#') or src.startswith('about:blank'):
src = None
text = img.get('alt', '').strip()
if src:
if not src.startswith(('http://', 'https://')):
src = urljoin(base_url, src)
key = f"url{len(link_dict)}"
link_dict[key] = src
text = f"{text}<img>[{key}]"

# find all area urls related to this img
area_urls = set()
if img.get('usemap'):
# remove the # at the beginning of the map name
map_name = img.get('usemap').lstrip('#')
# find the map tag
map_tag = soup.find('map', {'name': map_name})
if map_tag:
# get all area tags under the map
for area in map_tag.find_all('area', href=True):
area_href = area.get('href')
if area_href.startswith('javascript:') or area_href.startswith('#') or area_href.startswith('mailto:') or area_href.startswith('data:') or area_href.startswith('about:blank'):
area_href = None
if area_href:
if not area_href.startswith(('http://', 'https://')):
area_href = urljoin(base_url, area_href)
area_urls.add(area_href)
area.decompose()
# delete the whole map tag
map_tag.decompose()
for area_url in area_urls:
key = f"url{len(link_dict)}"
link_dict[key] = area_url
text = f"{text}[{key}]"

img.replace_with(f"-{text}")

for media in soup.find_all(['video', 'audio', 'source', 'embed', 'iframe', 'figure'], src=True, recursive=True):
src = media.get('src')
if src.startswith('javascript:') or src.startswith('#') or src.startswith('mailto:') or src.startswith('data:') or src.startswith('about:blank'):
src = None
text = media.get('alt', '').strip() or media.get_text().strip()
if src:
# convert relative path to full url
if not src.startswith(('http://', 'https://')):
src = urljoin(base_url, src)
key = f"url{len(link_dict)}"
link_dict[key] = src
ext = os.path.splitext(src)[1].lstrip('.') or media.name
text = f"{text}<{ext}>[{key}]"

media.replace_with(f"-{text}")

for obj in soup.find_all('object', data=True, recursive=True):
data = obj.get('data')
if data.startswith('javascript:') or data.startswith('#') or data.startswith('mailto:') or data.startswith('data:') or data.startswith('about:blank'):
data = None
text = obj.get('title', '').strip() or obj.get_text().strip()
if data:
# convert relative path to full url
if not data.startswith(('http://', 'https://')):
data = urljoin(base_url, data)
key = f"url{len(link_dict)}"
link_dict[key] = data
ext = os.path.splitext(data)[1].lstrip('.') or 'object'
text = f"{text}<{ext}>[{key}]"

obj.replace_with(f"-{text}")

# process <a> links last, so that the image and media markers they contain are kept in the link text
for a in soup.find_all('a', href=True, recursive=True):
href = a.get('href')
if href.startswith('javascript:') or href.startswith('#') or href.startswith('mailto:') or href.startswith('data:') or href.startswith('about:blank'):
href = None
if href:
text = a.get_text().strip() or '-'
if not href.startswith(('http://', 'https://')):
href = urljoin(base_url, href)
key = f"url{len(link_dict)}"
link_dict[key] = href
a.replace_with(f"{text}[{key}]")
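
Aside (not part of the diff): a compressed, runnable re-statement of the <img> and <a> branches above on a toy fragment, happy path only, to show the "-alt<img>[urlN]" and "text[urlN]" markers that end up in the flattened text. The fragment and base_url are invented.

from urllib.parse import urljoin
from bs4 import BeautifulSoup

base_url = "https://example.com/news/"
soup = BeautifulSoup('<p>See <a href="/post/42">the post</a> and <img src="pic.png" alt="photo"></p>', "html.parser")

link_dict = {}
# same pattern as the passes above, reduced to the happy path for illustration
for img in soup.find_all("img", src=True):
    key = f"url{len(link_dict)}"
    link_dict[key] = urljoin(base_url, img["src"])
    img.replace_with(f"-{img.get('alt', '').strip()}<img>[{key}]")
for a in soup.find_all("a", href=True):
    key = f"url{len(link_dict)}"
    link_dict[key] = urljoin(base_url, a["href"])
    a.replace_with(f"{a.get_text().strip()}[{key}]")

print(soup.get_text())  # See the post[url1] and -photo<img>[url0]
print(link_dict)        # {'url0': 'https://example.com/news/pic.png', 'url1': 'https://example.com/post/42'}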
# handle headings
for i in range(1, 7): # h1 to h6
for heading in soup.find_all(f'h{i}', recursive=False):
text = heading.get_text().strip()
heading.replace_with(f"{'#' * i} {text}\n")

# replace <br>/<hr>/<wbr> variants with newlines
for br in soup.find_all(['br', 'br/', 'br /', 'hr', 'hr/', 'hr /', 'wbr'], recursive=True):
br.replace_with('\n')

# handle lists
for list_tag in soup.find_all(['ul', 'ol'], recursive=True):
list_text = []
for idx, item in enumerate(list_tag.find_all('li')):
list_text.append(f"{idx + 1}. {item.get_text().strip()}")
list_text = '\t'.join(list_text)
list_tag.replace_with(f"{list_text}\n")

# handle spans - merge span text with surrounding text
for span in soup.find_all('span', recursive=True):
span.replace_with(span.get_text().strip())

# handle strikethrough text
for del_tag in soup.find_all(['del', 's'], recursive=True):
del_text = del_tag.get_text().strip()
if del_text:
del_tag.replace_with(f"{del_text}(maybe_outdated)")
else:
del_tag.decompose()

# handle tables
for table in soup.find_all('table', recursive=True):
table_text = []

# handle caption
caption = table.find('caption')
if caption:
table_text.append(caption.get_text().strip())

# get headers
headers = []
for th in table.find_all('th'):
headers.append(th.get_text().strip())

# handle all rows (including tbody and tfoot)
for row in table.find_all('tr'):
# get the first cell value
# try to find th as first_val
first_cell = row.find(['th', 'td'])
if not first_cell:
continue
first_val = first_cell.get_text().strip()
cells = row.find_all('td')
if not cells:
continue

# handle remaining cells
for idx, cell in enumerate(cells):
cell_text = cell.get_text().strip()
if not cell_text or cell_text == first_val:
continue

header_text = headers[idx] if idx < len(headers) else ''
cell_str = f"{first_val}-{header_text}-{cell_text}"
table_text.append(cell_str)

# replace the table with the processed text
table_text = '\n'.join(table_text)
table.replace_with(f"\n{table_text}\n")
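
Aside (not part of the diff): the table pass flattens each data cell into a "first-cell-header-cell" string. A self-contained toy re-run of the same rules, with invented table content, for illustration:

from bs4 import BeautifulSoup

html = """<table><caption>Models</caption>
<tr><th>Model</th><th>Params</th><th>Context</th></tr>
<tr><td>Qwen2.5</td><td>7B</td><td>128k</td></tr></table>"""
table = BeautifulSoup(html, "html.parser").find("table")

# same flattening rules as above, applied to the toy table
table_text = [table.find("caption").get_text().strip()]
headers = [th.get_text().strip() for th in table.find_all("th")]
for row in table.find_all("tr"):
    cells = row.find_all("td")
    if not cells:
        continue  # header-only row
    first_val = row.find(["th", "td"]).get_text().strip()
    for idx, cell in enumerate(cells):
        cell_text = cell.get_text().strip()
        if not cell_text or cell_text == first_val:
            continue
        table_text.append(f"{first_val}-{headers[idx] if idx < len(headers) else ''}-{cell_text}")

print("\n".join(table_text))
# Models
# Qwen2.5-Params-7B
# Qwen2.5-Context-128k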
html_text = soup.get_text(strip=False, separator='\n')

return action_dict, link_dict, html_text
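
A minimal, self-contained sketch of the contract between the returned html_text and link_dict: every [urlN] placeholder in the text is a key into link_dict. The values below are invented, not output of the real function.

import re

link_dict = {"url0": "https://example.com/logo.png", "url1": "https://example.com/post/42"}
html_text = "-site logo<img>[url0]\nRead the announcement[url1]\nPlain paragraph text."

# resolve each placeholder back to its absolute URL
for key in re.findall(r"\[(url\d+)]", html_text):
    print(key, "->", link_dict[key])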
pb/pb_migrations/1735270046_updated_infos.js (new file, 29 lines)
@ -0,0 +1,29 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((app) => {
const collection = app.findCollectionByNameOrId("pbc_629947526")

// add field
collection.fields.addAt(6, new Field({
"autogeneratePattern": "",
"hidden": false,
"id": "text4002551580",
"max": 0,
"min": 0,
"name": "url_title",
"pattern": "",
"presentable": false,
"primaryKey": false,
"required": false,
"system": false,
"type": "text"
}))

return app.save(collection)
}, (app) => {
const collection = app.findCollectionByNameOrId("pbc_629947526")

// remove field
collection.fields.removeById("text4002551580")

return app.save(collection)
})
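
This migration adds an optional text field url_title to the infos collection (pbc_629947526), with a matching rollback that removes it. A hedged sketch of how a saved record might now carry the field; pb.add(collection_name, body) and the other field names are assumptions about PbTalker and the infos schema, not confirmed by this diff.

# Assumption: pb is a PbTalker instance exposing add(collection_name, body) -> record id.
info_body = {
    "url": "https://example.com/post/42",        # illustrative
    "url_title": "Example announcement title",   # field introduced by this migration
    "content": "focus-related text extracted from the page",
}
record_id = pb.add(collection_name="infos", body=info_body)
if not record_id:
    logger.warning("failed to save info record")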
@ -31,16 +31,16 @@ def find_article_or_list(link_dict, text) -> (bool, bool, str):
text_no_urls = re.sub(r'\[url\d+]', '', text_no_tags)
content_length = len(text_no_urls)

valid_url_count = 0
valid_url = set()
for url in link_dict.values():
url_lower = url.lower()
has_common_ext = any(url_lower.endswith(ext) for ext in common_file_exts)
has_common_tld = any(url_lower.endswith(tld) or url_lower.endswith(tld + '/') for tld in common_tlds)
if not has_common_ext and not has_common_tld:
valid_url_count += 1
valid_url.add(url)

valid_url_rate = valid_url_count / content_length
is_list = valid_url_rate > 0.007 and valid_url_count > valid_list_min_length
valid_url_rate = len(valid_url) / content_length
is_list = valid_url_rate > 0.007 and len(valid_url) > valid_list_min_length
need_more_info = content_length < min_content_length
return is_list, need_more_info, text
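
The substantive change in this hunk is that the list heuristic now counts distinct URLs (len(valid_url)) rather than every occurrence (valid_url_count). A toy illustration of when the two disagree; the URLs are made up:

link_dict = {
    "url0": "https://example.com/post/1",
    "url1": "https://example.com/post/1",   # same target linked twice on the page
    "url2": "https://example.com/post/2",
}
valid_url = set(link_dict.values())
print(len(link_dict), len(valid_url))   # 3 vs 2: duplicate links no longer inflate valid_url_rate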
@ -8,7 +8,7 @@ text_info_system = '''作为信息提取助手,你的任务是从给定的网
- 如果关注点有进一步的解释,确保提取的内容符合这些解释的范围
- 忠于原文,你的任务是从网页文本中抽取相关信息,而不是提炼、总结和改写
- 对于最终输出的信息,请保证主体、时间、地点等关键要素的清晰明确,为此可能需要综合上下文进行提取
- 如果提取的内容中包括类似“<mp4>”、“[url1]”这样的片段,务必保留'''
- 如果提取的内容中包括类似“<mp4>”、“[url1]”这样的片段,务必原样保留'''

text_info_suffix = '''请先复述一遍关注点及其解释,再对原文进行分析。如果网页文本中包含关注点相关的内容,请按照以下json格式输出提取的信息:
{"focus": 关注点名称, "content": 提取的内容}
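
The suffix above asks the model to restate the focus points and then emit a JSON object per finding. A minimal sketch of consuming that output; the reply string is invented, and using json_repair here is an assumption (chosen because it tolerates slightly malformed JSON from smaller models), not the code path changed in this diff.

from json_repair import repair_json

# invented model reply: restatement first, then the JSON payload required by text_info_suffix
reply = '关注点:新产品发布……\n{"focus": "新产品发布", "content": "某公司于12月发布了新模型[url3]"}'
payload = reply[reply.find('{'):]
result = repair_json(payload, return_objects=True)   # returns a Python dict instead of a string
print(result["focus"], "->", result["content"])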