mirror of https://github.com/TeamWiseFlow/wiseflow.git
synced 2025-01-23 02:20:20 +08:00

v0.3.6 mockup

This commit is contained in:
parent 86cabc4e28
commit 1f9b6d5d6c
@@ -1,49 +1,41 @@
# -*- coding: utf-8 -*-
from loguru import logger
import os, re, sys
from datetime import datetime
from urllib.parse import urlparse
import json_repair

current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir) # get parent dir
sys.path.append(project_root)

from utils.deep_scraper import common_chars, common_file_exts, common_tlds
from utils.pb_api import PbTalker
from llms.openai_wrapper import openai_llm as llm
# from llms.siliconflow_wrapper import sfa_llm # or other llm wrapper
from utils.general_utils import is_chinese, extract_and_convert_dates, extract_urls
from core.utils.pb_api import PbTalker
from core.llms.openai_wrapper import openai_llm as llm
# from core.llms.siliconflow_wrapper import sfa_llm # or other llm wrapper
from core.utils.general_utils import is_chinese, extract_and_convert_dates


async def get_author_and_publish_date(text: str) -> tuple[str, str]:
async def get_author_and_publish_date(text: str, model: str) -> tuple[str, str]:
if not text:
return "", ""

if len(text) > 1024:
text = f'{text[:500]}......{text[-500:]}'
text = f'{text[:999]}......'

system_prompt = "As an information extraction assistant, your task is to accurately extract the source (or author) and publication date from the given webpage text. It is important to adhere to extracting the information directly from the original text. If the original text does not contain a particular piece of information, please replace it with NA"
suffix = '''Please output the extracted information in the following JSON format:
{"source": source or article author (use "NA" if this information cannot be extracted), "publish_date": extracted publication date (keep only the year, month, and day; use "NA" if this information cannot be extracted)}'''
suffix = '''Please output the extracted information in the following format (output only the result, no other content):
"""source or article author (use "NA" if this information cannot be extracted)//extracted publication date (keep only the year, month, and day; use "NA" if this information cannot be extracted)"""'''

content = f'<text>\n{text}\n</text>\n\n{suffix}'
llm_output = await llm([{'role': 'system', 'content': system_prompt}, {'role': 'user', 'content': content}],
model=self.secondary_model, max_tokens=50, temperature=0.1, response_format={"type": "json_object"})
model=model, max_tokens=50, temperature=0.1, response_format={"type": "json_object"})

if not llm_output:
ap_ = llm_output.strip().strip('"').strip('//')

if '//' not in ap_:
print(f"failed to parse from llm output: {ap_}")
return '', ''

result = json_repair.repair_json(llm_output, return_objects=True)
ap = ap_.split('//')

if not isinstance(result, dict):
print("failed to parse from llm output")
return '', ''
if 'source' not in result or 'publish_date' not in result:
print("failed to parse from llm output")
return '', ''

return result['source'], extract_and_convert_dates(result['publish_date'])
return ap[0], extract_and_convert_dates(ap[1])


async def extract_info_from_img(task: list, vl_model: str) -> dict:
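For illustration, a minimal standalone sketch of how the new //-delimited reply is split into source and publish date, mirroring the parsing above; the sample reply string is assumed:

# Illustration only: sample_reply is an assumed model response following the new suffix format above.
sample_reply = '"The Example Times//2025-01-20"'
ap_ = sample_reply.strip().strip('"').strip('//')
if '//' not in ap_:
    source, publish_date = '', ''
else:
    ap = ap_.split('//')
    source, publish_date = ap[0], ap[1]
print(source, publish_date)  # The Example Times 2025-01-20
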
@@ -63,19 +55,10 @@ class GeneralInfoExtractor:
self.pb = pb
self.logger = _logger
self.model = os.environ.get("PRIMARY_MODEL", "")
self.secondary_model = os.environ.get("SECONDARY_MODEL", "")

if not self.model:
self.logger.error("PRIMARY_MODEL not set, can't continue")
raise ValueError("PRIMARY_MODEL not set, please set it in environment variables or edit core/.env")

if not self.secondary_model:
self.logger.warning("SECONDARY_MODEL not set, will use primary model for secondary model, pls attention the request rate limit")
self.secondary_model = self.model

self.vl_model = os.environ.get("VL_MODEL", "")
if not self.vl_model:
self.logger.warning("VL_MODEL not set, will skip extracting info from img, some info may be lost!")

# collect tags user set in pb database and determine the system prompt language based on tags
focus_data = pb.read(collection_name='focus_points', filter=f'activated=True')
@@ -93,268 +76,182 @@ class GeneralInfoExtractor:
for item in focus_data:
tag = item["focuspoint"]
expl = item["explanation"]
focus_statement = f"{focus_statement}#{tag}\n"
focus_statement = f"{focus_statement}//{tag}//\n"
if expl:
focus_statement = f"{focus_statement}解释:{expl}\n"
if is_chinese(expl):
focus_statement = f"{focus_statement}解释:{expl}\n"
else:
focus_statement = f"{focus_statement}Explanation: {expl}\n"

if is_chinese(focus_statement):
self.get_info_prompt = f'''作为信息提取助手,你的任务是从给定的网页文本中抽取任何与下列关注点之一相关的信息。关注点列表及其解释如下:
self.get_info_prompt = f'''你将被给到一段使用<text></text>标签包裹的网页文本,请分别按如下关注点对网页文本提炼摘要。关注点列表及其解释如下:

{focus_statement}\n
在进行信息提取时,请遵循以下原则:
在提炼摘要时,请遵循以下原则:
- 理解每个关注点的含义以及进一步的解释(如有),确保摘要与关注点强相关并符合解释(如有)的范围
- 摘要应当详实、充分
- 摘要信息务必忠于原文'''

- 理解每个关注点的含义,确保提取的内容至少与其中之一相关
- 如果关注点有进一步的解释,确保提取的内容符合这些解释的范围
- 忠于原文,你的任务是从网页文本中抽取相关信息,而不是提炼、总结和改写
- 对于最终输出的信息,请保证主体、时间、地点等关键要素的清晰明确,为此可能需要综合上下文进行提取
- 如果提取的内容中包括类似“<mp4>”、“[url1]”这样的片段,务必原样保留'''

self.get_info_suffix = '''请先复述一遍关注点及其解释,再对原文进行分析。如果网页文本中包含关注点相关的内容,请按照以下json格式输出提取的信息:
{"focus": 关注点名称, "content": 提取的内容}

如果有多条相关信息,请按一行一条的格式输出,最终输出的结果整体用三引号包裹,三引号内不要有其他内容,如下是输出格式示例:
self.get_info_suffix = '''请对关注点逐一生成摘要,不要遗漏任何关注点,如果网页文本与关注点无关,可以对应输出"NA"。输出结果整体用三引号包裹,三引号内不要有其他内容。如下是输出格式示例:
"""
{"focus": 关注点1名称, "content": 提取内容1}
{"focus": 关注点2名称, "content": 提取内容2}
//关注点1//
摘要1
//关注点2//
摘要2
//关注点3//
NA
...
"""

如果网页文本中不包含任何相关的信息,请保证三引号内为空。'''

self.get_more_link_prompt = f'''你将被给到一段处理过的网页文本,在这些文本中所有的url链接都已经被替换为类似"[url120]"这样的标签,并置于与其关联的文本后面。
你的任务是从网页文本中抽取任何与下列关注点之一相关的文本片段。关注点列表及其解释如下:
"""'''
self.get_more_link_prompt = f'''你将被给到数行格式为"<编号>//内容//"的文本,你的任务是逐条分析这些文本,并分别与如下关注点之一相关联。关注点列表及其解释如下:

{focus_statement}\n
在进行抽取时,请遵循以下原则:
在进行关联分析时,请遵循以下原则:

- 理解每个关注点的含义,确保提取的内容至少与其中之一相关
- 如果关注点有进一步的解释,确保提取的内容符合这些解释的范围
- 只抽取以标签(类似"[url120]"这样)结尾的文本片段
- 维持抽取出的文本片段的原样,尤其不要遗漏其后的标签'''
- 理解每个关注点的含义
- 如果关注点有进一步的解释,确保提取的内容符合这些解释的范围'''

self.get_more_link_suffix = '''请先复述一遍关注点及其解释,再对原文逐行进行抽取,最终将挑选出的文本片段按一行一条的格式输出,并整体用三引号包裹,三引号内不要有其他内容,如下是输出格式示例:
self.get_more_link_suffix = '''请分行逐条输出结果,每一条的输出格式为"<编号>//关注点名称//",如果某条内容不与任何关注点相关,请输出"<编号>//NA//"。输出结果整体用三引号包裹,三引号内不要有其他内容。如下是输出格式示例:
"""
文本1
文本2
<t1>//关注点1名称//
<t2>//关注点2名称//
<t3>//NA//
...
"""'''

self.info_judge_prompt = '''判断给定的信息是否与网页文本相符。信息将用标签<info></info>包裹,网页文本则用<text></text>包裹。请遵循如下工作流程:
1、尝试找出网页文本中所有与信息相关的片段(有多少找多少,没有的话则跳过);
2、判断信息是否与这些片段在关键要素上一致,请特别注意主语、日期、地点以及数字这些。'''

self.info_judge_suffix = '先输出找到的所有文本片段,再输出最终结论(仅为“是”或“否”)'
else:
self.get_info_prompt = f'''As an information extraction assistant, your task is to extract any information from the given webpage text that relates to at least one of the following focus points. The list of focus points and their explanations are as follows:
self.get_info_prompt = f'''You will be given a webpage text wrapped in <text></text> tags. Please extract summaries from the text according to the following focus points. The list of focus points and their explanations are as follows:

{focus_statement}\n
When extracting information, please follow these principles:
When extracting summaries, please follow these principles:
- Understand the meaning of each focus point and its explanation (if any), ensure the summary strongly relates to the focus point and aligns with the explanation (if any)
- The summary should be detailed and comprehensive
- The summary should be faithful to the original text'''

- Understand the meaning of each focus point and ensure the extracted content relates to at least one of them
- If a focus point has further explanations, ensure the extracted content aligns with those explanations
- Stay faithful to the original text - your task is to extract relevant information, not to refine, summarize or rewrite
- For the final output, ensure key elements like subject, time, location etc. are clearly specified, which may require synthesizing context
- If the extracted content includes fragments like "<mp4>" or "[url1]", make sure to preserve them exactly as they appear'''

self.get_info_suffix = '''First, please restate the focus points and their explanations, then analyze the original text. If the webpage text contains content related to the focus points, please output the extracted information in the following JSON format:
{"focus": focus point name, "content": extracted content}

If there are multiple relevant pieces of information, output them one per line, with the entire output wrapped in triple quotes. There should be no other content within the triple quotes. Here is an example of the output format:
self.get_info_suffix = '''Please generate summaries for each focus point, don't miss any focus points. If the webpage text is not related to a focus point, output "NA" for that point. The entire output should be wrapped in triple quotes with no other content inside. Here is an example of the output format:
"""
{"focus": focus point 1 name, "content": extracted content 1}
{"focus": focus point 2 name, "content": extracted content 2}
...
"""

If the webpage text does not contain any relevant information, ensure the content within the triple quotes is empty.'''

self.get_more_link_prompt = f'''You will be given a processed webpage text where all URL links have been replaced with tags like "[url120]" and placed after their associated text.
Your task is to extract any text fragments from the webpage text that relate to any of the following focus points. The list of focus points and their explanations are as follows:

{focus_statement}\n
When extracting, please follow these principles:

- Understand the meaning of each focus point and ensure the extracted content relates to at least one of them
- If a focus point has further explanations, ensure the extracted content aligns with those explanations
- Only extract text fragments that end with tags (like "[url120]")
- Maintain the text fragments exactly as they appear, especially don't omit their trailing tags'''

self.get_more_link_suffix = '''First, please restate the focus points and their explanations, then analyze the original text line by line. Finally, output the selected text fragments one per line, with the entire output wrapped in triple quotes. There should be no other content within the triple quotes. Here is an example of the output format:
"""
text1
text2
//Focus Point 1//
Summary 1
//Focus Point 2//
Summary 2
//Focus Point 3//
NA
...
"""'''

self.info_judge_prompt = '''Determine whether the given information matches the webpage text. The information will be wrapped in <info></info> tags, and the webpage text will be wrapped in <text></text> tags. Please follow this workflow:
1. Try to find all text fragments in the webpage text that are related to the information (find as many as possible, skip if none);
2. Determine whether the information is consistent with these fragments in key elements, paying special attention to subjects, dates, locations, and numbers.'''
self.get_more_link_prompt = f'''You will be given several lines of text in the format "<index>//content//". Your task is to analyze each line and associate it with one of the following focus points. The list of focus points and their explanations are as follows:

self.info_judge_suffix = 'First, output all found text fragments, then output the final conclusion (only "Y" or "N").'
{focus_statement}\n
When performing the association analysis, please follow these principles:

async def _generate_results(self, text: str, mode: str) -> set:
- Understand the meaning of each focus point
- If a focus point has further explanation, ensure the extracted content aligns with the scope of these explanations'''

self.get_more_link_suffix = '''Please output the results line by line. Each line should be in the format "<index>//focus point name//". If a line is not related to any focus point, output "<index>//NA//". The entire output should be wrapped in triple quotes with no other content inside. Here is an example of the output format:
"""
<t1>//Focus Point 1//
<t2>//Focus Point 2//
<t3>//NA//
...
"""'''
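As a quick standalone illustration (the sample reply text is assumed), a reply that follows the new summary format above can be unwrapped with the same triple-quote regex that _generate_results applies below:

import re

# Assumed sample reply following the summary format requested above; the regex is the one
# _generate_results uses below to unwrap the triple-quoted block.
sample_reply = '''Here is the result:
"""
//Focus Point 1//
Summary for point 1
//Focus Point 2//
NA
"""
'''
blocks = re.findall(r'\"\"\"(.*?)\"\"\"', sample_reply, re.DOTALL)
print(blocks[0].strip().split('\n'))  # ['//Focus Point 1//', 'Summary for point 1', '//Focus Point 2//', 'NA']
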
async def _generate_results(self, lines: list, mode: str) -> set:
if mode == 'get_info':
system_prompt = self.get_info_prompt
suffix = self.get_info_suffix
batch_size = 2048
batch_size = 5000
elif mode == 'get_link':
system_prompt = self.get_more_link_prompt
suffix = self.get_more_link_suffix
batch_size = 1024
batch_size = 2048
else:
self.logger.error(f"unknown mode: {mode}")
return set()

lines = text.split('\n')
cache = set()
text_batch = ''
for line in lines:
text_batch = f'{text_batch}\n{line}'
text_batch = f'{text_batch}{line}\n'
if len(text_batch) > batch_size:
content = f'<text>\n{text_batch}\n</text>\n\n{suffix}'
content = f'<text>\n{text_batch}</text>\n\n{suffix}'
result = await llm(
[{'role': 'system', 'content': system_prompt}, {'role': 'user', 'content': content}],
model=self.model, temperature=0.1)
self.logger.debug(f"llm output: {result}")
# self.logger.debug(f"llm output: {result}")
result = re.findall(r'\"\"\"(.*?)\"\"\"', result, re.DOTALL)
if not result:
self.logger.warning("bad generate result")
text_batch = ''
continue
for item in result:
item = item.strip()
if not item:
continue
item = item.split('\n')
cache.update(item)
cache.add(result[-1])
text_batch = ''

if text_batch:
content = f'<text>\n{text_batch}\n</text>\n\n{suffix}'
content = f'<text>\n{text_batch}</text>\n\n{suffix}'
result = await llm(
[{'role': 'system', 'content': system_prompt}, {'role': 'user', 'content': content}],
model=self.model, temperature=0.1)
self.logger.debug(f"llm output: {result}")
# self.logger.debug(f"llm output: {result}")
result = re.findall(r'\"\"\"(.*?)\"\"\"', result, re.DOTALL)
if not result:
self.logger.warning("bad generate result")
return cache
for item in result:
item = item.strip()
if not item:
continue
item = item.split('\n')
cache.update(item)
cache.add(result[-1])
return cache
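A standalone toy illustration of the batching rule above (lines and batch_size are made-up values): a batch is flushed only once its length exceeds batch_size, and any remainder is sent at the end.

# Toy illustration of the batching in _generate_results above; lines and batch_size are assumed values.
lines = ['line one', 'line two', 'line three', 'line four']
batch_size = 20
batches, text_batch = [], ''
for line in lines:
    text_batch = f'{text_batch}{line}\n'
    if len(text_batch) > batch_size:
        batches.append(text_batch)
        text_batch = ''
if text_batch:
    batches.append(text_batch)
print(len(batches))  # 2: the first flush happens once the batch grows past batch_size
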
async def get_more_related_urls(self, link_dict: dict, text: str) -> list[str]:
raw_result = await self._generate_results(text, 'get_link')
final_result = set()
for item in raw_result:
if '[url' not in item:
continue
url_tags = re.findall(r'\[url\d+]', item)
if not url_tags:
self.logger.warning(f"bad generate result: {item}")
continue
for url_tag in url_tags:
url_tag = url_tag[1:-1]
if url_tag not in link_dict:
self.logger.warning(f"bad generate result: {item}")
continue
result_url = link_dict[url_tag]
if any(result_url.endswith(tld) or result_url.endswith(tld + '/') for tld in common_tlds):
continue
if any(result_url.endswith(ext) for ext in common_file_exts if ext not in ['jpg', 'jpeg', 'png']):
continue
final_result.add(result_url)
return list(final_result)
async def get_more_related_urls(self, link_dict: dict) -> set:
_to_be_processed = []
link_map = {}
for i, (url, des) in enumerate(link_dict.items()):
des = des.replace('\n', ' ')
_to_be_processed.append(f'<t{i+1}>//{des}//')
link_map[f'<t{i+1}'] = url

async def get_info(self, link_dict: dict, text: str, info_pre_fix: str) -> list[dict]:
raw_result = await self._generate_results(text, 'get_info')
raw_result = await self._generate_results(_to_be_processed, 'get_link')
final_result = set()
for result in raw_result:
for item in result.split('\n'):
if not item:
continue
segs = item.split('>')
if len(segs) != 2:
self.logger.debug(f"bad generate result: {item}")
continue
_index, focus = segs
_index = _index.strip()
focus = focus.strip().strip('//')
if focus == 'NA':
continue
if focus not in self.focus_dict or _index not in link_map:
self.logger.debug(f"bad generate result: {item}")
continue
self.logger.debug(f"{link_map[_index]} selected")
final_result.add(link_map[_index])
return final_result
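A standalone illustration of the <tN>//description// round trip used by the new get_more_related_urls above; link_dict, focus_dict and the model reply lines are assumed sample data:

# Standalone illustration only; all values below are assumed samples.
link_dict = {'https://example.com/a': 'article about topic A', 'https://example.com/b': 'unrelated promo'}
focus_dict = {'Topic A': 'tag_id_1'}

_to_be_processed = []
link_map = {}
for i, (url, des) in enumerate(link_dict.items()):
    des = des.replace('\n', ' ')
    _to_be_processed.append(f'<t{i+1}>//{des}//')
    link_map[f'<t{i+1}'] = url  # key keeps the '<t...' prefix, matching the split('>') parsing

sample_reply_lines = ['<t1>//Topic A//', '<t2>//NA//']
selected = set()
for item in sample_reply_lines:
    _index, focus = item.split('>')
    focus = focus.strip().strip('//')
    if focus != 'NA' and focus in focus_dict and _index in link_map:
        selected.add(link_map[_index])
print(selected)  # {'https://example.com/a'}
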
async def get_info(self, text: str, text_links: dict, info_pre_fix: str) -> list[dict]:
raw_result = await self._generate_results(text.split('\n'), 'get_info')
final = []
for item in raw_result:
result = json_repair.repair_json(item, return_objects=True)
if not isinstance(result, dict):
self.logger.warning(f"bad generate result: {item}")
continue
if not result:
continue
if 'focus' not in result or 'content' not in result:
self.logger.warning(f"bad generate result: {item}")
continue
if not item['focus'] or item['focus'] not in self.focus_dict:
self.logger.warning(f"bad generate result: {item}")
continue
if not item['content']:
self.logger.warning(f"bad generate result: {item}")
continue
if item['content'] in link_dict:
continue
segs = item.split('//')
i = 0
while i < len(segs) - 1:
focus = segs[i].strip()
if not focus:
i += 1
continue
if focus not in self.focus_dict:
self.logger.debug(f"bad generate result: {item}")
i += 1
continue
content = segs[i+1].strip().strip('摘要').strip(':').strip(':')
i += 2
if not content or content == 'NA':
continue
"""
maybe we can use embedding retrieval to judge
"""
self.logger.debug(f"get info: {focus}: {content}")

judge = await llm([{'role': 'system', 'content': self.info_judge_prompt},
{'role': 'user', 'content': f'<info>\n{item["content"]}\n</info>\n\n<text>\n{text}\n</text>\n\n{self.info_judge_suffix}'}],
model=self.secondary_model, temperature=0.1)
self.logger.debug(f'judge llm output:\n{judge}')
if not judge:
self.logger.warning("failed to parse from llm output, skip checking")
self.logger.info(f"<info>\n{item['content']}\n</info>\n\n<text>\n{text}\n</text>")
self.logger.info(judge)
content = item['content']
url_tags = re.findall(r'\[url\d+]', content)
for url_tag in url_tags:
url_tag = url_tag[1:-1]
_url = link_dict.get(url_tag, '')
if _url:
content = content.replace(url_tag, _url)
final.append({'tag': self.focus_dict[item['focus']], 'content': f"{info_pre_fix}{content}"})
continue
url_tags = re.findall(r'\[(Ref_\d+)]', content)
references = {url_tag: text_links[url_tag] for url_tag in url_tags if url_tag in text_links}

to_save = False
for i in range(min(7, len(judge))):
char = judge[-1 - i]
if char == '是' or char == 'Y':
to_save = True
break
elif char == '否' or char == 'N':
break
if not to_save:
self.logger.warning("secondary model judge not faithful to article text, aborting")
self.logger.info(f"<info>\n{item['content']}\n</info>\n\n<text>\n{text}\n</text>")
self.logger.info(judge)
continue

content = item['content']
url_tags = re.findall(r'\[url\d+]', content)
for url_tag in url_tags:
url_tag = url_tag[1:-1]
_url = link_dict.get(url_tag, '')
if _url:
content = content.replace(url_tag, _url)
final.append({'tag': self.focus_dict[item['focus']], 'content': f"{info_pre_fix}{content}"})
final.append({'tag': self.focus_dict[item['focus']], 'content': f"{info_pre_fix}{content}", 'references': references})

return final
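A standalone illustration of the segs pair-walk that the new get_info uses above to turn //focus//-delimited blocks into (focus, summary) pairs; the sample block and focus_dict are assumed:

# Standalone illustration only; sample_block and focus_dict are assumed values.
focus_dict = {'Focus Point 1': 'tag_id_1', 'Focus Point 2': 'tag_id_2'}
sample_block = '//Focus Point 1//\nSummary for point 1\n//Focus Point 2//\nNA\n'

segs = sample_block.split('//')
pairs = []
i = 0
while i < len(segs) - 1:
    focus = segs[i].strip()
    if not focus or focus not in focus_dict:
        i += 1
        continue
    content = segs[i + 1].strip()
    i += 2
    if content and content != 'NA':
        pairs.append((focus, content))
print(pairs)  # [('Focus Point 1', 'Summary for point 1')]
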
async def __call__(self, link_dict: dict, text: str, base_url: str, author: str = None, publish_date: str = None) -> tuple[bool, list]:
is_list, need_more_info, link_dict, text = find_article_or_list(link_dict, text)
if is_list:
self.logger.info("may be a article list page, get more urls ...")
return True, await self.get_more_related_urls(link_dict, text)

if need_more_info:
self.logger.info("may be a article page need to get more text from images...")
text = await self._extract_info_from_img(text, link_dict)
self.logger.debug(f"extended text: \n{text}\n")

if not author and not publish_date and text:
author, publish_date = await self.get_author_and_publish_date(text)

if not author or author.lower() == 'na':
author = urlparse(base_url).netloc

if not publish_date or publish_date.lower() == 'na':
publish_date = datetime.now().strftime('%Y-%m-%d')

async def __call__(self, link_dict: dict, text: str, text_links: dict, author: str, publish_date: str) -> tuple[set, list]:
info_prefix = f"//{author} {publish_date}//"

return False, await self.get_info(link_dict, text, info_prefix)
return await self.get_more_related_urls(link_dict), await self.get_info(text, text_links, info_prefix)
@@ -1,4 +1,5 @@
from .general_scraper import general_scraper

# from .xxx import xx_scraper
# from .xxx import xx_config

custom_scrapers = {}
crawl4ai_custom_configs = {}
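For context, a hypothetical sketch of how an entry might be registered here; 'example.com' and example_scraper are placeholders, and the returned triple mirrors what general_process expects from the crawl4ai path (raw_markdown, metadata_dict, media_dict):

# Hypothetical sketch; 'example.com' and example_scraper are placeholders, not part of the repo.
# general_process looks both dicts up by domain (urlparse(url).netloc).

def example_scraper(url: str) -> tuple[str, dict, dict]:
    # a real scraper would fetch and convert the page here
    return "# sample markdown", {"title": "sample page"}, {"images": []}

custom_scrapers['example.com'] = example_scraper
# crawl4ai_custom_configs['example.com'] = CrawlerRunConfig(delay_before_return_html=5.0)  # per-domain override
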
@@ -1,14 +1,14 @@
# -*- coding: utf-8 -*-
from utils.pb_api import PbTalker
from utils.general_utils import get_logger
from agents.get_info import GeneralInfoExtractor
from bs4 import BeautifulSoup
from utils.general_utils import get_logger, extract_and_convert_dates
from utils.deep_scraper import *
from agents.get_info import *
import os
import json
import asyncio
from scrapers import *
from custom_fetchings import *
from urllib.parse import urlparse
from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavigationContext
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
from datetime import datetime, timedelta

@@ -16,14 +16,20 @@ project_dir = os.environ.get("PROJECT_DIR", "")
if project_dir:
os.makedirs(project_dir, exist_ok=True)

os.environ['CRAWLEE_STORAGE_DIR'] = os.path.join(project_dir, 'crawlee_storage')
screenshot_dir = os.path.join(project_dir, 'crawlee_storage', 'screenshots')
wiseflow_logger = get_logger('general_process', project_dir)
pb = PbTalker(wiseflow_logger)
gie = GeneralInfoExtractor(pb, wiseflow_logger)
one_month_ago = (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')
existing_urls = {url['url'] for url in pb.read(collection_name='infos', fields=['url'], filter=f"created>='{one_month_ago}'")}

llm_model = os.environ.get("PRIMARY_MODEL", "")
vl_model = os.environ.get("VL_MODEL", "")
if not vl_model:
wiseflow_logger.warning("VL_MODEL not set, will skip extracting info from img, some info may be lost!")

img_to_be_recognized_pattern = r'§to_be_recognized_by_visual_llm_(.*?)§'
recognized_img_cache = {}


async def save_to_pb(url: str, url_title: str, infos: list):
# saving to pb process
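A standalone check of the placeholder pattern above (the sample text is assumed). deep_scraper embeds src[1:] inside the marker, dropping the leading character of the image URL, which is why the captured value below starts with 'ttps'; the text branch in main_process later prepends 'h' again before recognition.

import re

# Standalone demo of the placeholder pattern above; the sample string is assumed.
sample = 'intro §to_be_recognized_by_visual_llm_ttps://example.com/pic.png§ outro'
print(re.findall(r'§to_be_recognized_by_visual_llm_(.*?)§', sample))  # ['ttps://example.com/pic.png']
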
@@ -37,82 +43,113 @@ async def save_to_pb(url: str, url_title: str, infos: list):
with open(os.path.join(project_dir, f'{timestamp}_cache_infos.json'), 'w', encoding='utf-8') as f:
json.dump(info, f, ensure_ascii=False, indent=4)

default_crawler_config = CrawlerRunConfig(
delay_before_return_html=2.0,
exclude_social_media_links=True,
magic=True,
scan_full_page=True,
remove_overlay_elements=True
)

crawler = PlaywrightCrawler(
# Limit the crawl to max requests. Remove or increase it for crawling all links.
# max_requests_per_crawl=1,
max_request_retries=1,
request_handler_timeout=timedelta(minutes=5),
)
async def main_process(sites: set):
working_list = set()
working_list.update(sites)
async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
while working_list:
url = working_list.pop()
existing_urls.add(url)
has_common_ext = any(url.lower().endswith(ext) for ext in common_file_exts)
if has_common_ext:
wiseflow_logger.info(f'{url} is a common file, skip')
continue

@crawler.pre_navigation_hook
async def log_navigation_url(context: PlaywrightPreNavigationContext) -> None:
context.log.info(f'Navigating to {context.request.url} ...')
parsed_url = urlparse(url)
domain = parsed_url.netloc
if domain in custom_scrapers:
wiseflow_logger.debug(f'{url} is a custom scraper, use custom scraper')
raw_markdown, metadata_dict, media_dict = custom_scrapers[domain](url)
else:
run_config = crawl4ai_custom_configs[domain] if domain in crawl4ai_custom_configs else default_crawler_config
crawl4ai_cache_mode = CacheMode.WRITE_ONLY if url in sites else CacheMode.ENABLED
result = await crawler.arun(url=url, crawler_config=run_config, cache_mode=crawl4ai_cache_mode)
raw_markdown = result.markdown_v2.raw_markdown
metadata_dict = result.metadata
media_dict = result.media

@crawler.router.default_handler
async def request_handler(context: PlaywrightCrawlingContext) -> None:
await context.page.wait_for_load_state('networkidle')
await context.page.wait_for_timeout(2000)
# Handle dialogs (alerts, confirms, prompts)
async def handle_dialog(dialog):
context.log.info(f'Closing dialog: {dialog.message}')
await dialog.accept()
context.page.on('dialog', handle_dialog)
web_title = metadata_dict.get('title', '')
base_url = metadata_dict.get('base_url', '')
if not base_url:
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
if not base_url.endswith('/'):
base_url = base_url.rsplit('/', 1)[0] + '/'

existing_urls.add(base_url)

context.log.info('successfully finish fetching')
wiseflow_logger.info(context.request.url)

html = await context.page.inner_html('head')
soup = BeautifulSoup(html, 'html.parser')
web_title = soup.find('title')
if web_title:
web_title = web_title.get_text().strip()
else:
web_title = ''
author = metadata_dict.get('author', '')
publish_date = extract_and_convert_dates(metadata_dict.get('publish_date', ''))
if not author or author.lower() == 'na' or not publish_date or publish_date.lower() == 'na':
author, publish_date = await get_author_and_publish_date(raw_markdown, llm_model)
wiseflow_logger.debug(f'get author and publish date by llm: {author}, {publish_date}')

parsed_url = urlparse(context.request.url)
domain = parsed_url.netloc
base_tag = soup.find('base', href=True)
if base_tag and base_tag.get('href'):
base_url = base_tag['href']
else:
# if there is no base tag, use the current page's URL path as the base url
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
if not base_url.endswith('/'):
# if the path does not end with /, drop the last path segment
base_url = base_url.rsplit('/', 1)[0] + '/'
if not author or author.lower() == 'na':
author = parsed_url.netloc

html = await context.page.inner_html('body')
if domain in custom_scrapers:
action_dict, link_dict, text = custom_scrapers[domain](html, base_url)
else:
action_dict, link_dict, text = general_scraper(html, base_url)
if not publish_date:
publish_date = datetime.now().strftime('%Y-%m-%d')

img_dict = media_dict.get('images', [])
if not img_dict or not isinstance(img_dict, list):
used_img = {}
else:
used_img = {d['src']: d['alt'] for d in img_dict}

link_dict, (text, reference_map) = deep_scraper(raw_markdown, base_url, used_img)
wiseflow_logger.debug(f'deep scraper get {len(link_dict)} links, {len(reference_map)} references for text')

is_list, results = await gie(link_dict, text, base_url)
to_be_replaces = {}
for u, des in link_dict.items():
matches = re.findall(img_to_be_recognized_pattern, des)
if matches:
for img_url in matches:
if img_url in recognized_img_cache:
link_dict[u] = des.replace(f'§to_be_recognized_by_visual_llm_{img_url}§', recognized_img_cache[img_url])
continue
link_dict[u] = des.replace(f'§to_be_recognized_by_visual_llm_{img_url}§', img_url)
if img_url in to_be_replaces:
to_be_replaces[img_url].append(u)
else:
to_be_replaces[img_url] = [u]
matches = re.findall(img_to_be_recognized_pattern, text)
if matches:
for img_url in matches:
if f'h{img_url}' in recognized_img_cache:
text = text.replace(f'§to_be_recognized_by_visual_llm_{img_url}§', recognized_img_cache[f'h{img_url}'])
continue
text = text.replace(f'§to_be_recognized_by_visual_llm_{img_url}§', f'h{img_url}')
img_url = f'h{img_url}'
if img_url in to_be_replaces:
to_be_replaces[img_url].append("content")
else:
to_be_replaces[img_url] = ["content"]
wiseflow_logger.debug(f'total {len(to_be_replaces)} images to be recognized')
recognized_result = await extract_info_from_img(list(to_be_replaces.keys()), vl_model)
recognized_img_cache.update(recognized_result)
for img_url, content in recognized_result.items():
for u in to_be_replaces[img_url]:
if u == "content":
text = text.replace(img_url, content)
else:
link_dict[u] = link_dict[u].replace(img_url, content)

more_urls, infos = await gie(link_dict, text, reference_map, author, publish_date)
wiseflow_logger.debug(f'get {len(more_urls)} more urls and {len(infos)} infos')
if more_urls:
working_list.update(more_urls - existing_urls)
if infos:
await save_to_pb(url, web_title, infos)

if is_list and results:
new_urls = [url for url in results if url != base_url and
url != context.request.url and
url not in existing_urls]
if new_urls:
await context.add_requests(new_urls)
existing_urls.update(new_urls)
return

if results:
await save_to_pb(context.request.url, web_title, results)

# todo: use llm to determine next action
"""
screenshot_file_name = f"{hashlib.sha256(context.request.url.encode()).hexdigest()}.png"
await context.page.screenshot(path=os.path.join(screenshot_dir, screenshot_file_name), full_page=True)
wiseflow_logger.debug(f'screenshot saved to {screenshot_file_name}')
"""

if __name__ == '__main__':
sites = pb.read('sites', filter='activated=True')
wiseflow_logger.info('execute all sites one time')
async def run_all_sites():
await crawler.run([site['url'].rstrip('/') for site in sites])

asyncio.run(run_all_sites())
asyncio.run(main_process(set([site['url'] for site in sites])))
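For reference, a condensed standalone sketch of the crawl4ai fetch path used in main_process above; it reuses the same constructor and arun arguments as the code above, but argument names may need adjusting to the installed crawl4ai version:

import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig

async def fetch_one(url: str):
    # same constructor/arun arguments as main_process above
    config = CrawlerRunConfig(delay_before_return_html=2.0, magic=True, remove_overlay_elements=True)
    async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
        result = await crawler.arun(url=url, crawler_config=config, cache_mode=CacheMode.BYPASS)
        return result.markdown_v2.raw_markdown, result.metadata, result.media

# asyncio.run(fetch_one('https://example.com'))
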
@@ -2,7 +2,7 @@ openai
loguru
pocketbase
pydantic
json_repair==0.*
#json_repair==0.*
beautifulsoup4
requests
crawl4ai==0.4.245
@@ -1,5 +1,5 @@
import asyncio
from general_process import crawler, pb, wiseflow_logger
from general_process import main_process, pb, wiseflow_logger

counter = 1

@@ -18,7 +18,7 @@ async def schedule_pipeline(interval):
todo_urls.add(site['url'].rstrip('/'))

counter += 1
await crawler.run(list(todo_urls))
await main_process(todo_urls)
wiseflow_logger.info(f'task execute loop finished, work after {interval} seconds')
await asyncio.sleep(interval)
@@ -5,9 +5,7 @@
# Currently this script only handles images and links, other elements like downloads and videos are not processed yet, todo: process according to media list
# action_dict needs to be extracted from raw html, which is not covered by this script

import os, re
import json
import time
import re
from urllib.parse import urlparse, urljoin

@@ -194,8 +192,9 @@ def deep_scraper(raw_markdown: str, base_url: str, used_img: dict[str, str]) ->
# handle image markers ![alt](src)
img_pattern = r'(!\[.*?\]\(.*?\))'
matches = re.findall(img_pattern, html_text)
text_link_map = {}
for match in matches:
src = re.search(r'!\[.*?\]\((.*?)\)', match).group(1)
src = re.search(r'!\[.*?]\((.*?)\)', match).group(1)
if src not in used_img:
html_text = html_text.replace(match, '')
continue
@@ -205,23 +204,26 @@ def deep_scraper(raw_markdown: str, base_url: str, used_img: dict[str, str]) ->
if not src or src.startswith('#'):
html_text = html_text.replace(match, alt)
continue

key = f"Ref_{len(text_link_map)+1}"
text_link_map[key] = src
src = normalize_url(src, base_url)

if not src:
html_text = html_text.replace(match, alt)
html_text = html_text.replace(match, f"{alt}[{key}]")
continue

if any(src.endswith(tld) or src.endswith(tld + '/') for tld in common_tlds):
html_text = html_text.replace(match, alt)
html_text = html_text.replace(match, f"{alt}[{key}]")
continue
if any(src.endswith(ext) for ext in common_file_exts if ext not in ['jpg', 'jpeg', 'png']):
html_text = html_text.replace(match, alt)
html_text = html_text.replace(match, f"{alt}[{key}]")
continue
html_text = html_text.replace(match, f" {alt}§to_be_recognized_by_visual_llm_{src[1:]}§") # to avoid conflict with the url pattern
html_text = html_text.replace(match, f" {alt}[{key}]§to_be_recognized_by_visual_llm_{src[1:]}§") # to avoid conflict with the url pattern

# next, handle all the []() link texts
link_pattern = r'\[(.*?)\]\((.*?)\)'
matches = re.findall(link_pattern, html_text)
text_link_map = {}
for match in matches:
link_text, link_url = match
original_markdown = f'[{link_text}]({link_url})' # rebuild the original markdown link format
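A standalone, simplified illustration of the new [Ref_N] image references built above (it skips normalize_url and the tld/extension filters); used_img and the sample markdown are assumed:

import re

# Simplified standalone demo; used_img and html_text are assumed sample data.
used_img = {'https://example.com/chart.png': 'sales chart'}
html_text = 'Quarterly results ![sales chart](https://example.com/chart.png) rose.'

text_link_map = {}
for match in re.findall(r'(!\[.*?\]\(.*?\))', html_text):
    src = re.search(r'!\[.*?]\((.*?)\)', match).group(1)
    if src not in used_img:
        html_text = html_text.replace(match, '')
        continue
    alt = used_img[src]
    key = f"Ref_{len(text_link_map)+1}"
    text_link_map[key] = src
    html_text = html_text.replace(match, f"{alt}[{key}]")

print(html_text)       # Quarterly results sales chart[Ref_1] rose.
print(text_link_map)   # {'Ref_1': 'https://example.com/chart.png'}
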
@@ -13,7 +13,6 @@ sites = [
save_dir = 'webpage_samples'

async def main(sites: list):
failed_record = []
config = CrawlerRunConfig(
delay_before_return_html=2.0,
exclude_social_media_links=True,
@@ -27,9 +26,6 @@ async def main(sites: list):
# exclude automated crawlers
# exclude pages that have already been crawled
result = await crawler.arun(url=site, crawler_config=config, cache_mode=CacheMode.BYPASS)
if not result.success:
failed_record.append(site)
continue

record_file = os.path.join(save_dir, f"{hashlib.sha256(site.encode()).hexdigest()[-6:]}.json")
with open(record_file, 'w', encoding='utf-8') as f:
@@ -3,7 +3,7 @@ from bs4 import BeautifulSoup
import os
import json
import asyncio
from urllib.parse import urlparse, urljoin
from urllib.parse import urlparse
import hashlib
from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavigationContext
from datetime import timedelta
@@ -6,7 +6,7 @@ current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir) # get parent dir
sys.path.append(project_root)

from core.utils.deep_scraper import deep_scraper, common_chars, common_file_exts, common_tlds
from core.utils.deep_scraper import deep_scraper, common_chars

def check_url_text(text):
print(f"processing: {text}")
@@ -1,11 +1,9 @@
# -*- coding: utf-8 -*-

import os, re, sys
import json
import asyncio
import time
from prompts import *
import json_repair

current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir) # get parent dir