diff --git a/README.md b/README.md index 25ad443..a36f7bc 100644 --- a/README.md +++ b/README.md @@ -115,6 +115,7 @@ siliconflow(硅基流动)提供大部分主流开源模型的在线 MaaS 服 export LLM_API_KEY=Your_API_KEY export LLM_API_BASE="https://api.siliconflow.cn/v1" export PRIMARY_MODEL="Qwen/Qwen2.5-32B-Instruct" +export SECONDARY_MODEL="Qwen/Qwen2.5-7B-Instruct" export VL_MODEL="OpenGVLab/InternVL2-26B" ``` @@ -129,6 +130,7 @@ export VL_MODEL="OpenGVLab/InternVL2-26B" export LLM_API_KEY=Your_API_KEY export LLM_API_BASE="https://aihubmix.com/v1" # 具体参考 https://doc.aihubmix.com/ export PRIMARY_MODEL="gpt-4o" +export SECONDARY_MODEL="gpt-4o-mini" export VL_MODEL="gpt-4o" ``` diff --git a/README_EN.md b/README_EN.md index 7195e92..6921d16 100644 --- a/README_EN.md +++ b/README_EN.md @@ -114,6 +114,7 @@ Siliconflow provides online MaaS services for most mainstream open-source models export LLM_API_KEY=Your_API_KEY export LLM_API_BASE="https://api.siliconflow.cn/v1" export PRIMARY_MODEL="Qwen/Qwen2.5-32B-Instruct" +export SECONDARY_MODEL="Qwen/Qwen2.5-7B-Instruct" export VL_MODEL="OpenGVLab/InternVL2-26B" ``` @@ -129,6 +130,7 @@ When using AiHubMix models, the .env configuration can refer to the following: export LLM_API_KEY=Your_API_KEY export LLM_API_BASE="https://aihubmix.com/v1" # refer to https://doc.aihubmix.com/ export PRIMARY_MODEL="gpt-4o" +export SECONDARY_MODEL="gpt-4o-mini" export VL_MODEL="gpt-4o" ``` diff --git a/README_JP.md b/README_JP.md index 7f932ea..432264f 100644 --- a/README_JP.md +++ b/README_JP.md @@ -114,6 +114,7 @@ Siliconflowは、主流のオープンソースモデルのほとんどにオン export LLM_API_KEY=Your_API_KEY export LLM_API_BASE="https://api.siliconflow.cn/v1" export PRIMARY_MODEL="Qwen/Qwen2.5-32B-Instruct" +export SECONDARY_MODEL="Qwen/Qwen2.5-7B-Instruct" export VL_MODEL="OpenGVLab/InternVL2-26B" ``` @@ -129,6 +130,7 @@ AiHubMixモデルを使用する場合、.envの設定は以下を参考にし export LLM_API_KEY=Your_API_KEY export LLM_API_BASE="https://aihubmix.com/v1" # referhttps://doc.aihubmix.com/ export PRIMARY_MODEL="gpt-4o" +export SECONDARY_MODEL="gpt-4o-mini" export VL_MODEL="gpt-4o" ``` 😄 [AiHubMixの紹介リンク](https://aihubmix.com?aff=Gp54)からご登録いただけますと幸いです 🌹 diff --git a/README_KR.md b/README_KR.md index 61f056e..a714e17 100644 --- a/README_KR.md +++ b/README_KR.md @@ -114,6 +114,7 @@ Siliconflow는 대부분의 주류 오픈소스 모델에 대한 온라인 MaaS export LLM_API_KEY=Your_API_KEY export LLM_API_BASE="https://api.siliconflow.cn/v1" export PRIMARY_MODEL="Qwen/Qwen2.5-32B-Instruct" +export SECONDARY_MODEL="Qwen/Qwen2.5-7B-Instruct" export VL_MODEL="OpenGVLab/InternVL2-26B" ``` @@ -129,6 +130,7 @@ AiHubMix 모델을 사용할 때 .env 구성은 다음을 참조할 수 있습 export LLM_API_KEY=Your_API_KEY export LLM_API_BASE="https://aihubmix.com/v1" # refer https://doc.aihubmix.com/ export PRIMARY_MODEL="gpt-4o" +export SECONDARY_MODEL="gpt-4o-mini" export VL_MODEL="gpt-4o" ``` diff --git a/core/scrapers/action_dict_scraper.py b/core/agents/action_dict_scraper.py similarity index 100% rename from core/scrapers/action_dict_scraper.py rename to core/agents/action_dict_scraper.py diff --git a/core/agents/get_info.py b/core/agents/get_info.py index 7d45585..9974749 100644 --- a/core/agents/get_info.py +++ b/core/agents/get_info.py @@ -1,15 +1,222 @@ # -*- coding: utf-8 -*- import asyncio - from loguru import logger import os, re -from utils.pb_api import PbTalker from llms.openai_wrapper import openai_llm as llm # from core.llms.siliconflow_wrapper import sfa_llm # or other llm wrapper -from utils.general_utils import is_chinese, extract_and_convert_dates +from utils.general_utils import is_chinese, 
extract_and_convert_dates, normalize_url +from .get_info_prompts import * -async def get_author_and_publish_date(text: str, model: str) -> tuple[str, str]: +common_file_exts = [ + 'jpg', 'jpeg', 'png', 'gif', 'pdf', 'doc', 'docx', 'svg', 'm3u8', + 'mp4', 'mp3', 'wav', 'avi', 'mov', 'wmv', 'flv', 'webp', 'webm', + 'zip', 'rar', '7z', 'tar', 'gz', 'bz2', + 'txt', 'csv', 'xls', 'xlsx', 'ppt', 'pptx', + 'json', 'xml', 'yaml', 'yml', 'css', 'js', 'php', 'asp', 'jsp' +] +common_tlds = [ + '.com', '.cn', '.net', '.org', '.edu', '.gov', '.io', '.co', + '.info', '.biz', '.me', '.tv', '.cc', '.xyz', '.app', '.dev', + '.cloud', '.ai', '.tech', '.online', '.store', '.shop', '.site', + '.top', '.vip', '.pro', '.ltd', '.group', '.team', '.work' +] + +async def pre_process(raw_markdown: str, base_url: str, used_img: list[str], + recognized_img_cache: dict, existing_urls: set = set(), + test_mode: bool = False) -> tuple[dict, list[str], list[str], dict]: + + link_dict = {} + + # for special url formate from crawl4ai 0.4.247 + raw_markdown = re.sub(r'', '', raw_markdown).strip() + + # 处理图片标记 ![alt](src) + i_pattern = r'(!\[(.*?)\]\((.*?)\))' + matches = re.findall(i_pattern, raw_markdown, re.DOTALL) + for _sec, alt, src in matches: + # 替换为新格式 §alt||src§ + raw_markdown = raw_markdown.replace(_sec, f'§{alt}||{src}§', 1) + + async def check_url_text(text) -> tuple[int, str]: + score = 0 + _valid_len = len(text.strip()) + # 找到所有[part0](part1)格式的片段 + link_pattern = r'(\[(.*?)\]\((.*?)\))' + matches = re.findall(link_pattern, text, re.DOTALL) + for _sec, link_text, link_url in matches: + # 处理 \"***\" 格式的片段 + quote_pattern = r'\"(.*?)\"' + # 提取所有引号包裹的内容 + _title = ''.join(re.findall(quote_pattern, link_url, re.DOTALL)) + _title = _title.strip() + link_text = link_text.strip() + if _title and _title not in link_text: + link_text = f"{_title} - {link_text}" + + real_url_pattern = r'<(.*?)>' + real_url = re.search(real_url_pattern, link_url, re.DOTALL) + if real_url: + _url = real_url.group(1).strip() + else: + _url = re.sub(quote_pattern, '', link_url, re.DOTALL).strip() + + if not _url or _url.startswith(('#', 'javascript:')): + text = text.replace(_sec, link_text, 1) + continue + score += 1 + _valid_len = _valid_len - len(_sec) + url = normalize_url(_url, base_url) + + # 分离§§内的内容和后面的内容 + img_marker_pattern = r'§(.*?)\|\|(.*?)§' + inner_matches = re.findall(img_marker_pattern, link_text, re.DOTALL) + for alt, src in inner_matches: + link_text = link_text.replace(f'§{alt}||{src}§', '') + + if not link_text and inner_matches: + img_alt = inner_matches[0][0].strip() + img_src = inner_matches[0][1].strip() + if img_src and not img_src.startswith('#'): + img_src = normalize_url(img_src, base_url) + if not img_src: + link_text = img_alt + elif len(img_alt) > 2 or url in existing_urls: + _key = f"[img{len(link_dict)+1}]" + link_dict[_key] = img_src + link_text = img_alt + _key + elif any(img_src.endswith(tld) or img_src.endswith(tld + '/') for tld in common_tlds): + _key = f"[img{len(link_dict)+1}]" + link_dict[_key] = img_src + link_text = img_alt + _key + elif any(img_src.endswith(ext) for ext in common_file_exts if ext not in ['jpg', 'jpeg', 'png']): + _key = f"[img{len(link_dict)+1}]" + link_dict[_key] = img_src + link_text = img_alt + _key + else: + if img_src not in recognized_img_cache: + recognized_img_cache[img_src] = await extract_info_from_img(img_src) + _key = f"[img{len(link_dict)+1}]" + link_dict[_key] = img_src + link_text = recognized_img_cache[img_src] + _key + else: + link_text = img_alt + + _key = 
f"[{len(link_dict)+1}]" + link_dict[_key] = url + text = text.replace(_sec, link_text + _key, 1) + + # 处理文本中的其他图片标记 + img_pattern = r'(§(.*?)\|\|(.*?)§)' + matches = re.findall(img_pattern, text, re.DOTALL) + remained_text = re.sub(img_pattern, '', text, re.DOTALL).strip() + remained_text_len = len(remained_text) + for _sec, alt, src in matches: + if not src or src.startswith('#') or src not in used_img: + text = text.replace(_sec, alt, 1) + continue + img_src = normalize_url(src, base_url) + if not img_src: + text = text.replace(_sec, alt, 1) + elif remained_text_len > 5 or len(alt) > 2: + _key = f"[img{len(link_dict)+1}]" + link_dict[_key] = img_src + text = text.replace(_sec, alt + _key, 1) + elif any(img_src.endswith(tld) or img_src.endswith(tld + '/') for tld in common_tlds): + _key = f"[img{len(link_dict)+1}]" + link_dict[_key] = img_src + text = text.replace(_sec, alt + _key, 1) + elif any(img_src.endswith(ext) for ext in common_file_exts if ext not in ['jpg', 'jpeg', 'png']): + _key = f"[img{len(link_dict)+1}]" + link_dict[_key] = img_src + text = text.replace(_sec, alt + _key, 1) + else: + if img_src not in recognized_img_cache: + recognized_img_cache[img_src] = await extract_info_from_img(img_src) + _key = f"[img{len(link_dict)+1}]" + link_dict[_key] = img_src + text = text.replace(_sec, recognized_img_cache[img_src] + _key, 1) + # 处理文本中的"野 url" + url_pattern = r'((?:https?://|www\.)[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[-A-Za-z0-9+&@#/%=~_|])' + matches = re.findall(url_pattern, text) + for url in matches: + url = normalize_url(url, base_url) + _key = f"[{len(link_dict)+1}]" + link_dict[_key] = url + text = text.replace(url, _key, 1) + score += 1 + _valid_len = _valid_len - len(url) + # 统计换行符数量 + newline_count = text.count(' * ') + score += newline_count + ratio = _valid_len/score if score != 0 else 999 + + return ratio, text + + sections = raw_markdown.split('# ') # use '# ' to avoid # in url + if len(sections) > 2: + _sec = sections[0] + section_remain = re.sub(r'\[.*?]\(.*?\)', '', _sec, re.DOTALL).strip() + section_remain_len = len(section_remain) + total_links = len(re.findall(r'\[.*?]\(.*?\)', _sec, re.DOTALL)) + ratio = total_links / section_remain_len if section_remain_len != 0 else 1 + if ratio > 0.05: + if test_mode: + print('this is a navigation section, will be removed') + print(ratio) + print(section_remain) + print('-' * 50) + sections = sections[1:] + _sec = sections[-1] + section_remain = re.sub(r'\[.*?]\(.*?\)', '', _sec, re.DOTALL).strip() + section_remain_len = len(section_remain) + if section_remain_len < 198: + if test_mode: + print('this is a footer section, will be removed') + print(section_remain_len) + print(section_remain) + print('-' * 50) + sections = sections[:-1] + + links_parts = [] + contents = [] + for section in sections: + ratio, text = await check_url_text(section) + if ratio < 70: + if test_mode: + print('this is a links part') + print(ratio) + print(text) + print('-' * 50) + links_parts.append(text) + else: + if test_mode: + print('this is a content part') + print(ratio) + print(text) + print('-' * 50) + contents.append(text) + return link_dict, links_parts, contents, recognized_img_cache + + +vl_model = os.environ.get("VL_MODEL", "") +if not vl_model: + print("VL_MODEL not set, will skip extracting info from img, some info may be lost!") + + +async def extract_info_from_img(url: str) -> str: + if not vl_model: + return '§to_be_recognized_by_visual_llm§' + + llm_output = await llm([{"role": "user", + "content": [{"type": "image_url", "image_url": 
{"url": url, "detail": "high"}}, + {"type": "text", "text": "提取图片中的所有文字,如果图片不包含文字或者文字很少或者你判断图片仅是网站logo、商标、图标等,则输出NA。注意请仅输出提取出的文字,不要输出别的任何内容。"}]}], + model=vl_model) + + return llm_output + + +async def get_author_and_publish_date(text: str, model: str, test_mode: bool = False, _logger: logger = None) -> tuple[str, str]: if not text: return "", "" @@ -19,245 +226,122 @@ async def get_author_and_publish_date(text: str, model: str) -> tuple[str, str]: if len(text) > 2048: text = f'{text[:2048]}......' - system_prompt = "As an information extraction assistant, your task is to accurately extract the source (or author) and publication date from the given webpage text. It is important to adhere to extracting the information directly from the original text. If the original text does not contain a particular piece of information, please replace it with NA" - suffix = '''Please output the extracted information in the following format(output only the result, no other content): -"""source or article author (use "NA" if this information cannot be extracted)//extracted publication date (keep only the year, month, and day; use "NA" if this information cannot be extracted)"""''' - - content = f'\n{text}\n\n\n{suffix}' - llm_output = await llm([{'role': 'system', 'content': system_prompt}, {'role': 'user', 'content': content}], - model=model, max_tokens=50, temperature=0.1) - + content = f'\n{text}\n\n\n{get_ap_suffix}' + llm_output = await llm([{'role': 'system', 'content': get_ap_system}, {'role': 'user', 'content': content}], + model=model, max_tokens=50, temperature=0.1) + if test_mode: + print(f"llm output:\n {llm_output}") ap_ = llm_output.strip().strip('"').strip('//') if '//' not in ap_: - print(f"failed to parse from llm output: {ap_}") + if _logger: + _logger.warning(f"failed to parse from llm output: {ap_}") return '', '' ap = ap_.split('//') - return ap[0], extract_and_convert_dates(ap[1]) -async def extract_info_from_img(task: list, vl_model: str) -> dict: - cache = {} - for url in task: - llm_output = await llm([{"role": "user", - "content": [{"type": "image_url", "image_url": {"url": url, "detail": "high"}}, - {"type": "text", "text": "提取图片中的所有文字,如果图片不包含文字或者文字很少或者你判断图片仅是网站logo、商标、图标等,则输出NA。注意请仅输出提取出的文字,不要输出别的任何内容。"}]}], - model=vl_model) - - cache[url] = llm_output - return cache - - -class GeneralInfoExtractor: - def __init__(self, pb: PbTalker, _logger: logger) -> None: - self.pb = pb - self.logger = _logger - self.model = os.environ.get("PRIMARY_MODEL", "") - - if not self.model: - self.logger.error("PRIMARY_MODEL not set, can't continue") - raise ValueError("PRIMARY_MODEL not set, please set it in environment variables or edit core/.env") - - # collect tags user set in pb database and determin the system prompt language based on tags - focus_data = pb.read(collection_name='focus_points', filter=f'activated=True') - if not focus_data: - self.logger.info('no activated tag found, will ask user to create one') - focus = input('It seems you have not set any focus point, WiseFlow need the specific focus point to guide the following info extract job.' - 'so please input one now. 
describe what info you care about shortly: ') - explanation = input('Please provide more explanation for the focus point (if not necessary, pls just type enter: ') - focus_data.append({"focuspoint": focus, "explanation": explanation, - "id": pb.add('focus_points', {"focuspoint": focus, "explanation": explanation})}) - - # self.focus_list = [item["focuspoint"] for item in focus_data] - self.focus_dict = {item["focuspoint"]: item["id"] for item in focus_data} - focus_statement = '' - for item in focus_data: - tag = item["focuspoint"] - expl = item["explanation"] - focus_statement = f"{focus_statement}//{tag}//\n" - if expl: - if is_chinese(expl): - focus_statement = f"{focus_statement}解释:{expl}\n" - else: - focus_statement = f"{focus_statement}Explanation: {expl}\n" - - if is_chinese(focus_statement): - self.get_info_prompt = f'''你将被给到一段使用标签包裹的网页文本,请分别按如下关注点对网页文本提炼摘要。关注点列表及其解释如下: - -{focus_statement}\n -在提炼摘要时,请遵循以下原则: -- 理解每个关注点的含义以及进一步的解释(如有),确保摘要与关注点强相关并符合解释(如有)的范围 -- 摘要应当详实、充分,使用简体中文(如果原文是英文,请翻译成简体中文) -- 摘要信息务必忠于原文''' - - self.get_info_suffix = '''请对关注点逐一生成摘要,不要遗漏任何关注点,如果网页文本与关注点无关,可以对应输出"NA"。输出结果整体用三引号包裹,三引号内不要有其他内容。如下是输出格式示例: -""" -//关注点1// -摘要1 -//关注点2// -摘要2 -//关注点3// -NA -... -"""''' - self.get_more_link_prompt = f'''你将被给到数行格式为"<编号>//内容//"的文本,你的任务是逐条分析这些文本,并分别与如下关注点之一相关联。关注点列表及其解释如下: - -{focus_statement}\n -在进行关联分析时,请遵循以下原则: - -- 理解每个关注点的含义 -- 如果关注点有进一步的解释,确保提取的内容符合这些解释的范围''' - - self.get_more_link_suffix = '''请分行逐条输出结果,每一条的输出格式为"<编号>//关注点名称//",如果某条内容不与任何关注点相关,请输出"<编号>//NA//"。输出结果整体用三引号包裹,三引号内不要有其他内容。如下是输出格式示例: -""" -//关注点1名称// -//关注点2名称// -//NA// -... -"""''' - - else: - self.get_info_prompt = f'''You will be given a webpage text wrapped in tags. Please extract summaries from the text according to the following focus points. The list of focus points and their explanations are as follows: - -{focus_statement}\n -When extracting summaries, please follow these principles: -- Understand the meaning of each focus point and its explanation (if any), ensure the summary strongly relates to the focus point and aligns with the explanation (if any) -- The summary should be detailed and comprehensive -- The summary should be faithful to the original text''' - - self.get_info_suffix = '''Please generate summaries for each focus point, don't miss any focus points. If the webpage text is not related to a focus point, output "NA" for that point. The entire output should be wrapped in triple quotes with no other content inside. Here is an example of the output format: -""" -//Focus Point 1// -Summary 1 -//Focus Point 2// -Summary 2 -//Focus Point 3// -NA -... -"""''' - - self.get_more_link_prompt = f'''You will be given several lines of text in the format "//content//". Your task is to analyze each line and associate it with one of the following focus points. The list of focus points and their explanations are as follows: - -{focus_statement}\n -When performing the association analysis, please follow these principles: - -- Understand the meaning of each focus point -- If a focus point has further explanation, ensure the extracted content aligns with the scope of these explanations''' - - self.get_more_link_suffix = '''Please output the results line by line. Each line should be in the format "//focus point name//". If a line is not related to any focus point, output "//NA//". The entire output should be wrapped in triple quotes with no other content inside. Here is an example of the output format: -""" -//Focus Point 1// -//Focus Point 2// -//NA// -... 
-"""''' - - async def _generate_results(self, lines: list, mode: str) -> set: - if mode == 'get_info': - system_prompt = self.get_info_prompt - suffix = self.get_info_suffix - batch_size = 5000 - elif mode == 'get_link': - system_prompt = self.get_more_link_prompt - suffix = self.get_more_link_suffix - batch_size = 2048 - else: - self.logger.error(f"unknown mode: {mode}") - return set() - - cache = set() - batches = [] - text_batch = '' - for line in lines: - text_batch += f'{line}\n' - if len(text_batch) > batch_size: - content = f'\n{text_batch}\n\n{suffix}' - batches.append({'system_prompt': system_prompt, 'content': content}) - text_batch = '' - - if text_batch: +async def get_more_related_urls(texts: list[str], link_dict: dict, prompts: list[str], test_mode: bool = False, + _logger: logger = None) -> set: + + sys_prompt, suffix, model = prompts + text_batch = '' + cache = set() + while texts: + t = texts.pop(0) + text_batch = f'{text_batch}{t}\n\n' + if len(text_batch) > 2048 or len(texts) == 0: content = f'\n{text_batch}\n\n{suffix}' - batches.append({'system_prompt': system_prompt, 'content': content}) + result = await llm( + [{'role': 'system', 'content': sys_prompt}, {'role': 'user', 'content': content}], + model=model, temperature=0.1) - self.logger.info(f"LLM tasks size: {len(batches)}") - tasks = [ - llm( - [{'role': 'system', 'content': batch['system_prompt']}, {'role': 'user', 'content': batch['content']}], - model=self.model, temperature=0.1 - ) - for batch in batches] - results = await asyncio.gather(*tasks) - for res in results: - if res: - extracted_result = re.findall(r'\"\"\"(.*?)\"\"\"', res, re.DOTALL) - if extracted_result: - cache.add(extracted_result[-1]) + result = re.findall(r'\"\"\"(.*?)\"\"\"', result, re.DOTALL) + if test_mode: + print(f"llm output:\n {result}") + if result: + links = re.findall(r'\[\d+\]', result[-1]) + for link in links: + if link not in text_batch: + if _logger: + _logger.warning(f"model generating hallucination:\n{result[-1]}") + if test_mode: + print(f"model hallucination:\n{result[-1]}") + continue + cache.add(link) + text_batch = '' - return cache + more_urls = set() + for mark in cache: + url = link_dict[mark] + has_common_ext = any(url.endswith(ext) for ext in common_file_exts) + has_common_tld = any(url.endswith(tld) or url.endswith(tld + '/') for tld in common_tlds) + if has_common_ext or has_common_tld: + continue + more_urls.add(url) + + return more_urls + - async def get_more_related_urls(self, link_dict: dict) -> set: - _to_be_processed = [] - link_map = {} - for i, (url, des) in enumerate(link_dict.items()): - des = des.replace('\n', ' ') - _to_be_processed.append(f'//{des}//') - link_map[f' list[dict]: - raw_result = await self._generate_results(_to_be_processed, 'get_link') - final_result = set() - for result in raw_result: - for item in result.split('\n'): - if not item: - continue - segs = item.split('>') - if len(segs) != 2: - self.logger.debug(f"bad generate result: {item}") - continue - _index, focus = segs - _index = _index.strip() - focus = focus.strip().strip('//') - if focus == 'NA': - continue - if focus not in self.focus_dict or _index not in link_map: - self.logger.debug(f"bad generate result: {item}") - continue - # self.logger.debug(f"{link_map[_index]} selected") - final_result.add(link_map[_index]) - return final_result + sys_prompt, suffix, model = prompts - async def get_info(self, text: str, text_links: dict, info_pre_fix: str) -> list[dict]: - raw_result = await self._generate_results(text.split('\n'), 
'get_info') - final = [] - for item in raw_result: - self.logger.debug(f"llm output:\n{item}") - segs = item.split('//') - i = 0 - while i < len(segs) - 1: - focus = segs[i].strip() - if not focus: - i += 1 - continue - if focus not in self.focus_dict: - self.logger.debug(f"bad generate result: {item}") - i += 1 - continue - content = segs[i+1].strip().strip('摘要').strip(':').strip(':') - i += 2 - if not content or content == 'NA': - continue - """ - maybe can use embedding retrieval to judge - """ + if test_mode: + info_pre_fix = '' + else: + info_pre_fix = f"//{author} {publish_date}//" - url_tags = re.findall(r'\[(Ref_\d+)]', content) - refences = {url_tag: text_links[url_tag] for url_tag in url_tags if url_tag in text_links} + cache = set() + batches = [] + text_batch = '' + while texts: + t = texts.pop(0) + text_batch = f'{text_batch}{t}# ' + if len(text_batch) > 9999 or len(texts) == 0: + content = f'\n{text_batch}\n\n{suffix}' + batches.append(content) + text_batch = '' - final.append({'tag': self.focus_dict[focus], 'content': f"{info_pre_fix}{content}", 'references': refences}) + tasks = [ + llm([{'role': 'system', 'content': sys_prompt}, {'role': 'user', 'content': content}], model=model, temperature=0.1) + for content in batches] + results = await asyncio.gather(*tasks) + + for res in results: + if test_mode: + print(f"llm output:\n {res}") + extracted_result = re.findall(r'\"\"\"(.*?)\"\"\"', res, re.DOTALL) + if extracted_result: + cache.add(extracted_result[-1]) + + final = [] + for item in cache: + segs = item.split('//') + i = 0 + while i < len(segs) - 1: + focus = segs[i].strip() + if not focus: + i += 1 + continue + if focus not in focus_dict: + if _logger: + _logger.info(f"llm hallucination: {item}") + if test_mode: + print(f"llm hallucination: {item}") + i += 1 + continue + content = segs[i+1].strip().strip('摘要').strip(':').strip(':') + i += 2 + if not content or content == 'NA': + continue + """ + maybe can use embedding retrieval to judge + """ + url_tags = re.findall(r'\[\d+\]', content) + refences = {url_tag: link_dict[url_tag] for url_tag in url_tags if url_tag in link_dict} + final.append({'tag': focus_dict[focus], 'content': f"{info_pre_fix}{content}", 'references': refences}) - return final - - async def __call__(self, link_dict: dict, text: str, text_links: dict, author: str, publish_date: str) -> tuple[set, list]: - info_prefix = f"//{author} {publish_date}//" - return await self.get_more_related_urls(link_dict), await self.get_info(text, text_links, info_prefix) + return final diff --git a/core/agents/get_info_prompts.py b/core/agents/get_info_prompts.py new file mode 100644 index 0000000..f4c917b --- /dev/null +++ b/core/agents/get_info_prompts.py @@ -0,0 +1,74 @@ + +get_link_system = '''你将被给到一段使用标签包裹的网页文本,你的任务是从前到后仔细阅读文本,提取出与如下任一关注点相关的原文片段。关注点及其解释如下: + +{focus_statement}\n +在进行提取时,请遵循以下原则: +- 理解关注点的含义以及进一步的解释(如有),确保提取的内容与关注点强相关并符合解释(如有)的范围 +- 在满足上面原则的前提下,提取出全部可能相关的片段 +- 提取出的原文片段务必保留类似"[3]"这样的引用标记,后续的处理需要用到这些引用标记''' + +get_link_suffix = '''请逐条输出提取的原文片段,并整体用三引号包裹。三引号内除了提取出的原文片段外不要有其他内容,如果文本中不包含任何与关注点相关的内容则保持三引号内为空。 +如下是输出格式示例:: +""" +原文片段1 +原文片段2 +... +"""''' + +get_link_system_en = '''You will be given a webpage text wrapped in tags. Your task is to carefully read the text from beginning to end, extracting fragments related to any of the following focus points. 
The focus points and their explanations are as follows: + +{focus_statement}\n +When extracting fragments, please follow these principles: +- Understand the meaning of each focus point and its explanation (if any), ensure the extracted content strongly relates to the focus point and aligns with the explanation (if any) +- Extract all possible related fragments +- Ensure the extracted fragments retain the reference markers like "[3]", as these will be used in subsequent processing''' + +get_link_suffix_en = '''Please output each extracted fragment one by one, and wrap the entire output in triple quotes. The triple quotes should contain only the extracted fragments, with no other content. If the text does not contain any content related to the focus points, keep the triple quotes empty. +Here is an example of the output format: +""" +Fragment 1 +Fragment 2 +... +"""''' + +get_info_system = '''你将被给到一段使用标签包裹的网页文本,请分别按如下关注点对网页文本提炼摘要。关注点列表及其解释如下: + +{focus_statement}\n +在提炼摘要时,请遵循以下原则: +- 理解每个关注点的含义以及进一步的解释(如有),确保摘要与关注点强相关并符合解释(如有)的范围 +- 摘要应当详实、充分,使用简体中文(如果原文是英文,请翻译成简体中文) +- 摘要信息务必忠于原文''' + +get_info_suffix = '''请对关注点逐一生成摘要,不要遗漏任何关注点,如果网页文本与关注点无关,可以对应输出"NA"。输出结果整体用三引号包裹,三引号内不要有其他内容。如下是输出格式示例: +""" +//关注点1// +摘要1 +//关注点2// +摘要2 +//关注点3// +NA +... +"""''' + +get_info_system_en = '''You will be given a webpage text wrapped in tags. Please extract summaries from the text according to the following focus points. The list of focus points and their explanations are as follows: + +{focus_statement}\n +When extracting summaries, please follow these principles: +- Understand the meaning of each focus point and its explanation (if any), ensure the summary strongly relates to the focus point and aligns with the explanation (if any) +- The summary should be detailed and comprehensive +- The summary should be faithful to the original text''' + +get_info_suffix_en = '''Please generate summaries for each focus point, don't miss any focus points. If the webpage text is not related to a focus point, output "NA" for that point. The entire output should be wrapped in triple quotes with no other content inside. Here is an example of the output format: +""" +//Focus Point 1// +Summary 1 +//Focus Point 2// +Summary 2 +//Focus Point 3// +NA +... +"""''' + +get_ap_system = "As an information extraction assistant, your task is to accurately extract the source (or author) and publication date from the given webpage text. It is important to adhere to extracting the information directly from the original text. 
If the original text does not contain a particular piece of information, please replace it with NA" +get_ap_suffix = '''Please output the extracted information in the following format(output only the result, no other content): +"""source or article author (use "NA" if this information cannot be extracted)//extracted publication date (keep only the year, month, and day; use "NA" if this information cannot be extracted)"""''' diff --git a/core/general_process.py b/core/general_process.py index 372b0c1..58f0957 100644 --- a/core/general_process.py +++ b/core/general_process.py @@ -1,11 +1,10 @@ # -*- coding: utf-8 -*- from utils.pb_api import PbTalker -from utils.general_utils import get_logger, extract_and_convert_dates -from utils.deep_scraper import * +from utils.general_utils import get_logger, extract_and_convert_dates, is_chinese from agents.get_info import * import json import asyncio -from custom_fetchings import * +from scrapers import * from urllib.parse import urlparse from crawl4ai import AsyncWebCrawler, CacheMode from datetime import datetime, timedelta @@ -19,18 +18,14 @@ if project_dir: wiseflow_logger = get_logger('general_process', project_dir) pb = PbTalker(wiseflow_logger) -gie = GeneralInfoExtractor(pb, wiseflow_logger) one_month_ago = (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d') existing_urls = {url['url'] for url in pb.read(collection_name='infos', fields=['url'], filter=f"created>='{one_month_ago}'")} -llm_model = os.environ.get("PRIMARY_MODEL", "") -vl_model = os.environ.get("VL_MODEL", "") -if not vl_model: - wiseflow_logger.warning("VL_MODEL not set, will skip extracting info from img, some info may be lost!") - -img_to_be_recognized_pattern = r'§to_be_recognized_by_visual_llm_(.*?)§' -recognized_img_cache = {} - +crawler = AsyncWebCrawler(verbose=False) +model = os.environ.get("PRIMARY_MODEL", "") +if not model: + raise ValueError("PRIMARY_MODEL not set, please set it in environment variables or edit core/.env") +secondary_model = os.environ.get("SECONDARY_MODEL", model) async def save_to_pb(url: str, url_title: str, infos: list): # saving to pb process @@ -46,112 +41,142 @@ async def save_to_pb(url: str, url_title: str, infos: list): async def main_process(_sites: set | list): + # collect tags user set in pb database and determin the system prompt language based on tags + focus_data = pb.read(collection_name='focus_points', filter=f'activated=True') + if not focus_data: + wiseflow_logger.info('no activated tag found, will ask user to create one') + focus = input('It seems you have not set any focus point, WiseFlow need the specific focus point to guide the following info extract job.' + 'so please input one now. 
describe what info you care about shortly: ') + explanation = input('Please provide more explanation for the focus point (if not necessary, pls just press enter: ') + focus_data.append({"focuspoint": focus, "explanation": explanation, + "id": pb.add('focus_points', {"focuspoint": focus, "explanation": explanation})}) + + + focus_dict = {item["focuspoint"]: item["id"] for item in focus_data} + focus_statement = '' + for item in focus_data: + tag = item["focuspoint"] + expl = item["explanation"] + focus_statement = f"{focus_statement}//{tag}//\n" + if expl: + if is_chinese(expl): + focus_statement = f"{focus_statement}解释:{expl}\n" + else: + focus_statement = f"{focus_statement}Explanation: {expl}\n" + + date_stamp = datetime.now().strftime('%Y-%m-%d') + if is_chinese(focus_statement): + get_link_sys_prompt = get_link_system.replace('{focus_statement}', focus_statement) + get_link_sys_prompt = f"今天的日期是{date_stamp},{get_link_sys_prompt}" + get_link_suffix_prompt = get_link_suffix + get_info_sys_prompt = get_info_system.replace('{focus_statement}', focus_statement) + get_info_sys_prompt = f"今天的日期是{date_stamp},{get_info_sys_prompt}" + get_info_suffix_prompt = get_info_suffix + else: + get_link_sys_prompt = get_link_system_en.replace('{focus_statement}', focus_statement) + get_link_sys_prompt = f"today is {date_stamp}, {get_link_sys_prompt}" + get_link_suffix_prompt = get_link_suffix_en + get_info_sys_prompt = get_info_system_en.replace('{focus_statement}', focus_statement) + get_info_sys_prompt = f"today is {date_stamp}, {get_info_sys_prompt}" + get_info_suffix_prompt = get_info_suffix_en + + recognized_img_cache = {} working_list = set() working_list.update(_sites) - async with AsyncWebCrawler(headless=True, verbose=False) as crawler: - while working_list: - url = working_list.pop() - existing_urls.add(url) - has_common_ext = any(url.lower().endswith(ext) for ext in common_file_exts) - if has_common_ext: - wiseflow_logger.info(f'{url} is a common file, skip') - continue + await crawler.start() + while working_list: + url = working_list.pop() + existing_urls.add(url) + has_common_ext = any(url.lower().endswith(ext) for ext in common_file_exts) + if has_common_ext: + wiseflow_logger.info(f'{url} is a common file, skip') + continue - parsed_url = urlparse(url) - existing_urls.add(f"{parsed_url.scheme}://{parsed_url.netloc}") - existing_urls.add(f"{parsed_url.scheme}://{parsed_url.netloc}/") - domain = parsed_url.netloc - if domain in custom_scrapers: - wiseflow_logger.debug(f'{url} is a custom scraper, use custom scraper') - raw_markdown, metadata_dict, media_dict = custom_scrapers[domain](url) - else: - crawl4ai_cache_mode = CacheMode.WRITE_ONLY if url in _sites else CacheMode.ENABLED - result = await crawler.arun(url=url, delay_before_return_html=2.0, wait_until='commit', - magic=True, scan_full_page=True, - cache_mode=crawl4ai_cache_mode) - if not result.success: - wiseflow_logger.warning(f'{url} failed to crawl, destination web cannot reach, skip') - continue + parsed_url = urlparse(url) + existing_urls.add(f"{parsed_url.scheme}://{parsed_url.netloc}") + existing_urls.add(f"{parsed_url.scheme}://{parsed_url.netloc}/") + domain = parsed_url.netloc + if domain in custom_fetching_configs: + wiseflow_logger.debug(f'{url} will using custom crawl4ai run config') + run_config = custom_fetching_configs[domain] + else: + run_config = crawler_config + + run_config.cache_mode = CacheMode.WRITE_ONLY if url in _sites else CacheMode.ENABLED + result = await crawler.arun(url=url, config=run_config) + if not 
result.success: + wiseflow_logger.warning(f'{url} failed to crawl, destination web cannot reach, skip') + continue + metadata_dict = result.metadata if result.metadata else {} - raw_markdown = result.markdown - if not raw_markdown: - wiseflow_logger.warning(f'{url} no content, something during fetching failed, skip') - continue - metadata_dict = result.metadata if result.metadata else {} - media_dict = result.media if result.media else {} + if domain in custom_scrapers: + result = custom_scrapers[domain](result) + raw_markdown = result.content + used_img = result.images + title = result.title + base_url = result.base + author = result.author + publish_date = result.publish_date + else: + raw_markdown = result.markdown + media_dict = result.media if result.media else {} + used_img = [d['src'] for d in media_dict.get('images', [])] + title = '' + base_url = '' + author = '' + publish_date = '' - web_title = metadata_dict.get('title', '') + if not raw_markdown: + wiseflow_logger.warning(f'{url} no content, something during fetching failed, skip') + continue + + if not title: + title = metadata_dict.get('title', '') + if not base_url: base_url = metadata_dict.get('base', '') - if not base_url: - base_url = url - + if not base_url: + base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}" + if not base_url.endswith('/'): + # 如果路径不以 / 结尾,则去掉最后一个路径段 + base_url = base_url.rsplit('/', 1)[0] + '/' + + if not author: author = metadata_dict.get('author', '') - publish_date = extract_and_convert_dates(metadata_dict.get('publish_date', '')) + if not publish_date: + publish_date = metadata_dict.get('publish_date', '') - img_dict = media_dict.get('images', []) - if not img_dict or not isinstance(img_dict, list): - used_img = [] - else: - used_img = [d['src'] for d in img_dict] + link_dict, links_parts, contents, recognized_img_cache = await pre_process(raw_markdown, base_url, used_img, recognized_img_cache, existing_urls) - link_dict, (text, reference_map) = deep_scraper(raw_markdown, base_url, used_img) - _duplicate_url = set(link_dict.keys()) & existing_urls - for _d in _duplicate_url: - del link_dict[_d] + if link_dict and links_parts: + prompts = [get_link_sys_prompt, get_link_suffix_prompt, secondary_model] + links_texts = [] + for _parts in links_parts: + links_texts.extend(_parts.split('\n\n')) + more_url = await get_more_related_urls(links_texts, link_dict, prompts, _logger=wiseflow_logger) + if more_url: + working_list.update(more_url - existing_urls) + + if not contents: + continue - to_be_replaces = {} - for u, des in link_dict.items(): - matches = re.findall(img_to_be_recognized_pattern, des) - if matches: - for img_url in matches: - if img_url in recognized_img_cache: - link_dict[u] = des.replace(f'§to_be_recognized_by_visual_llm_{img_url}§', recognized_img_cache[img_url]) - continue - link_dict[u] = des.replace(f'§to_be_recognized_by_visual_llm_{img_url}§', img_url) - if img_url in to_be_replaces: - to_be_replaces[img_url].append(u) - else: - to_be_replaces[img_url] = [u] - matches = re.findall(img_to_be_recognized_pattern, text) - if matches: - for img_url in matches: - if f'h{img_url}' in recognized_img_cache: - text = text.replace(f'§to_be_recognized_by_visual_llm_{img_url}§', recognized_img_cache[f'h{img_url}']) - continue - text = text.replace(f'§to_be_recognized_by_visual_llm_{img_url}§', f'h{img_url}') - img_url = f'h{img_url}' - if img_url in to_be_replaces: - to_be_replaces[img_url].append("content") - else: - to_be_replaces[img_url] = ["content"] + if not author or 
author.lower() == 'na' or not publish_date or publish_date.lower() == 'na': + author, publish_date = await get_author_and_publish_date(raw_markdown, model, _logger=wiseflow_logger) - recognized_result = await extract_info_from_img(list(to_be_replaces.keys()), vl_model) - wiseflow_logger.debug(f'total {len(recognized_result)} imgs be recognized') - recognized_img_cache.update({key: value for key, value in recognized_result.items() if value.strip()}) - for img_url, content in recognized_result.items(): - for u in to_be_replaces[img_url]: - if u == "content": - text = text.replace(img_url, content) - else: - link_dict[u] = link_dict[u].replace(img_url, content) - - if not author or author.lower() == 'na' or not publish_date or publish_date.lower() == 'na': - author, publish_date = await get_author_and_publish_date(text, llm_model) - wiseflow_logger.debug(f'get author and publish date by llm: {author}, {publish_date}') - if not author or author.lower() == 'na': - author = parsed_url.netloc - if not publish_date: - publish_date = datetime.now().strftime('%Y-%m-%d') - - more_urls, infos = await gie(link_dict, text, reference_map, author, publish_date) - wiseflow_logger.debug(f'get {len(more_urls)} more urls and {len(infos)} infos') - if more_urls: - working_list.update(more_urls - existing_urls) - if infos: - await save_to_pb(url, web_title, infos) + if not author or author.lower() == 'na': + author = parsed_url.netloc + + if publish_date: + publish_date = extract_and_convert_dates(publish_date) + else: + publish_date = date_stamp + prompts = [get_info_sys_prompt, get_info_suffix_prompt, model] + infos = await get_info(contents, link_dict, prompts, focus_dict, author, publish_date, _logger=wiseflow_logger) + if infos: + await save_to_pb(url, title, infos) + await crawler.close() if __name__ == '__main__': sites = pb.read('sites', filter='activated=True') wiseflow_logger.info('execute all sites one time') - asyncio.run(main_process([site['url'] for site in sites])) + asyncio.run(main_process([site['url'].rstrip('/') for site in sites])) diff --git a/core/llms/openai_wrapper.py b/core/llms/openai_wrapper.py index 4b40cf5..75dd403 100644 --- a/core/llms/openai_wrapper.py +++ b/core/llms/openai_wrapper.py @@ -1,6 +1,6 @@ import os from openai import AsyncOpenAI as OpenAI -from openai import RateLimitError +# from openai import RateLimitError import asyncio base_url = os.environ.get('LLM_API_BASE', "") @@ -30,7 +30,7 @@ async def openai_llm(messages: list, model: str, logger=None, **kwargs) -> str: try: response = await client.chat.completions.create(messages=messages, model=model, **kwargs) resp = response.choices[0].message.content - except RateLimitError as e: + except Exception as e: if logger: logger.warning(f'{e}\nRetrying in 60 second...') else: @@ -44,13 +44,6 @@ async def openai_llm(messages: list, model: str, logger=None, **kwargs) -> str: logger.error(f'after many try, llm error: {response}') else: print(f'after many try, llm error: {response}') - - except Exception as e: - if logger: - logger.error(f'openai_llm error: {e}') - else: - print(f'openai_llm error: {e}') - finally: semaphore.release() diff --git a/core/scrapers/README.md b/core/scrapers/README.md index 090aa76..94d7d1c 100644 --- a/core/scrapers/README.md +++ b/core/scrapers/README.md @@ -1,10 +1,10 @@ -## 配置自定义 Crawl4ai 抓取 config +# 配置自定义 Crawl4ai 抓取 config 如果信源需要对应特殊的抓取配置,可以在 `core/scrapers/__init__.py` 中编辑对应的 crawler_config,并在 `custom_fetching_configs` 中注册。 -## 解析器(Scraper) +# 解析器(Scraper) -对于从网页内容中提取关注信息这一任务而言,直接把 html 编码送给 
llm 并不是一个好主意。在该类型任务中,我们期待 llm 表现的类似人类,侧重点在于内容的理解,而不是 html 的解析。且不说直接送入 html 编码还会造成额外(非常大量)的 token 消耗和处理效率的降低。 +对于从网页内容中提取关注信息这一任务而言,直接把 html 编码送给 llm 并不是一个好主意,这会极大的增加提取任务的复杂度,引入更多干扰,并且产生额外(非常大量)的 token 消耗和处理效率的降低。 将 html 转为易于意思理解的 markdown 是目前领域内比较通用的做法,这方面 Crawl4ai 提供了比较成熟的解决方案。 @@ -12,50 +12,24 @@ 简单的说,解析器的作用就是将 html 编码转为 markdown 文本,并在这个过程中尽量过滤不必要信息(因为后一步是通过 llm 进行提炼,所以这一步要求不高),但也尽可能的保留 html 版面布局信息(这很重要)。 -### deep_scraper +你并不需要通过解析器完成最终的信息提取,这个工作最终还是会使用 llm 完成——甚至在这之前我们还有一个被称为pre-process的步骤,它的主要功能是将待处理的文章 markdown 合理切块并将 url 和图片等进行合理的转化,事实上,这个模块是本项目的一大创新点——解析器只需要提供适合 pre-process 的 markdown(我们称为 raw_markdown)和有价值的图片列表即可。 -我们进一步发现,直接将 markdown 全文送入 llm 解析也存在缺陷。 +## 自定义解析器 -我在这里仅举一个例子: +scraper 输入的 fetch_result 为一个 dict 或者是 crawl4ai 的 CrawlResult 对象,它包含如下字段: -*很多网站喜欢在文章页面底部或者侧边栏加入推荐阅读板块,如果说这些推荐阅读只是链接列表还好,但事实上,很多时候他们还包括内容简介,这些简介的长度并不短,甚至有可能跟页面主体正文长度相当。这个时候如果我们将 markdown 整体扔给 llm,就会发现很难为llm 指定明确的工作策略——如果直接舍弃这些推荐阅读内容(先不说很难指定清晰的舍弃策略),但我们不能保证这里面不包含关注点内容;而如果保留这些内容,那么很可能 llm 就无法聚焦该页面的核心内容。或者 llm 会从这些简介中进行信息提取,但是这些简介对应额外的链接,这些后续的链接也会在后面进行爬取,这就可能带来提取出大量重复信息的情况。* +- url: str, 网页的 url +- html: str, 网页的 html 编码 +- cleaned_html: str, 经过清洗的 html 编码 +- markdown: str, 经过清洗的 markdown 编码 +- media: dict, 包含图片、视频、音频等媒体信息 +- metadata: dict, 包含网页的元数据,如标题、作者、发布时间等 -事实上,这里我们需要做的工作是分块,这有点类似 RAG 系统中的 chunk ,但不同的是,这里我们不需要考虑 chunk 的粒度,而是需要考虑页面布局的粒度。因为我们面对的是 html 页面,而不是 pdf、word…… +scraper 的输出为 ScraperResultData,具体见 `core/scrapers/scraper_data.py`。 -这一点很重要,我们需要按 html 的页面布局进行分块,而不是按语义逻辑分块!因为这影响了后续我们如何判断对不同的块采用合同提取策略。这也就是 wiseflow 为何不使用已有的文档智能工具,而是自写了 deep_scraper 的原因。 +## 注册自定义解析器 -当然,另一个选择是直接使用视觉大模型进行 layout 的识别,但实践中我们也发现,这需要能够获取不受干扰的网页截图,但这个操作会极大增加系统复杂度以及降低处理速度,且效果并不稳定(比如对于页面弹窗的处理……)。 - -另一个不使用文档智能和视觉大模型的原因,是因为相比于 pdf、word 这种完全的非结构数据, html 编码本身就已经包含了全部 layout 信息,转化为 markdown 的过程实际上也保留了这些信息(通过\n # 这些符号),所以直接通过一定的规则对 markdown 进行分块并分别处理是可行的。 - -这就是 wiseflow deep_scraper 的主要功能,归纳起来:1、按布局信息对markdown进行分块;2、分析每个块的类型,并按不同策略进行预处理,便于最终 llm 的提取。 - -### 注册自定义解析器 - -wiseflow 的默认工作流程是: - -*crawl4ai 获取 html,并初步转化为raw_markdown(此过程应用默认的 config) --> deep_scraper 进行分块处理 --> 分块后的内容 送入 llm 进行信息提取。* - -如前所言,如果需要为特定信源配置特殊的 crawl4ai 获取策略(包括 raw_markdown 的转化策略),可以在 `core/scrapers/__init__.py` 中注册自定义的crawler_config; - -同时也可以为特定信源配置自定义的 scraper,自定义 scraper 的输入为crawl4ai的fetching_result,输出为将要被送入 llm 进行分析的链接字典和文本块列表。使用自定义 scraper 时,wiseflow 的处理流程为: - -*crawl4ai 获取 html,并初步转化为raw_markdown(此过程应用默认的 config或指定 config) --> 自定义 scraper 进行分块处理 --> 分块后的内容 送入 llm 进行信息提取。* - -自定义 scraper 可以内部调用deep_scraper作为后处理流程(如mp_scraper),也可以完全自定义全部流程。 - -scraper 输入的 fetch_result 为一个 dict,格式如下: - - - - - - - - -输出为 ScraperResultData,包含 url、content、links、images 四个字段。 - -在 `core/scrapers/__init__.py` 中注册,参考: +编写好 scraper 后,在 `core/scrapers/__init__.py` 中注册,参考: ```python from .mp import mp_scarper diff --git a/core/scrapers/README_EN.md b/core/scrapers/README_EN.md index e2c6a87..d7160ac 100644 --- a/core/scrapers/README_EN.md +++ b/core/scrapers/README_EN.md @@ -1,15 +1,43 @@ -## Custom Scraper Registration +# Configure Custom Crawl4ai Fetching Config -Register in `core/scrapers/__init__.py`, for example: +If a source requires special fetching configuration, you can edit the corresponding crawler_config in `core/scrapers/__init__.py` and register it in `custom_fetching_configs`. + +# Scraper + +For the task of extracting focused information from web content, directly feeding HTML code to LLM is not a good idea. 
This would greatly increase the complexity of extraction, introduce more interference, and result in additional (very large) token consumption and reduced processing efficiency. + +Converting HTML to markdown that is easy to understand semantically is a common practice in the field, and Crawl4ai provides a relatively mature solution for this. + +However, this refers to general cases. There is no one-size-fits-all solution. For certain specific sources, Crawl4ai's default parser may not work well, such as WeChat public account articles. In these cases, we need to customize scrapers for the sources. + +Simply put, the scraper's role is to convert HTML code to markdown text, filtering out unnecessary information during this process (since the next step is refinement through LLM, requirements here are not high), while preserving HTML layout information as much as possible (this is important). + +You don't need to complete the final information extraction through the scraper. This work will ultimately be done using LLM - in fact, before that we have a step called pre-process, whose main function is to reasonably segment the article markdown and properly transform URLs and images. In fact, this module is a major innovation point of this project - the scraper only needs to provide raw_markdown suitable for pre-process and a list of valuable images. + +## Custom Scraper + +The fetch_result input to the scraper is either a dict or a Crawl4ai CrawlResult object containing the following fields: + +- url: str, the webpage URL +- html: str, the webpage HTML code +- cleaned_html: str, cleaned HTML code +- markdown: str, cleaned markdown code +- media: dict, contains media information like images, videos, audio etc. +- metadata: dict, contains webpage metadata like title, author, publish time etc. + +The scraper output is ScraperResultData, see details in `core/scrapers/scraper_data.py`. 
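+Below is a minimal illustrative sketch of a custom scraper. The keyword arguments passed to `ScraperResultData` are only inferred from how `core/general_process.py` reads a custom scraper's result (`content`, `images`, `title`, `base`, `author`, `publish_date`); check `core/scrapers/scraper_data.py` for the actual definition, and treat the import path and field names below as assumptions.
+
+```python
+# Illustrative sketch only -- the import path and ScraperResultData fields are
+# assumptions; see core/scrapers/scraper_data.py for the real definition.
+from .scraper_data import ScraperResultData
+
+def example_scraper(fetch_result) -> ScraperResultData:
+    # fetch_result may be a dict or a crawl4ai CrawlResult object
+    get = (fetch_result.get if isinstance(fetch_result, dict)
+           else lambda key, default=None: getattr(fetch_result, key, default))
+    metadata = get('metadata') or {}
+    media = get('media') or {}
+    # a real scraper would do site-specific cleanup here, or rebuild the
+    # markdown from `html`; this sketch passes the crawl4ai markdown through
+    raw_markdown = get('markdown') or ''
+    images = [img.get('src') for img in media.get('images', []) if img.get('src')]
+    return ScraperResultData(
+        content=raw_markdown,          # raw_markdown later consumed by pre_process
+        images=images,                 # list of valuable image urls
+        title=metadata.get('title', ''),
+        base=get('url') or '',
+        author=metadata.get('author', ''),
+        publish_date=metadata.get('publish_date', ''),
+    )
+```
+
+In this sketch the crawl4ai markdown is used as-is; the point of a custom scraper is only to hand pre-process a cleaner `raw_markdown` and image list than the default conversion produces for that particular source.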
+ +## Register Custom Scraper + +After writing the scraper, register it in `core/scrapers/__init__.py`, for example: ```python -from .mp import mp_scarper +from .mp import mp_scraper -customer_scrapers = {'mp.weixin.qq.com': mp_scarper} +custom_scrapers = {'mp.weixin.qq.com': mp_scraper} ``` -Note that the key should use the domain name, which can be obtained using `urllib.parse`: - +Note that the key uses the domain name, which can be obtained using `urllib.parse`: ```python from urllib.parse import urlparse diff --git a/core/scrapers/__init__.py b/core/scrapers/__init__.py index d0a2e99..e1f2aeb 100644 --- a/core/scrapers/__init__.py +++ b/core/scrapers/__init__.py @@ -1,7 +1,8 @@ from crawl4ai import CrawlerRunConfig from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator +from .mp_scraper import mp_scraper -custom_scrapers = {} +custom_scrapers = {'mp.weixin.qq.com': mp_scraper} custom_fetching_configs = {} md_generator = DefaultMarkdownGenerator( diff --git a/core/scrapers/deep_scraper.py b/core/scrapers/deep_scraper.py deleted file mode 100644 index a55bac7..0000000 --- a/core/scrapers/deep_scraper.py +++ /dev/null @@ -1,225 +0,0 @@ -# -*- coding: utf-8 -*- - -# This program requires HTML to be first converted to properly formatted text while preserving link positions and structural information (like crawl4ai's html2text work); -# The complete media list from the webpage needs to be extracted beforehand -# Currently this script only handles images and links, other elements like downloads and videos are not processed yet, todo: process according to media list -# action_dict needs to be extracted from raw html, which is not covered by this script - -import re -from urllib.parse import urljoin - - -common_file_exts = [ - 'jpg', 'jpeg', 'png', 'gif', 'pdf', 'doc', 'docx', 'svg', 'm3u8', - 'mp4', 'mp3', 'wav', 'avi', 'mov', 'wmv', 'flv', 'webp', 'webm', - 'zip', 'rar', '7z', 'tar', 'gz', 'bz2', - 'txt', 'csv', 'xls', 'xlsx', 'ppt', 'pptx', - 'json', 'xml', 'yaml', 'yml', 'css', 'js', 'php', 'asp', 'jsp' -] -common_tlds = [ - '.com', '.cn', '.net', '.org', '.edu', '.gov', '.io', '.co', - '.info', '.biz', '.me', '.tv', '.cc', '.xyz', '.app', '.dev', - '.cloud', '.ai', '.tech', '.online', '.store', '.shop', '.site', - '.top', '.vip', '.pro', '.ltd', '.group', '.team', '.work' -] - -common_chars = ',.!;:,;:、一二三四五六七八九十#*@% \t\n\r|*-_…>#' - -def normalize_url(url: str, base_url: str) -> str: - url = url.strip() - if url.startswith(('www.', 'WWW.')): - _url = f"https://{url}" - elif url.startswith('/www.'): - _url = f"https:/{url}" - elif url.startswith("//"): - _url = f"https:{url}" - elif url.startswith(('http://', 'https://')): - _url = url - elif url.startswith('http:/'): - _url = f"http://{url[6:]}" - elif url.startswith('https:/'): - _url = f"https://{url[7:]}" - else: - _url = urljoin(base_url, url) - - _ss = _url.split('//') - if len(_ss) == 2: - return '//'.join(_ss) - else: - return _ss[0] + '//' + '/'.join(_ss[1:]) - - -def deep_scraper(raw_markdown: str, base_url: str, used_img: list[str]) -> tuple[dict, list[str], list[str]]: - link_dict = {} - to_be_recognized_by_visual_llm = {} - # for special url formate from crawl4ai 0.4.247 - raw_markdown = re.sub(r'', '', raw_markdown).strip() - - # 处理图片标记 ![alt](src) - i_pattern = r'(!\[(.*?)\]\((.*?)\))' - matches = re.findall(i_pattern, raw_markdown, re.DOTALL) - for _sec, alt, src in matches: - # 替换为新格式 §alt||src§ - raw_markdown = raw_markdown.replace(_sec, f'§{alt}||{src}§', 1) - - def check_url_text(text) -> tuple[int, 
str]: - score = 0 - _valid_len = len(text.strip()) - # 找到所有[part0](part1)格式的片段 - link_pattern = r'(\[(.*?)\]\((.*?)\))' - matches = re.findall(link_pattern, text, re.DOTALL) - for _sec, link_text, link_url in matches: - # 处理 \"***\" 格式的片段 - quote_pattern = r'\"(.*?)\"' - # 提取所有引号包裹的内容 - _title = ''.join(re.findall(quote_pattern, link_url, re.DOTALL)) - - # 分离§§内的内容和后面的内容 - img_marker_pattern = r'§(.*?)\|\|(.*?)§' - inner_matches = re.findall(img_marker_pattern, link_text, re.DOTALL) - for alt, src in inner_matches: - link_text = link_text.replace(f'§{alt}||{src}§', '') - link_text = link_text.strip() - if _title not in link_text: - link_text = f"{_title} - {link_text}" - - link_text = link_text.strip() - if not link_text and inner_matches: - img_alt = inner_matches[0][0].strip() - img_src = inner_matches[0][1].strip() - if img_src and not img_src.startswith('#'): - img_src = normalize_url(img_src, base_url) - if not img_src: - link_text = img_alt - elif len(img_alt) > 2: - _key = f"[img{len(link_dict)+1}]" - link_dict[_key] = img_src - link_text = img_alt + _key - elif any(img_src.endswith(tld) or img_src.endswith(tld + '/') for tld in common_tlds): - _key = f"[img{len(link_dict)+1}]" - link_dict[_key] = img_src - link_text = img_alt + _key - elif any(img_src.endswith(ext) for ext in common_file_exts if ext not in ['jpg', 'jpeg', 'png']): - _key = f"[img{len(link_dict)+1}]" - link_dict[_key] = img_src - link_text = img_alt + _key - else: - if img_src not in to_be_recognized_by_visual_llm: - to_be_recognized_by_visual_llm[img_src] = f"§{len(to_be_recognized_by_visual_llm)+1}§" - _key = f"[img{len(link_dict)+1}]" - link_dict[_key] = img_src - link_text = to_be_recognized_by_visual_llm[img_src] + _key - else: - link_text = img_alt - - real_url_pattern = r'<(.*?)>' - real_url = re.search(real_url_pattern, link_url, re.DOTALL) - if real_url: - _url = real_url.group(1).strip() - else: - _url = re.sub(quote_pattern, '', link_url, re.DOTALL).strip() - - if not _url or _url.startswith(('#', 'javascript:')): - text = text.replace(_sec, link_text, 1) - continue - score += 1 - _valid_len = _valid_len - len(_sec) - url = normalize_url(_url, base_url) - _key = f"[{len(link_dict)+1}]" - link_dict[_key] = url - text = text.replace(_sec, link_text + _key, 1) - # 检查链接是否是常见文件类型或顶级域名 - # todo: 最后提取是否添加到 more_link时或者主流程时再处理 - """ - has_common_ext = any(url.endswith(ext) for ext in common_file_exts) - has_common_tld = any(url.endswith(tld) or url.endswith(tld + '/') for tld in common_tlds) - if has_common_ext or has_common_tld: - continue - """ - # 处理文本中的其他图片标记 - img_pattern = r'(§(.*?)\|\|(.*?)§)' - matches = re.findall(img_pattern, text, re.DOTALL) - remained_text = re.sub(img_pattern, '', text, re.DOTALL).strip() - remained_text_len = len(remained_text) - for _sec, alt, src in matches: - if not src or src.startswith('#') or src not in used_img: - text = text.replace(_sec, alt, 1) - continue - img_src = normalize_url(src, base_url) - if not img_src: - text = text.replace(_sec, alt, 1) - elif remained_text_len > 5 or len(alt) > 2: - _key = f"[img{len(link_dict)+1}]" - link_dict[_key] = img_src - text = text.replace(_sec, alt + _key, 1) - elif any(img_src.endswith(tld) or img_src.endswith(tld + '/') for tld in common_tlds): - _key = f"[img{len(link_dict)+1}]" - link_dict[_key] = img_src - text = text.replace(_sec, alt + _key, 1) - elif any(img_src.endswith(ext) for ext in common_file_exts if ext not in ['jpg', 'jpeg', 'png']): - _key = f"[img{len(link_dict)+1}]" - link_dict[_key] = img_src - text = 
text.replace(_sec, alt + _key, 1) - else: - if img_src not in to_be_recognized_by_visual_llm: - to_be_recognized_by_visual_llm[img_src] = f"§{len(to_be_recognized_by_visual_llm)+1}§" - _key = f"[img{len(link_dict)+1}]" - link_dict[_key] = img_src - text = text.replace(_sec, to_be_recognized_by_visual_llm[img_src] + _key, 1) - # 处理文本中的"野 url" - url_pattern = r'((?:https?://|www\.)[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[-A-Za-z0-9+&@#/%=~_|])' - matches = re.findall(url_pattern, text) - for url in matches: - url = normalize_url(url, base_url) - _key = f"[{len(link_dict)+1}]" - link_dict[_key] = url - text = text.replace(url, _key, 1) - score += 1 - _valid_len = _valid_len - len(url) - # 统计换行符数量 - newline_count = text.count(' * ') - score += newline_count - ratio = _valid_len/score if score != 0 else 999 - - return ratio, text - - sections = raw_markdown.split('# ') # use '# ' to avoid # in url - if len(sections) > 2: - _sec = sections[0] - section_remain = re.sub(r'\[.*?]\(.*?\)', '', _sec, re.DOTALL).strip() - section_remain_len = len(section_remain) - total_links = len(re.findall(r'\[.*?]\(.*?\)', _sec, re.DOTALL)) - ratio = total_links / section_remain_len if section_remain_len != 0 else 1 - if ratio > 0.05: - print('this is a navigation section, will be removed') - print(ratio) - print(section_remain) - print('-' * 50) - sections = sections[1:] - _sec = sections[-1] - section_remain = re.sub(r'\[.*?]\(.*?\)', '', _sec, re.DOTALL).strip() - section_remain_len = len(section_remain) - if section_remain_len < 198: - print('this is a footer section, will be removed') - print(section_remain_len) - print(section_remain) - print('-' * 50) - sections = sections[:-1] - - links_parts = [] - contents = [] - for section in sections: - ratio, text = check_url_text(section) - if ratio < 70: - print('this is a links part') - print(ratio) - print(text) - print('-' * 50) - links_parts.append(text) - else: - print('this is a content part') - print(ratio) - print(text) - print('-' * 50) - contents.append(text) - return link_dict, links_parts, contents diff --git a/core/utils/general_utils.py b/core/utils/general_utils.py index 0f72771..b355f07 100644 --- a/core/utils/general_utils.py +++ b/core/utils/general_utils.py @@ -1,10 +1,34 @@ -from urllib.parse import urlparse +from urllib.parse import urlparse, urljoin import os import re # import jieba from loguru import logger +def normalize_url(url: str, base_url: str) -> str: + url = url.strip() + if url.startswith(('www.', 'WWW.')): + _url = f"https://{url}" + elif url.startswith('/www.'): + _url = f"https:/{url}" + elif url.startswith("//"): + _url = f"https:{url}" + elif url.startswith(('http://', 'https://')): + _url = url + elif url.startswith('http:/'): + _url = f"http://{url[6:]}" + elif url.startswith('https:/'): + _url = f"https://{url[7:]}" + else: + _url = urljoin(base_url, url) + + _ss = _url.split('//') + if len(_ss) == 2: + return '//'.join(_ss) + else: + return _ss[0] + '//' + '/'.join(_ss[1:]) + + def isURL(string): if string.startswith("www."): string = f"https://{string}" diff --git a/env_sample b/env_sample index b5a9e84..a4100ab 100755 --- a/env_sample +++ b/env_sample @@ -1,8 +1,9 @@ export LLM_API_KEY="" export LLM_API_BASE="https://api.siliconflow.cn/v1" export PRIMARY_MODEL="Qwen/Qwen2.5-32B-Instruct" -#If your source pages are relatively simple with small amounts of information per page, considering cost and time (mainly time), Qwen2.5-32B-Instruct is recommended -#If your source pages contain more links, have complex layouts, and you don't want 
to miss any information, DeepSeek-V2.5 is recommended +export SECONDARY_MODEL="Qwen/Qwen2.5-7B-Instruct" +#use a secondary model to excute the filtering task for the cost saving +#if not set, will use the primary model to excute the filtering task export VL_MODEL="OpenGVLab/InternVL2-26B" export PB_API_AUTH="test@example.com|1234567890" ##your pb superuser account and password diff --git a/test/get_info_test.py b/test/get_info_test.py index 7defab8..dc0c8a2 100644 --- a/test/get_info_test.py +++ b/test/get_info_test.py @@ -1,92 +1,90 @@ # -*- coding: utf-8 -*- -import os, re, sys +import os, sys import json import asyncio import time -from prompts import * from datetime import datetime -current_dir = os.path.dirname(os.path.abspath(__file__)) -project_root = os.path.dirname(current_dir) # get parent dir -sys.path.append(project_root) +# 将core目录添加到Python路径 +core_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'core') +sys.path.append(core_path) -from core.llms.openai_wrapper import openai_llm as llm +# 现在可以直接导入模块,因为core目录已经在Python路径中 +from scrapers import * +from agents.get_info import pre_process + +from utils.general_utils import is_chinese +from agents.get_info import get_author_and_publish_date, get_info, get_more_related_urls +from agents.get_info_prompts import * benchmark_model = 'Qwen/Qwen2.5-72B-Instruct' -models = ['Qwen/Qwen2.5-7B-Instruct', 'Qwen/Qwen2.5-14B-Instruct', 'Qwen/Qwen2.5-32B-Instruct', 'deepseek-ai/DeepSeek-V2.5', 'internlm/internlm2_5-20b-chat'] -async def main(texts: list[str], link_dict: dict, record_file: str, sys_prompt: str, focus_points: list): - # first get more links - print(f'sys_prompt: \n{sys_prompt}') - benchmark_result = None +models = ['Qwen/Qwen2.5-7B-Instruct', 'Qwen/Qwen2.5-14B-Instruct', 'Qwen/Qwen2.5-32B-Instruct', 'deepseek-ai/DeepSeek-V2.5'] + +async def main(sample: dict, include_ap: bool, prompts: list, focus_dict: dict, record_file: str): + link_dict, links_parts, contents = sample['link_dict'], sample['links_part'], sample['contents'] + get_link_sys_prompt, get_link_suffix_prompt, get_info_sys_prompt, get_info_suffix_prompt = prompts + for model in [benchmark_model] + models: - _texts = [] - for text in texts: - _texts.extend(text.split('\n\n')) + links_texts = [] + for _parts in links_parts: + links_texts.extend(_parts.split('\n\n')) + contents = sample['contents'].copy() + print(f"running {model} ...") start_time = time.time() - hallucination_times = 0 - text_batch = '' - cache = set() - while _texts: - t = _texts.pop(0) - text_batch = f'{text_batch}{t}\n\n' - if len(text_batch) > 512 or len(_texts) == 0: - content = f'\n{text_batch}\n\n{get_info_suffix}' - result = await llm( - [{'role': 'system', 'content': sys_prompt}, {'role': 'user', 'content': content}], - model=model, temperature=0.1) - print(f"llm output\n{result}\n") - result = re.findall(r'\"\"\"(.*?)\"\"\"', result, re.DOTALL) - if result: - # 在result[-1]中找到所有类似[4]这样的片段 - links = re.findall(r'\[\d+\]', result[-1]) - for link in links: - if link not in text_batch: - hallucination_times += 1 - print(f'\n**not in text_batch: {link}**\n') - continue - cache.add(link) - text_batch = '' - - t1 = time.time() - get_infos_time = t1 - start_time - print(f"get more infos time: {get_infos_time}") - print("*" * 12) - print('\n\n') - - for link in cache: - if link not in link_dict: - print(f'\n**not in link_dict: {link}**\n') - if model == benchmark_model: - benchmark_result = cache.copy() - diff = 'benchmark' + if include_ap: + author, publish_date = await 
get_author_and_publish_date(contents[0], model, test_mode=True) + get_ap_time = time.time() - start_time + print(f"get author and publish date time: {get_ap_time}") else: - # 计算当前cache与benchmark的差异 - missing_in_cache = len(benchmark_result - cache) # benchmark中有但cache中没有的 - extra_in_cache = len(cache - benchmark_result) # cache中有但benchmark中没有的 + author, publish_date = '', '' + get_ap_time = 0 + + start_time = time.time() + more_url = await get_more_related_urls(links_texts, link_dict, [get_link_sys_prompt, get_link_suffix_prompt, model], test_mode=True) + get_more_url_time = time.time() - start_time + print(f"get more related urls time: {get_more_url_time}") + + start_time = time.time() + infos = await get_info(contents, link_dict, [get_info_sys_prompt, get_info_suffix_prompt, model], focus_dict, author, publish_date, test_mode=True) + get_info_time = time.time() - start_time + print(f"get info time: {get_info_time}") + + if model == benchmark_model: + benchmark_result = more_url.copy() + diff = f'benchmark: {len(benchmark_result)} results' + else: + missing_in_cache = len(benchmark_result - more_url) # benchmark中有但cache中没有的 + extra_in_cache = len(more_url - benchmark_result) # cache中有但benchmark中没有的 total_diff = missing_in_cache + extra_in_cache diff = f'差异{total_diff}个(遗漏{missing_in_cache}个,多出{extra_in_cache}个)' - infos_to_record = '\n'.join(list(set(link_dict[link] for link in cache))) - + related_urls_to_record = '\n'.join(more_url) + infos_to_record = [f"{fi['tag']}: {fi['content']}" for fi in infos] + infos_to_record = '\n'.join(infos_to_record) with open(record_file, 'a') as f: - f.write(f"llm model: {model}\n") - f.write(f"process time: {get_infos_time} s\n") - f.write(f"bad generate times: {hallucination_times}\n") + f.write(f"model: {model}\n") + if include_ap: + f.write(f"get author and publish date time: {get_ap_time}\n") + f.write(f"author: {author}\n") + f.write(f"publish date: {publish_date}\n") + f.write(f"get more related urls time: {get_more_url_time}\n") f.write(f"diff from benchmark: {diff}\n") - f.write(f"segments: \n{infos_to_record}\n") - f.write("*" * 12) + f.write(f"get info time: {get_info_time}\n") + f.write(f"related urls: \n{related_urls_to_record}\n") + f.write(f"final result: \n{infos_to_record}\n") f.write('\n\n') - + print('\n\n') if __name__ == '__main__': import argparse parser = argparse.ArgumentParser() parser.add_argument('--sample_dir', '-D', type=str, default='') + parser.add_argument('--include_ap', '-I', type=bool, default=False) args = parser.parse_args() sample_dir = args.sample_dir - + include_ap = args.include_ap if not os.path.exists(os.path.join(sample_dir, 'focus_point.json')): raise ValueError(f'{sample_dir} focus_point.json not found') @@ -97,27 +95,43 @@ if __name__ == '__main__': expl = item["explanation"] focus_statement = f"{focus_statement}//{tag}//\n" if expl: - focus_statement = f"{focus_statement}解释:{expl}\n" + if is_chinese(expl): + focus_statement = f"{focus_statement}解释:{expl}\n" + else: + focus_statement = f"{focus_statement}Explanation: {expl}\n" + + focus_dict = {item["focuspoint"]: item["focuspoint"] for item in focus_points} + date_stamp = datetime.now().strftime('%Y-%m-%d') + if is_chinese(focus_statement): + get_link_sys_prompt = get_link_system.replace('{focus_statement}', focus_statement) + get_link_sys_prompt = f"今天的日期是{date_stamp},{get_link_sys_prompt}" + get_link_suffix_prompt = get_link_suffix + get_info_sys_prompt = get_info_system.replace('{focus_statement}', focus_statement) + get_info_sys_prompt = 
f"今天的日期是{date_stamp},{get_info_sys_prompt}" + get_info_suffix_prompt = get_info_suffix + else: + get_link_sys_prompt = get_link_system_en.replace('{focus_statement}', focus_statement) + get_link_sys_prompt = f"today is {date_stamp}, {get_link_sys_prompt}" + get_link_suffix_prompt = get_link_suffix_en + get_info_sys_prompt = get_info_system_en.replace('{focus_statement}', focus_statement) + get_info_sys_prompt = f"today is {date_stamp}, {get_info_sys_prompt}" + get_info_suffix_prompt = get_info_suffix_en - get_info_system = get_info_system.replace('{focus_statement}', focus_statement) - system_prompt = f"今天的日期是{datetime.now().strftime('%Y-%m-%d')},{get_info_system}" - focus_points = [item["focuspoint"] for item in focus_points] + prompts = [get_link_sys_prompt, get_link_suffix_prompt, get_info_sys_prompt, get_info_suffix_prompt] time_stamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime()) record_file = os.path.join(sample_dir, f'record-{time_stamp}.txt') with open(record_file, 'w') as f: f.write(f"focus statement: \n{focus_statement}\n\n") - for dirs in os.listdir(sample_dir): - if not os.path.isdir(os.path.join(sample_dir, dirs)): + for file in os.listdir(sample_dir): + if not file.endswith('_processed.json'): continue - _path = os.path.join(sample_dir, dirs) - print(f'start testing {_path}') - if 'sample.json' not in os.listdir(_path): - print(f'{dirs} sample.json not found, skip') + sample = json.load(open(os.path.join(sample_dir, file), 'r')) + if 'links_part' not in sample or 'link_dict' not in sample or 'contents' not in sample: + print(f'{file} not valid sample, skip') continue - sample = json.load(open(os.path.join(_path, 'sample.json'), 'r')) - with open(record_file, 'a') as f: - f.write(f"raw materials in: {dirs}\n\n") - asyncio.run(main(sample['links_part'], sample['link_dict'], record_file, system_prompt, focus_points)) + f.write(f"raw materials: {file}\n\n") + print(f'start testing {file}') + asyncio.run(main(sample, include_ap, prompts, focus_dict, record_file)) diff --git a/test/deep_scraper_test.py b/test/pre_process_test.py similarity index 51% rename from test/deep_scraper_test.py rename to test/pre_process_test.py index d7cb1f0..cb00473 100644 --- a/test/deep_scraper_test.py +++ b/test/pre_process_test.py @@ -2,14 +2,16 @@ import os import sys import re -current_dir = os.path.dirname(os.path.abspath(__file__)) -project_root = os.path.dirname(current_dir) # 获取父目录 -sys.path.append(project_root) +# 将core目录添加到Python路径 +core_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'core') +sys.path.append(core_path) -from core.scrapers.deep_scraper import deep_scraper, common_chars -from core.scrapers.mp_scraper import mp_scraper +# 现在可以直接导入模块,因为core目录已经在Python路径中 +from scrapers import * +from agents.get_info import pre_process def check_url_text(text): + common_chars = ',.!;:,;:、一二三四五六七八九十#*@% \t\n\r|*-_…>#' print(f"processing: {text}") left_bracket = text.find('[') right_paren = text.rfind(')') @@ -56,25 +58,75 @@ def check_url_text(text): for match in matches: print(match) +async def main(html_sample, record_file): + recognized_img_cache = {} + parsed_url = urlparse(html_sample['url']) + domain = parsed_url.netloc + if domain in custom_scrapers: + result = custom_scrapers[domain](html_sample) + raw_markdown = result.content + used_img = result.images + title = result.title + base_url = result.base + author = result.author + publish_date = result.publish_date + else: + raw_markdown = html_sample['markdown'] + media_dict = html_sample['media'] if 
html_sample['media'] else {} + used_img = [d['src'] for d in media_dict.get('images', [])] + title = '' + base_url = '' + author = '' + publish_date = '' + + if not raw_markdown: + print(f"no raw_markdown for {file}") + return + + if not title: + title = html_sample.get('title', '') + if not base_url: + base_url = html_sample.get('base', '') + if not base_url: + base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}" + if not base_url.endswith('/'): + base_url = base_url.rsplit('/', 1)[0] + '/' + + if not author: + author = html_sample.get('author', '') + if not publish_date: + publish_date = html_sample.get('publish_date', '') + + link_dict, links_parts, contents, recognized_img_cache = await pre_process(raw_markdown, base_url, used_img, recognized_img_cache, test_mode=True) + result = { + "link_dict": link_dict, + "links_part": links_parts, + "contents": contents, + } + + with open(record_file, 'w', encoding='utf-8') as f: + json.dump(result, f, indent=4, ensure_ascii=False) + print(f"pre process done, saved to {record_file}") + + if __name__ == '__main__': import argparse - import time import json from urllib.parse import urlparse + import asyncio parser = argparse.ArgumentParser() parser.add_argument('--test_file', '-F', type=str, default='') parser.add_argument('--sample_dir', '-D', type=str, default='') - parser.add_argument('--test_string', '-T', type=str, default='') + parser.add_argument('--record_folder', '-R', type=str, default='') args = parser.parse_args() - if args.test_string: - check_url_text(args.test_string) - exit() - test_file = args.test_file sample_dir = args.sample_dir - + record_folder = args.record_folder + if record_folder: + os.makedirs(record_folder, exist_ok=True) + files = [] if test_file: files.append(test_file) @@ -84,43 +136,9 @@ if __name__ == '__main__': for file in files: if not file.endswith('.json'): continue - print(f"processing {file} ...") - try: - with open(file, 'r') as f: - html_sample = json.load(f) - _url = html_sample['url'] - if _url.startswith('https://mp.weixin.qq.com'): - result = mp_scraper(html_sample) - raw_markdown = result.content - used_img = result.images - else: - raw_markdown = html_sample['markdown'] - used_img = [d['src'] for d in html_sample['media']['images']] - except Exception as e: - print('sample format error, try to use craw4ai_fething.py to get sample') - print(f"error: {e}") - continue - - parsed_url = urlparse(_url) - base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}" - if not base_url.endswith('/'): - # 如果路径不以 / 结尾,则去掉最后一个路径段 - base_url = base_url.rsplit('/', 1)[0] + '/' - - time_start = time.time() - link_dict, links_part, contents = deep_scraper(raw_markdown, base_url, used_img) - time_end = time.time() - #print(f"time cost for html: {time_end - time_start}s") - - result = { - "link_dict": link_dict, - "links_part": links_part, - "contents": contents, - } - record_folder = file.replace('.json', '') - os.makedirs(record_folder, exist_ok=True) - with open(os.path.join(record_folder, 'sample.json'), 'w', encoding='utf-8') as f: - json.dump(result, f, indent=4, ensure_ascii=False) - #print("done") - #print("*" * 12) + with open(file, 'r') as f: + html_sample = json.load(f) + record_file = os.path.join(record_folder, f'{os.path.basename(file)}_processed.json') + + asyncio.run(main(html_sample, record_file)) diff --git a/test/prompts.py b/test/prompts.py deleted file mode 100644 index ad5005d..0000000 --- a/test/prompts.py +++ /dev/null @@ -1,94 +0,0 @@ - -get_info_system = 
'''你将被给到一段使用标签包裹的网页文本,你的任务是从前到后仔细阅读文本,提取出与如下任一关注点相关的原文片段。关注点及其解释如下: - -{focus_statement}\n -在进行提取时,请遵循以下原则: -- 理解关注点的含义以及进一步的解释(如有),确保提取的内容与关注点强相关并符合解释(如有)的范围 -- 在满足上面原则的前提下,提取出全部可能相关的片段 -- 提取出的原文片段务必保留类似"[3]"这样的引用标记,后续的处理需要用到这些引用标记''' - -get_info_suffix = '''请逐条输出提取的原文片段,并整体用三引号包裹。三引号内除了提取出的原文片段外不要有其他内容,如果文本中不包含任何与关注点相关的内容则保持三引号内为空。 -如下是输出格式示例:: -""" -原文片段1 -原文片段2 -... -"""''' - -text_info_system = '''你将被给到一段使用标签包裹的网页文本,请分别按如下关注点对网页文本提炼摘要。关注点列表及其解释如下: - -{focus_statement}\n -在提炼摘要时,请遵循以下原则: -- 理解每个关注点的含义以及进一步的解释(如有),确保摘要与关注点强相关并符合解释(如有)的范围 -- 摘要应当详实、充分 -- 摘要信息务必忠于原文''' - -text_info_suffix = '''请对关注点逐一生成摘要,不要遗漏任何关注点,如果网页文本与关注点无关,可以对应输出"NA"。输出结果整体用三引号包裹,三引号内不要有其他内容。如下是输出格式示例: -""" -//关注点1// -摘要1 -//关注点2// -摘要2 -//关注点3// -NA -... -"""''' - -text_link_system = '''你将被给到数行格式为"<编号>//内容//"的文本,你的任务是逐条分析这些文本,并分别与如下关注点之一相关联。关注点列表及其解释如下: - -{focus_statement}\n -在进行关联分析时,请遵循以下原则: - -- 理解每个关注点的含义 -- 如果关注点有进一步的解释,确保提取的内容符合这些解释的范围''' - -text_link_suffix = '''请分行逐条输出结果,每一条的输出格式为"<编号>//关注点名称//",如果某条内容不与任何关注点相关,请输出"<编号>//NA//"。输出结果整体用三引号包裹,三引号内不要有其他内容。如下是输出格式示例: -""" -//关注点1名称// -//关注点2名称// -//NA// -... -"""''' - -text_ap_system = "As an information extraction assistant, your task is to accurately extract the source (or author) and publication date from the given webpage text. It is important to adhere to extracting the information directly from the original text. If the original text does not contain a particular piece of information, please replace it with NA" -text_ap_suffix = '''Please output the extracted information in the following format(output only the result, no other content): -"""source or article author (use "NA" if this information cannot be extracted)//extracted publication date (keep only the year, month, and day; use "NA" if this information cannot be extracted)"""''' - - -verified_system = '''判断给定的信息是否与网页文本相符。信息将用标签包裹,网页文本则用包裹。请遵循如下工作流程: -1、尝试找出网页文本中所有与信息对应的文本片段(可能有多处); -2、基于这些片段给出是否相符的最终结论,最终结论仅为“是”或“否”''' -verified_suffix = '先输出找到的所有文本片段,再输出最终结论(仅为是或否)' - - -image_info_system = '''作为信息提取助手,你的任务是从给定的网页截屏中提取与以下用户兴趣点相关的内容。兴趣点列表及其解释如下: - -{focus_statement}\n -在进行信息提取时,请遵循以下原则: - -- 理解每个兴趣点的含义,确保提取的内容与之相关。 -- 如果兴趣点有进一步的解释,确保提取的内容符合这些解释的范围。 -- 忠于原文,你的任务是从网页截屏中识别和提取与各个兴趣点相关的信息,并不是总结和提炼。''' - -image_info_suffix = '''如果网页截屏中包含兴趣点相关的内容,请按照以下json格式输出提取的信息(文本中可能包含多条有用信息,请不要遗漏): -[{"focus": 兴趣点名称, "content": 提取的内容}] - -示例: -[{"focus": "旅游景点", "content": "北京故宫,地址:北京市东城区景山前街4号,开放时间:8:30-17:00"}, {"focus": "美食推荐", "content": "来王府井小吃街必吃北京烤鸭、炸酱面"}] - -如果截屏中不包含任何与兴趣点相关的信息或者你判断这是一个文章列表页面,请仅输出:[]。''' - -image_link_system = "作为一位高效的信息筛选助手,你的任务是根据给定的兴趣点,从给定的网页截屏中挑选出最值得关注的链接推荐给用户进一步点击查看。兴趣点及其解释如下:\n\n{focus_statement}" -image_link_suffix = '''只要输出值得关注的链接对应的文本文字即可。按一行一条的格式输出,最终输出的列表整体用三引号包裹,三引号内不要有其他内容,如下是输出格式示例: -""" -链接文字1 -链接文字2 -... -"""''' - -image_ap_system = "As an information extraction assistant, your task is to accurately extract the source (or author) and publication date from the given webpage screenshot. If the screenshot does not contain a particular piece of information, please replace it with NA" -image_ap_suffix = '''Please output the extracted information in the following JSON format: -{"source": source or article author (use "NA" if this information cannot be found), "publish_date": publication date (keep only the year, month, and day; use "NA" if this information cannot be found)}''' - -image_system = "提取图片中的所有文字,如果图片不包含文字或者文字很少或者你判断图片仅是网站logo、商标、图标等,则输出NA。注意请仅输出提取出的文字,不要输出别的任何内容。" -image_system_en = "Extract all text from the image. 
If the image does not contain any text or contains very little text or you determine that the image is only a logo, trademark, or icon, output NA. Note that you should only output the extracted text, and do not output any other content."
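A note on the section classifier introduced in core/agents/get_info.py above: pre_process splits the markdown on '# ' headings and asks check_url_text for a ratio of surviving plain-text length to the number of links it found; sections whose ratio falls below 70 are routed to links_parts, the rest to contents. The sketch below only illustrates that idea — the real scoring also counts bare URLs and ' * ' bullet markers, so its numbers differ, and the sample text is made up.

```python
import re

LINK_PATTERN = r'\[.*?\]\(.*?\)'  # markdown [text](url) links

def link_density_ratio(section: str) -> float:
    """Rough text-per-link ratio; low values suggest a navigation/link-list section."""
    links = re.findall(LINK_PATTERN, section, re.DOTALL)
    remaining = re.sub(LINK_PATTERN, '', section, flags=re.DOTALL).strip()
    return len(remaining) / len(links) if links else 999.0

sample = "[Home](/) [News](/news) [About](/about) short caption"
# pre_process uses 70 as the cut-off between link lists and real content
print("links part" if link_density_ratio(sample) < 70 else "content part")
```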
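The normalize_url helper added to core/utils/general_utils.py upgrades 'www.'-style and protocol-relative references, repairs single-slash scheme typos, resolves everything else against the base URL with urljoin, and collapses duplicated slashes after the scheme. A minimal usage sketch, assuming the same sys.path trick the rewritten test scripts use; the URLs are purely illustrative and the expected outputs in the comments reflect my reading of the branches.

```python
import os, sys

# mirror the test scripts: put the core directory on the import path (adjust to your checkout)
core_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'core')
sys.path.append(core_path)

from utils.general_utils import normalize_url

base = "https://example.com/blog/post/"
print(normalize_url("www.example.com/page", base))             # https://www.example.com/page
print(normalize_url("//cdn.example.com/img/a.png", base))      # https://cdn.example.com/img/a.png
print(normalize_url("../about.html", base))                    # https://example.com/blog/about.html
print(normalize_url("https://example.com//news//item", base))  # duplicate slashes after the scheme collapse
```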
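env_sample now documents SECONDARY_MODEL for the cheaper filtering step, with the primary model as the fallback when it is unset. The actual wiring lives in the core modules; the snippet below is only a sketch of the fallback pattern the comments describe, with illustrative variable names.

```python
import os

primary_model = os.environ.get("PRIMARY_MODEL", "")
if not primary_model:
    raise ValueError("PRIMARY_MODEL must be set; see env_sample")

# use the cheaper secondary model for the filtering task when configured,
# otherwise fall back to the primary model
secondary_model = os.environ.get("SECONDARY_MODEL") or primary_model
```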
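In the reworked test/get_info_test.py, each model's get_more_related_urls output is scored against the Qwen2.5-72B benchmark run with two set differences: URLs the benchmark found that the model missed, and URLs the model returned that the benchmark did not. A toy example of the same bookkeeping, with made-up URL sets:

```python
benchmark_result = {"https://a.com/1", "https://a.com/2", "https://a.com/3"}
more_url = {"https://a.com/2", "https://a.com/3", "https://b.com/9"}

missing = len(benchmark_result - more_url)  # found by the benchmark, missed by the model
extra = len(more_url - benchmark_result)    # returned by the model, absent from the benchmark
print(f"diff {missing + extra} (missing {missing}, extra {extra})")  # diff 2 (missing 1, extra 1)
```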