0.3.7 release

This commit is contained in:
bigbrother666sh 2025-01-17 23:28:22 +08:00
parent e2f3903bb8
commit dd7d92476e
18 changed files with 764 additions and 839 deletions

View File

@ -115,6 +115,7 @@ siliconflow硅基流动提供大部分主流开源模型的在线 MaaS 服
export LLM_API_KEY=Your_API_KEY
export LLM_API_BASE="https://api.siliconflow.cn/v1"
export PRIMARY_MODEL="Qwen/Qwen2.5-32B-Instruct"
export SECONDARY_MODEL="Qwen/Qwen2.5-7B-Instruct"
export VL_MODEL="OpenGVLab/InternVL2-26B"
```
@ -129,6 +130,7 @@ export VL_MODEL="OpenGVLab/InternVL2-26B"
export LLM_API_KEY=Your_API_KEY
export LLM_API_BASE="https://aihubmix.com/v1" # 具体参考 https://doc.aihubmix.com/
export PRIMARY_MODEL="gpt-4o"
export SECONDARY_MODEL="gpt-4o-mini"
export VL_MODEL="gpt-4o"
```

View File

@ -114,6 +114,7 @@ Siliconflow provides online MaaS services for most mainstream open-source models
export LLM_API_KEY=Your_API_KEY
export LLM_API_BASE="https://api.siliconflow.cn/v1"
export PRIMARY_MODEL="Qwen/Qwen2.5-32B-Instruct"
export SECONDARY_MODEL="Qwen/Qwen2.5-7B-Instruct"
export VL_MODEL="OpenGVLab/InternVL2-26B"
```
@ -129,6 +130,7 @@ When using AiHubMix models, the .env configuration can refer to the following:
export LLM_API_KEY=Your_API_KEY
export LLM_API_BASE="https://aihubmix.com/v1" # refer to https://doc.aihubmix.com/
export PRIMARY_MODEL="gpt-4o"
export SECONDARY_MODEL="gpt-4o-mini"
export VL_MODEL="gpt-4o"
```
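For reference, a minimal sketch of how these variables are read on the Python side, mirroring the fallback logic in `core/general_process.py` (SECONDARY_MODEL falls back to PRIMARY_MODEL when unset, and a missing VL_MODEL only skips image recognition):

```python
import os

# PRIMARY_MODEL is required; SECONDARY_MODEL and VL_MODEL are optional.
model = os.environ.get("PRIMARY_MODEL", "")
if not model:
    raise ValueError("PRIMARY_MODEL not set, please set it in environment variables or edit core/.env")

# The secondary model handles the cheaper link-filtering task and falls back to the primary model.
secondary_model = os.environ.get("SECONDARY_MODEL", model)

# Without a VL model, extracting info from images is skipped and some info may be lost.
vl_model = os.environ.get("VL_MODEL", "")
```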

View File

@ -114,6 +114,7 @@ Siliconflowは、主流のオープンソースモデルのほとんどにオン
export LLM_API_KEY=Your_API_KEY
export LLM_API_BASE="https://api.siliconflow.cn/v1"
export PRIMARY_MODEL="Qwen/Qwen2.5-32B-Instruct"
export SECONDARY_MODEL="Qwen/Qwen2.5-7B-Instruct"
export VL_MODEL="OpenGVLab/InternVL2-26B"
```
@ -129,6 +130,7 @@ AiHubMixモデルを使用する場合、.envの設定は以下を参考にし
export LLM_API_KEY=Your_API_KEY
export LLM_API_BASE="https://aihubmix.com/v1" # refer https://doc.aihubmix.com/
export PRIMARY_MODEL="gpt-4o"
export SECONDARY_MODEL="gpt-4o-mini"
export VL_MODEL="gpt-4o"
```
😄 [AiHubMixの紹介リンク](https://aihubmix.com?aff=Gp54)からご登録いただけますと幸いです 🌹

View File

@ -114,6 +114,7 @@ Siliconflow는 대부분의 주류 오픈소스 모델에 대한 온라인 MaaS
export LLM_API_KEY=Your_API_KEY
export LLM_API_BASE="https://api.siliconflow.cn/v1"
export PRIMARY_MODEL="Qwen/Qwen2.5-32B-Instruct"
export SECONDARY_MODEL="Qwen/Qwen2.5-7B-Instruct"
export VL_MODEL="OpenGVLab/InternVL2-26B"
```
@ -129,6 +130,7 @@ AiHubMix 모델을 사용할 때 .env 구성은 다음을 참조할 수 있습
export LLM_API_KEY=Your_API_KEY
export LLM_API_BASE="https://aihubmix.com/v1" # refer https://doc.aihubmix.com/
export PRIMARY_MODEL="gpt-4o"
export SECONDARY_MODEL="gpt-4o-mini"
export VL_MODEL="gpt-4o"
```

View File

@ -1,15 +1,222 @@
# -*- coding: utf-8 -*-
import asyncio
from loguru import logger
import os, re
from utils.pb_api import PbTalker
from llms.openai_wrapper import openai_llm as llm
# from core.llms.siliconflow_wrapper import sfa_llm # or other llm wrapper
from utils.general_utils import is_chinese, extract_and_convert_dates, normalize_url
from .get_info_prompts import *
common_file_exts = [
'jpg', 'jpeg', 'png', 'gif', 'pdf', 'doc', 'docx', 'svg', 'm3u8',
'mp4', 'mp3', 'wav', 'avi', 'mov', 'wmv', 'flv', 'webp', 'webm',
'zip', 'rar', '7z', 'tar', 'gz', 'bz2',
'txt', 'csv', 'xls', 'xlsx', 'ppt', 'pptx',
'json', 'xml', 'yaml', 'yml', 'css', 'js', 'php', 'asp', 'jsp'
]
common_tlds = [
'.com', '.cn', '.net', '.org', '.edu', '.gov', '.io', '.co',
'.info', '.biz', '.me', '.tv', '.cc', '.xyz', '.app', '.dev',
'.cloud', '.ai', '.tech', '.online', '.store', '.shop', '.site',
'.top', '.vip', '.pro', '.ltd', '.group', '.team', '.work'
]
async def pre_process(raw_markdown: str, base_url: str, used_img: list[str],
recognized_img_cache: dict, existing_urls: set = set(),
test_mode: bool = False) -> tuple[dict, list[str], list[str], dict]:
link_dict = {}
# for special url format from crawl4ai 0.4.247
raw_markdown = re.sub(r'<javascript:.*?>', '<javascript:>', raw_markdown).strip()
# 处理图片标记 ![alt](src)
i_pattern = r'(!\[(.*?)\]\((.*?)\))'
matches = re.findall(i_pattern, raw_markdown, re.DOTALL)
for _sec, alt, src in matches:
# 替换为新格式 §alt||src§
raw_markdown = raw_markdown.replace(_sec, f'§{alt}||{src}§', 1)
async def check_url_text(text) -> tuple[int, str]:
score = 0
_valid_len = len(text.strip())
# 找到所有[part0](part1)格式的片段
link_pattern = r'(\[(.*?)\]\((.*?)\))'
matches = re.findall(link_pattern, text, re.DOTALL)
for _sec, link_text, link_url in matches:
# 处理 \"***\" 格式的片段
quote_pattern = r'\"(.*?)\"'
# 提取所有引号包裹的内容
_title = ''.join(re.findall(quote_pattern, link_url, re.DOTALL))
_title = _title.strip()
link_text = link_text.strip()
if _title and _title not in link_text:
link_text = f"{_title} - {link_text}"
real_url_pattern = r'<(.*?)>'
real_url = re.search(real_url_pattern, link_url, re.DOTALL)
if real_url:
_url = real_url.group(1).strip()
else:
_url = re.sub(quote_pattern, '', link_url, re.DOTALL).strip()
if not _url or _url.startswith(('#', 'javascript:')):
text = text.replace(_sec, link_text, 1)
continue
score += 1
_valid_len = _valid_len - len(_sec)
url = normalize_url(_url, base_url)
# 分离§§内的内容和后面的内容
img_marker_pattern = r'§(.*?)\|\|(.*?)§'
inner_matches = re.findall(img_marker_pattern, link_text, re.DOTALL)
for alt, src in inner_matches:
link_text = link_text.replace(f'§{alt}||{src}§', '')
if not link_text and inner_matches:
img_alt = inner_matches[0][0].strip()
img_src = inner_matches[0][1].strip()
if img_src and not img_src.startswith('#'):
img_src = normalize_url(img_src, base_url)
if not img_src:
link_text = img_alt
elif len(img_alt) > 2 or url in existing_urls:
_key = f"[img{len(link_dict)+1}]"
link_dict[_key] = img_src
link_text = img_alt + _key
elif any(img_src.endswith(tld) or img_src.endswith(tld + '/') for tld in common_tlds):
_key = f"[img{len(link_dict)+1}]"
link_dict[_key] = img_src
link_text = img_alt + _key
elif any(img_src.endswith(ext) for ext in common_file_exts if ext not in ['jpg', 'jpeg', 'png']):
_key = f"[img{len(link_dict)+1}]"
link_dict[_key] = img_src
link_text = img_alt + _key
else:
if img_src not in recognized_img_cache:
recognized_img_cache[img_src] = await extract_info_from_img(img_src)
_key = f"[img{len(link_dict)+1}]"
link_dict[_key] = img_src
link_text = recognized_img_cache[img_src] + _key
else:
link_text = img_alt
_key = f"[{len(link_dict)+1}]"
link_dict[_key] = url
text = text.replace(_sec, link_text + _key, 1)
# 处理文本中的其他图片标记
img_pattern = r'(§(.*?)\|\|(.*?)§)'
matches = re.findall(img_pattern, text, re.DOTALL)
remained_text = re.sub(img_pattern, '', text, re.DOTALL).strip()
remained_text_len = len(remained_text)
for _sec, alt, src in matches:
if not src or src.startswith('#') or src not in used_img:
text = text.replace(_sec, alt, 1)
continue
img_src = normalize_url(src, base_url)
if not img_src:
text = text.replace(_sec, alt, 1)
elif remained_text_len > 5 or len(alt) > 2:
_key = f"[img{len(link_dict)+1}]"
link_dict[_key] = img_src
text = text.replace(_sec, alt + _key, 1)
elif any(img_src.endswith(tld) or img_src.endswith(tld + '/') for tld in common_tlds):
_key = f"[img{len(link_dict)+1}]"
link_dict[_key] = img_src
text = text.replace(_sec, alt + _key, 1)
elif any(img_src.endswith(ext) for ext in common_file_exts if ext not in ['jpg', 'jpeg', 'png']):
_key = f"[img{len(link_dict)+1}]"
link_dict[_key] = img_src
text = text.replace(_sec, alt + _key, 1)
else:
if img_src not in recognized_img_cache:
recognized_img_cache[img_src] = await extract_info_from_img(img_src)
_key = f"[img{len(link_dict)+1}]"
link_dict[_key] = img_src
text = text.replace(_sec, recognized_img_cache[img_src] + _key, 1)
# 处理文本中的"野 url"
url_pattern = r'((?:https?://|www\.)[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[-A-Za-z0-9+&@#/%=~_|])'
matches = re.findall(url_pattern, text)
for url in matches:
url = normalize_url(url, base_url)
_key = f"[{len(link_dict)+1}]"
link_dict[_key] = url
text = text.replace(url, _key, 1)
score += 1
_valid_len = _valid_len - len(url)
# 统计换行符数量
newline_count = text.count(' * ')
score += newline_count
ratio = _valid_len/score if score != 0 else 999
return ratio, text
sections = raw_markdown.split('# ') # use '# ' to avoid # in url
if len(sections) > 2:
_sec = sections[0]
section_remain = re.sub(r'\[.*?]\(.*?\)', '', _sec, re.DOTALL).strip()
section_remain_len = len(section_remain)
total_links = len(re.findall(r'\[.*?]\(.*?\)', _sec, re.DOTALL))
ratio = total_links / section_remain_len if section_remain_len != 0 else 1
if ratio > 0.05:
if test_mode:
print('this is a navigation section, will be removed')
print(ratio)
print(section_remain)
print('-' * 50)
sections = sections[1:]
_sec = sections[-1]
section_remain = re.sub(r'\[.*?]\(.*?\)', '', _sec, re.DOTALL).strip()
section_remain_len = len(section_remain)
if section_remain_len < 198:
if test_mode:
print('this is a footer section, will be removed')
print(section_remain_len)
print(section_remain)
print('-' * 50)
sections = sections[:-1]
links_parts = []
contents = []
for section in sections:
ratio, text = await check_url_text(section)
if ratio < 70:
if test_mode:
print('this is a links part')
print(ratio)
print(text)
print('-' * 50)
links_parts.append(text)
else:
if test_mode:
print('this is a content part')
print(ratio)
print(text)
print('-' * 50)
contents.append(text)
return link_dict, links_parts, contents, recognized_img_cache
vl_model = os.environ.get("VL_MODEL", "")
if not vl_model:
print("VL_MODEL not set, will skip extracting info from img, some info may be lost!")
async def extract_info_from_img(url: str) -> str:
if not vl_model:
return '§to_be_recognized_by_visual_llm§'
llm_output = await llm([{"role": "user",
"content": [{"type": "image_url", "image_url": {"url": url, "detail": "high"}},
{"type": "text", "text": "提取图片中的所有文字如果图片不包含文字或者文字很少或者你判断图片仅是网站logo、商标、图标等则输出NA。注意请仅输出提取出的文字不要输出别的任何内容。"}]}],
model=vl_model)
return llm_output
async def get_author_and_publish_date(text: str, model: str, test_mode: bool = False, _logger: logger = None) -> tuple[str, str]:
if not text: if not text:
return "", "" return "", ""
@ -19,245 +226,122 @@ async def get_author_and_publish_date(text: str, model: str) -> tuple[str, str]:
if len(text) > 2048: if len(text) > 2048:
text = f'{text[:2048]}......' text = f'{text[:2048]}......'
system_prompt = "As an information extraction assistant, your task is to accurately extract the source (or author) and publication date from the given webpage text. It is important to adhere to extracting the information directly from the original text. If the original text does not contain a particular piece of information, please replace it with NA" content = f'<text>\n{text}\n</text>\n\n{get_ap_suffix}'
suffix = '''Please output the extracted information in the following format(output only the result, no other content): llm_output = await llm([{'role': 'system', 'content': get_ap_system}, {'role': 'user', 'content': content}],
"""source or article author (use "NA" if this information cannot be extracted)//extracted publication date (keep only the year, month, and day; use "NA" if this information cannot be extracted)"""''' model=model, max_tokens=50, temperature=0.1)
if test_mode:
content = f'<text>\n{text}\n</text>\n\n{suffix}' print(f"llm output:\n {llm_output}")
llm_output = await llm([{'role': 'system', 'content': system_prompt}, {'role': 'user', 'content': content}],
model=model, max_tokens=50, temperature=0.1)
ap_ = llm_output.strip().strip('"').strip('//') ap_ = llm_output.strip().strip('"').strip('//')
if '//' not in ap_: if '//' not in ap_:
print(f"failed to parse from llm output: {ap_}") if _logger:
_logger.warning(f"failed to parse from llm output: {ap_}")
return '', '' return '', ''
ap = ap_.split('//') ap = ap_.split('//')
return ap[0], extract_and_convert_dates(ap[1]) return ap[0], extract_and_convert_dates(ap[1])
async def extract_info_from_img(task: list, vl_model: str) -> dict: async def get_more_related_urls(texts: list[str], link_dict: dict, prompts: list[str], test_mode: bool = False,
cache = {} _logger: logger = None) -> set:
for url in task:
llm_output = await llm([{"role": "user", sys_prompt, suffix, model = prompts
"content": [{"type": "image_url", "image_url": {"url": url, "detail": "high"}}, text_batch = ''
{"type": "text", "text": "提取图片中的所有文字如果图片不包含文字或者文字很少或者你判断图片仅是网站logo、商标、图标等则输出NA。注意请仅输出提取出的文字不要输出别的任何内容。"}]}], cache = set()
model=vl_model) while texts:
t = texts.pop(0)
cache[url] = llm_output text_batch = f'{text_batch}{t}\n\n'
return cache if len(text_batch) > 2048 or len(texts) == 0:
class GeneralInfoExtractor:
def __init__(self, pb: PbTalker, _logger: logger) -> None:
self.pb = pb
self.logger = _logger
self.model = os.environ.get("PRIMARY_MODEL", "")
if not self.model:
self.logger.error("PRIMARY_MODEL not set, can't continue")
raise ValueError("PRIMARY_MODEL not set, please set it in environment variables or edit core/.env")
# collect tags user set in pb database and determin the system prompt language based on tags
focus_data = pb.read(collection_name='focus_points', filter=f'activated=True')
if not focus_data:
self.logger.info('no activated tag found, will ask user to create one')
focus = input('It seems you have not set any focus point, WiseFlow need the specific focus point to guide the following info extract job.'
'so please input one now. describe what info you care about shortly: ')
explanation = input('Please provide more explanation for the focus point (if not necessary, pls just type enter: ')
focus_data.append({"focuspoint": focus, "explanation": explanation,
"id": pb.add('focus_points', {"focuspoint": focus, "explanation": explanation})})
# self.focus_list = [item["focuspoint"] for item in focus_data]
self.focus_dict = {item["focuspoint"]: item["id"] for item in focus_data}
focus_statement = ''
for item in focus_data:
tag = item["focuspoint"]
expl = item["explanation"]
focus_statement = f"{focus_statement}//{tag}//\n"
if expl:
if is_chinese(expl):
focus_statement = f"{focus_statement}解释:{expl}\n"
else:
focus_statement = f"{focus_statement}Explanation: {expl}\n"
if is_chinese(focus_statement):
self.get_info_prompt = f'''你将被给到一段使用<text></text>标签包裹的网页文本,请分别按如下关注点对网页文本提炼摘要。关注点列表及其解释如下:
{focus_statement}\n
在提炼摘要时请遵循以下原则
- 理解每个关注点的含义以及进一步的解释如有确保摘要与关注点强相关并符合解释如有的范围
- 摘要应当详实充分使用简体中文如果原文是英文请翻译成简体中文
- 摘要信息务必忠于原文'''
self.get_info_suffix = '''请对关注点逐一生成摘要,不要遗漏任何关注点,如果网页文本与关注点无关,可以对应输出"NA"。输出结果整体用三引号包裹,三引号内不要有其他内容。如下是输出格式示例:
"""
//关注点1//
摘要1
//关注点2//
摘要2
//关注点3//
NA
...
"""'''
self.get_more_link_prompt = f'''你将被给到数行格式为"<编号>//内容//"的文本,你的任务是逐条分析这些文本,并分别与如下关注点之一相关联。关注点列表及其解释如下:
{focus_statement}\n
在进行关联分析时请遵循以下原则
- 理解每个关注点的含义
- 如果关注点有进一步的解释确保提取的内容符合这些解释的范围'''
self.get_more_link_suffix = '''请分行逐条输出结果,每一条的输出格式为"<编号>//关注点名称//",如果某条内容不与任何关注点相关,请输出"<编号>//NA//"。输出结果整体用三引号包裹,三引号内不要有其他内容。如下是输出格式示例:
"""
<t1>//关注点1名称//
<t2>//关注点2名称//
<t3>//NA//
...
"""'''
else:
self.get_info_prompt = f'''You will be given a webpage text wrapped in <text></text> tags. Please extract summaries from the text according to the following focus points. The list of focus points and their explanations are as follows:
{focus_statement}\n
When extracting summaries, please follow these principles:
- Understand the meaning of each focus point and its explanation (if any), ensure the summary strongly relates to the focus point and aligns with the explanation (if any)
- The summary should be detailed and comprehensive
- The summary should be faithful to the original text'''
self.get_info_suffix = '''Please generate summaries for each focus point, don't miss any focus points. If the webpage text is not related to a focus point, output "NA" for that point. The entire output should be wrapped in triple quotes with no other content inside. Here is an example of the output format:
"""
//Focus Point 1//
Summary 1
//Focus Point 2//
Summary 2
//Focus Point 3//
NA
...
"""'''
self.get_more_link_prompt = f'''You will be given several lines of text in the format "<index>//content//". Your task is to analyze each line and associate it with one of the following focus points. The list of focus points and their explanations are as follows:
{focus_statement}\n
When performing the association analysis, please follow these principles:
- Understand the meaning of each focus point
- If a focus point has further explanation, ensure the extracted content aligns with the scope of these explanations'''
self.get_more_link_suffix = '''Please output the results line by line. Each line should be in the format "<index>//focus point name//". If a line is not related to any focus point, output "<index>//NA//". The entire output should be wrapped in triple quotes with no other content inside. Here is an example of the output format:
"""
<t1>//Focus Point 1//
<t2>//Focus Point 2//
<t3>//NA//
...
"""'''
async def _generate_results(self, lines: list, mode: str) -> set:
if mode == 'get_info':
system_prompt = self.get_info_prompt
suffix = self.get_info_suffix
batch_size = 5000
elif mode == 'get_link':
system_prompt = self.get_more_link_prompt
suffix = self.get_more_link_suffix
batch_size = 2048
else:
self.logger.error(f"unknown mode: {mode}")
return set()
cache = set()
batches = []
text_batch = ''
for line in lines:
text_batch += f'{line}\n'
if len(text_batch) > batch_size:
content = f'<text>\n{text_batch}</text>\n\n{suffix}'
batches.append({'system_prompt': system_prompt, 'content': content})
text_batch = ''
if text_batch:
content = f'<text>\n{text_batch}</text>\n\n{suffix}' content = f'<text>\n{text_batch}</text>\n\n{suffix}'
batches.append({'system_prompt': system_prompt, 'content': content}) result = await llm(
[{'role': 'system', 'content': sys_prompt}, {'role': 'user', 'content': content}],
model=model, temperature=0.1)
self.logger.info(f"LLM tasks size: {len(batches)}") result = re.findall(r'\"\"\"(.*?)\"\"\"', result, re.DOTALL)
tasks = [ if test_mode:
llm( print(f"llm output:\n {result}")
[{'role': 'system', 'content': batch['system_prompt']}, {'role': 'user', 'content': batch['content']}], if result:
model=self.model, temperature=0.1 links = re.findall(r'\[\d+\]', result[-1])
) for link in links:
for batch in batches] if link not in text_batch:
results = await asyncio.gather(*tasks) if _logger:
for res in results: _logger.warning(f"model generating hallucination:\n{result[-1]}")
if res: if test_mode:
extracted_result = re.findall(r'\"\"\"(.*?)\"\"\"', res, re.DOTALL) print(f"model hallucination:\n{result[-1]}")
if extracted_result: continue
cache.add(extracted_result[-1]) cache.add(link)
text_batch = ''
return cache more_urls = set()
for mark in cache:
url = link_dict[mark]
has_common_ext = any(url.endswith(ext) for ext in common_file_exts)
has_common_tld = any(url.endswith(tld) or url.endswith(tld + '/') for tld in common_tlds)
if has_common_ext or has_common_tld:
continue
more_urls.add(url)
return more_urls
async def get_more_related_urls(self, link_dict: dict) -> set: async def get_info(texts: list[str], link_dict: dict, prompts: list[str], focus_dict: dict, author: str, publish_date: str,
_to_be_processed = [] test_mode: bool = False, _logger: logger = None) -> list[dict]:
link_map = {}
for i, (url, des) in enumerate(link_dict.items()):
des = des.replace('\n', ' ')
_to_be_processed.append(f'<t{i+1}>//{des}//')
link_map[f'<t{i+1}'] = url
raw_result = await self._generate_results(_to_be_processed, 'get_link') sys_prompt, suffix, model = prompts
final_result = set()
for result in raw_result:
for item in result.split('\n'):
if not item:
continue
segs = item.split('>')
if len(segs) != 2:
self.logger.debug(f"bad generate result: {item}")
continue
_index, focus = segs
_index = _index.strip()
focus = focus.strip().strip('//')
if focus == 'NA':
continue
if focus not in self.focus_dict or _index not in link_map:
self.logger.debug(f"bad generate result: {item}")
continue
# self.logger.debug(f"{link_map[_index]} selected")
final_result.add(link_map[_index])
return final_result
async def get_info(self, text: str, text_links: dict, info_pre_fix: str) -> list[dict]: if test_mode:
raw_result = await self._generate_results(text.split('\n'), 'get_info') info_pre_fix = ''
final = [] else:
for item in raw_result: info_pre_fix = f"//{author} {publish_date}//"
self.logger.debug(f"llm output:\n{item}")
segs = item.split('//')
i = 0
while i < len(segs) - 1:
focus = segs[i].strip()
if not focus:
i += 1
continue
if focus not in self.focus_dict:
self.logger.debug(f"bad generate result: {item}")
i += 1
continue
content = segs[i+1].strip().strip('摘要').strip(':').strip('')
i += 2
if not content or content == 'NA':
continue
"""
maybe can use embedding retrieval to judge
"""
url_tags = re.findall(r'\[(Ref_\d+)]', content) cache = set()
refences = {url_tag: text_links[url_tag] for url_tag in url_tags if url_tag in text_links} batches = []
text_batch = ''
while texts:
t = texts.pop(0)
text_batch = f'{text_batch}{t}# '
if len(text_batch) > 9999 or len(texts) == 0:
content = f'<text>\n{text_batch}</text>\n\n{suffix}'
batches.append(content)
text_batch = ''
final.append({'tag': self.focus_dict[focus], 'content': f"{info_pre_fix}{content}", 'references': refences}) tasks = [
llm([{'role': 'system', 'content': sys_prompt}, {'role': 'user', 'content': content}], model=model, temperature=0.1)
for content in batches]
results = await asyncio.gather(*tasks)
for res in results:
if test_mode:
print(f"llm output:\n {res}")
extracted_result = re.findall(r'\"\"\"(.*?)\"\"\"', res, re.DOTALL)
if extracted_result:
cache.add(extracted_result[-1])
final = []
for item in cache:
segs = item.split('//')
i = 0
while i < len(segs) - 1:
focus = segs[i].strip()
if not focus:
i += 1
continue
if focus not in focus_dict:
if _logger:
_logger.info(f"llm hallucination: {item}")
if test_mode:
print(f"llm hallucination: {item}")
i += 1
continue
content = segs[i+1].strip().strip('摘要').strip(':').strip('')
i += 2
if not content or content == 'NA':
continue
"""
maybe can use embedding retrieval to judge
"""
url_tags = re.findall(r'\[\d+\]', content)
refences = {url_tag: link_dict[url_tag] for url_tag in url_tags if url_tag in link_dict}
final.append({'tag': focus_dict[focus], 'content': f"{info_pre_fix}{content}", 'references': refences})
return final return final
async def __call__(self, link_dict: dict, text: str, text_links: dict, author: str, publish_date: str) -> tuple[set, list]:
info_prefix = f"//{author} {publish_date}//"
return await self.get_more_related_urls(link_dict), await self.get_info(text, text_links, info_prefix)

View File

@ -0,0 +1,74 @@
get_link_system = '''你将被给到一段使用<text></text>标签包裹的网页文本,你的任务是从前到后仔细阅读文本,提取出与如下任一关注点相关的原文片段。关注点及其解释如下:
{focus_statement}\n
在进行提取时请遵循以下原则
- 理解关注点的含义以及进一步的解释如有确保提取的内容与关注点强相关并符合解释如有的范围
- 在满足上面原则的前提下提取出全部可能相关的片段
- 提取出的原文片段务必保留类似"[3]"这样的引用标记后续的处理需要用到这些引用标记'''
get_link_suffix = '''请逐条输出提取的原文片段,并整体用三引号包裹。三引号内除了提取出的原文片段外不要有其他内容,如果文本中不包含任何与关注点相关的内容则保持三引号内为空。
如下是输出格式示例
"""
原文片段1
原文片段2
...
"""'''
get_link_system_en = '''You will be given a webpage text wrapped in <text></text> tags. Your task is to carefully read the text from beginning to end, extracting fragments related to any of the following focus points. The focus points and their explanations are as follows:
{focus_statement}\n
When extracting fragments, please follow these principles:
- Understand the meaning of each focus point and its explanation (if any), ensure the extracted content strongly relates to the focus point and aligns with the explanation (if any)
- Extract all possible related fragments
- Ensure the extracted fragments retain the reference markers like "[3]", as these will be used in subsequent processing'''
get_link_suffix_en = '''Please output each extracted fragment one by one, and wrap the entire output in triple quotes. The triple quotes should contain only the extracted fragments, with no other content. If the text does not contain any content related to the focus points, keep the triple quotes empty.
Here is an example of the output format:
"""
Fragment 1
Fragment 2
...
"""'''
get_info_system = '''你将被给到一段使用<text></text>标签包裹的网页文本,请分别按如下关注点对网页文本提炼摘要。关注点列表及其解释如下:
{focus_statement}\n
在提炼摘要时请遵循以下原则
- 理解每个关注点的含义以及进一步的解释如有确保摘要与关注点强相关并符合解释如有的范围
- 摘要应当详实充分使用简体中文如果原文是英文请翻译成简体中文
- 摘要信息务必忠于原文'''
get_info_suffix = '''请对关注点逐一生成摘要,不要遗漏任何关注点,如果网页文本与关注点无关,可以对应输出"NA"。输出结果整体用三引号包裹,三引号内不要有其他内容。如下是输出格式示例:
"""
//关注点1//
摘要1
//关注点2//
摘要2
//关注点3//
NA
...
"""'''
get_info_system_en = '''You will be given a webpage text wrapped in <text></text> tags. Please extract summaries from the text according to the following focus points. The list of focus points and their explanations are as follows:
{focus_statement}\n
When extracting summaries, please follow these principles:
- Understand the meaning of each focus point and its explanation (if any), ensure the summary strongly relates to the focus point and aligns with the explanation (if any)
- The summary should be detailed and comprehensive
- The summary should be faithful to the original text'''
get_info_suffix_en = '''Please generate summaries for each focus point, don't miss any focus points. If the webpage text is not related to a focus point, output "NA" for that point. The entire output should be wrapped in triple quotes with no other content inside. Here is an example of the output format:
"""
//Focus Point 1//
Summary 1
//Focus Point 2//
Summary 2
//Focus Point 3//
NA
...
"""'''
get_ap_system = "As an information extraction assistant, your task is to accurately extract the source (or author) and publication date from the given webpage text. It is important to adhere to extracting the information directly from the original text. If the original text does not contain a particular piece of information, please replace it with NA"
get_ap_suffix = '''Please output the extracted information in the following format(output only the result, no other content):
"""source or article author (use "NA" if this information cannot be extracted)//extracted publication date (keep only the year, month, and day; use "NA" if this information cannot be extracted)"""'''

View File

@ -1,11 +1,10 @@
# -*- coding: utf-8 -*-
from utils.pb_api import PbTalker
from utils.general_utils import get_logger, extract_and_convert_dates, is_chinese
from utils.deep_scraper import *
from agents.get_info import *
import json
import asyncio
from scrapers import *
from urllib.parse import urlparse
from crawl4ai import AsyncWebCrawler, CacheMode
from datetime import datetime, timedelta
@ -19,18 +18,14 @@ if project_dir:
wiseflow_logger = get_logger('general_process', project_dir)
pb = PbTalker(wiseflow_logger)
gie = GeneralInfoExtractor(pb, wiseflow_logger)
one_month_ago = (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')
existing_urls = {url['url'] for url in pb.read(collection_name='infos', fields=['url'], filter=f"created>='{one_month_ago}'")}
crawler = AsyncWebCrawler(verbose=False)
model = os.environ.get("PRIMARY_MODEL", "")
if not model:
raise ValueError("PRIMARY_MODEL not set, please set it in environment variables or edit core/.env")
secondary_model = os.environ.get("SECONDARY_MODEL", model)
img_to_be_recognized_pattern = r'§to_be_recognized_by_visual_llm_(.*?)§'
recognized_img_cache = {}
async def save_to_pb(url: str, url_title: str, infos: list):
# saving to pb process
@ -46,112 +41,142 @@ async def save_to_pb(url: str, url_title: str, infos: list):
async def main_process(_sites: set | list):
# collect tags user set in pb database and determine the system prompt language based on tags
focus_data = pb.read(collection_name='focus_points', filter=f'activated=True')
if not focus_data:
wiseflow_logger.info('no activated tag found, will ask user to create one')
focus = input('It seems you have not set any focus point, WiseFlow need the specific focus point to guide the following info extract job.'
'so please input one now. describe what info you care about shortly: ')
explanation = input('Please provide more explanation for the focus point (if not necessary, pls just press enter: ')
focus_data.append({"focuspoint": focus, "explanation": explanation,
"id": pb.add('focus_points', {"focuspoint": focus, "explanation": explanation})})
focus_dict = {item["focuspoint"]: item["id"] for item in focus_data}
focus_statement = ''
for item in focus_data:
tag = item["focuspoint"]
expl = item["explanation"]
focus_statement = f"{focus_statement}//{tag}//\n"
if expl:
if is_chinese(expl):
focus_statement = f"{focus_statement}解释:{expl}\n"
else:
focus_statement = f"{focus_statement}Explanation: {expl}\n"
date_stamp = datetime.now().strftime('%Y-%m-%d')
if is_chinese(focus_statement):
get_link_sys_prompt = get_link_system.replace('{focus_statement}', focus_statement)
get_link_sys_prompt = f"今天的日期是{date_stamp}{get_link_sys_prompt}"
get_link_suffix_prompt = get_link_suffix
get_info_sys_prompt = get_info_system.replace('{focus_statement}', focus_statement)
get_info_sys_prompt = f"今天的日期是{date_stamp}{get_info_sys_prompt}"
get_info_suffix_prompt = get_info_suffix
else:
get_link_sys_prompt = get_link_system_en.replace('{focus_statement}', focus_statement)
get_link_sys_prompt = f"today is {date_stamp}, {get_link_sys_prompt}"
get_link_suffix_prompt = get_link_suffix_en
get_info_sys_prompt = get_info_system_en.replace('{focus_statement}', focus_statement)
get_info_sys_prompt = f"today is {date_stamp}, {get_info_sys_prompt}"
get_info_suffix_prompt = get_info_suffix_en
recognized_img_cache = {}
working_list = set() working_list = set()
working_list.update(_sites) working_list.update(_sites)
async with AsyncWebCrawler(headless=True, verbose=False) as crawler: await crawler.start()
while working_list: while working_list:
url = working_list.pop() url = working_list.pop()
existing_urls.add(url) existing_urls.add(url)
has_common_ext = any(url.lower().endswith(ext) for ext in common_file_exts) has_common_ext = any(url.lower().endswith(ext) for ext in common_file_exts)
if has_common_ext: if has_common_ext:
wiseflow_logger.info(f'{url} is a common file, skip') wiseflow_logger.info(f'{url} is a common file, skip')
continue continue
parsed_url = urlparse(url) parsed_url = urlparse(url)
existing_urls.add(f"{parsed_url.scheme}://{parsed_url.netloc}") existing_urls.add(f"{parsed_url.scheme}://{parsed_url.netloc}")
existing_urls.add(f"{parsed_url.scheme}://{parsed_url.netloc}/") existing_urls.add(f"{parsed_url.scheme}://{parsed_url.netloc}/")
domain = parsed_url.netloc domain = parsed_url.netloc
if domain in custom_scrapers: if domain in custom_fetching_configs:
wiseflow_logger.debug(f'{url} is a custom scraper, use custom scraper') wiseflow_logger.debug(f'{url} will using custom crawl4ai run config')
raw_markdown, metadata_dict, media_dict = custom_scrapers[domain](url) run_config = custom_fetching_configs[domain]
else: else:
crawl4ai_cache_mode = CacheMode.WRITE_ONLY if url in _sites else CacheMode.ENABLED run_config = crawler_config
result = await crawler.arun(url=url, delay_before_return_html=2.0, wait_until='commit',
magic=True, scan_full_page=True, run_config.cache_mode = CacheMode.WRITE_ONLY if url in _sites else CacheMode.ENABLED
cache_mode=crawl4ai_cache_mode) result = await crawler.arun(url=url, config=run_config)
if not result.success: if not result.success:
wiseflow_logger.warning(f'{url} failed to crawl, destination web cannot reach, skip') wiseflow_logger.warning(f'{url} failed to crawl, destination web cannot reach, skip')
continue continue
metadata_dict = result.metadata if result.metadata else {}
raw_markdown = result.markdown if domain in custom_scrapers:
if not raw_markdown: result = custom_scrapers[domain](result)
wiseflow_logger.warning(f'{url} no content, something during fetching failed, skip') raw_markdown = result.content
continue used_img = result.images
metadata_dict = result.metadata if result.metadata else {} title = result.title
media_dict = result.media if result.media else {} base_url = result.base
author = result.author
publish_date = result.publish_date
else:
raw_markdown = result.markdown
media_dict = result.media if result.media else {}
used_img = [d['src'] for d in media_dict.get('images', [])]
title = ''
base_url = ''
author = ''
publish_date = ''
web_title = metadata_dict.get('title', '') if not raw_markdown:
wiseflow_logger.warning(f'{url} no content, something during fetching failed, skip')
continue
if not title:
title = metadata_dict.get('title', '')
if not base_url:
base_url = metadata_dict.get('base', '') base_url = metadata_dict.get('base', '')
if not base_url: if not base_url:
base_url = url base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
if not base_url.endswith('/'):
# 如果路径不以 / 结尾,则去掉最后一个路径段
base_url = base_url.rsplit('/', 1)[0] + '/'
if not author:
author = metadata_dict.get('author', '') author = metadata_dict.get('author', '')
publish_date = extract_and_convert_dates(metadata_dict.get('publish_date', '')) if not publish_date:
publish_date = metadata_dict.get('publish_date', '')
img_dict = media_dict.get('images', []) link_dict, links_parts, contents, recognized_img_cache = await pre_process(raw_markdown, base_url, used_img, recognized_img_cache, existing_urls)
if not img_dict or not isinstance(img_dict, list):
used_img = []
else:
used_img = [d['src'] for d in img_dict]
link_dict, (text, reference_map) = deep_scraper(raw_markdown, base_url, used_img) if link_dict and links_parts:
_duplicate_url = set(link_dict.keys()) & existing_urls prompts = [get_link_sys_prompt, get_link_suffix_prompt, secondary_model]
for _d in _duplicate_url: links_texts = []
del link_dict[_d] for _parts in links_parts:
links_texts.extend(_parts.split('\n\n'))
more_url = await get_more_related_urls(links_texts, link_dict, prompts, _logger=wiseflow_logger)
if more_url:
working_list.update(more_url - existing_urls)
if not contents:
continue
to_be_replaces = {} if not author or author.lower() == 'na' or not publish_date or publish_date.lower() == 'na':
for u, des in link_dict.items(): author, publish_date = await get_author_and_publish_date(raw_markdown, model, _logger=wiseflow_logger)
matches = re.findall(img_to_be_recognized_pattern, des)
if matches:
for img_url in matches:
if img_url in recognized_img_cache:
link_dict[u] = des.replace(f'§to_be_recognized_by_visual_llm_{img_url}§', recognized_img_cache[img_url])
continue
link_dict[u] = des.replace(f'§to_be_recognized_by_visual_llm_{img_url}§', img_url)
if img_url in to_be_replaces:
to_be_replaces[img_url].append(u)
else:
to_be_replaces[img_url] = [u]
matches = re.findall(img_to_be_recognized_pattern, text)
if matches:
for img_url in matches:
if f'h{img_url}' in recognized_img_cache:
text = text.replace(f'§to_be_recognized_by_visual_llm_{img_url}§', recognized_img_cache[f'h{img_url}'])
continue
text = text.replace(f'§to_be_recognized_by_visual_llm_{img_url}§', f'h{img_url}')
img_url = f'h{img_url}'
if img_url in to_be_replaces:
to_be_replaces[img_url].append("content")
else:
to_be_replaces[img_url] = ["content"]
recognized_result = await extract_info_from_img(list(to_be_replaces.keys()), vl_model) if not author or author.lower() == 'na':
wiseflow_logger.debug(f'total {len(recognized_result)} imgs be recognized') author = parsed_url.netloc
recognized_img_cache.update({key: value for key, value in recognized_result.items() if value.strip()})
for img_url, content in recognized_result.items(): if publish_date:
for u in to_be_replaces[img_url]: publish_date = extract_and_convert_dates(publish_date)
if u == "content": else:
text = text.replace(img_url, content) publish_date = date_stamp
else:
link_dict[u] = link_dict[u].replace(img_url, content)
if not author or author.lower() == 'na' or not publish_date or publish_date.lower() == 'na':
author, publish_date = await get_author_and_publish_date(text, llm_model)
wiseflow_logger.debug(f'get author and publish date by llm: {author}, {publish_date}')
if not author or author.lower() == 'na':
author = parsed_url.netloc
if not publish_date:
publish_date = datetime.now().strftime('%Y-%m-%d')
more_urls, infos = await gie(link_dict, text, reference_map, author, publish_date)
wiseflow_logger.debug(f'get {len(more_urls)} more urls and {len(infos)} infos')
if more_urls:
working_list.update(more_urls - existing_urls)
if infos:
await save_to_pb(url, web_title, infos)
prompts = [get_info_sys_prompt, get_info_suffix_prompt, model]
infos = await get_info(contents, link_dict, prompts, focus_dict, author, publish_date, _logger=wiseflow_logger)
if infos:
await save_to_pb(url, title, infos)
await crawler.close()
if __name__ == '__main__':
sites = pb.read('sites', filter='activated=True')
wiseflow_logger.info('execute all sites one time')
asyncio.run(main_process([site['url'].rstrip('/') for site in sites]))

View File

@ -1,6 +1,6 @@
import os
from openai import AsyncOpenAI as OpenAI
# from openai import RateLimitError
import asyncio
base_url = os.environ.get('LLM_API_BASE', "")
@ -30,7 +30,7 @@ async def openai_llm(messages: list, model: str, logger=None, **kwargs) -> str:
try:
response = await client.chat.completions.create(messages=messages, model=model, **kwargs)
resp = response.choices[0].message.content
except Exception as e:
if logger:
logger.warning(f'{e}\nRetrying in 60 second...')
else:
@ -44,13 +44,6 @@ async def openai_llm(messages: list, model: str, logger=None, **kwargs) -> str:
logger.error(f'after many try, llm error: {response}')
else:
print(f'after many try, llm error: {response}')
except Exception as e:
if logger:
logger.error(f'openai_llm error: {e}')
else:
print(f'openai_llm error: {e}')
finally:
semaphore.release()
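A condensed sketch of the retry shape this change implies: any failure of the completion call now takes the same wait-and-retry path, rather than only `RateLimitError`. The helper name, retry count, and sleep interval below are illustrative assumptions; `client` is an `AsyncOpenAI` instance as in the surrounding file:

```python
import asyncio

async def call_with_retry(client, messages: list, model: str, logger=None,
                          attempts: int = 2, wait_s: int = 60, **kwargs) -> str:
    # Any exception (rate limit, network error, malformed response) triggers the same retry path.
    resp = ""
    for _ in range(attempts):
        try:
            response = await client.chat.completions.create(messages=messages, model=model, **kwargs)
            resp = response.choices[0].message.content
            break
        except Exception as e:
            if logger:
                logger.warning(f'{e}\nRetrying in {wait_s} seconds...')
            else:
                print(f'{e}\nRetrying in {wait_s} seconds...')
            await asyncio.sleep(wait_s)
    return resp
```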

View File

@ -1,10 +1,10 @@
# 配置自定义 Crawl4ai 抓取 config
如果信源需要对应特殊的抓取配置,可以在 `core/scrapers/__init__.py` 中编辑对应的 crawler_config并在 `custom_fetching_configs` 中注册。
# 解析器Scraper
对于从网页内容中提取关注信息这一任务而言,直接把 html 编码送给 llm 并不是一个好主意。在该类型任务中,我们期待 llm 表现的类似人类,侧重点在于内容的理解,而不是 html 的解析。且不说直接送入 html 编码还会造成额外(非常大量)的 token 消耗和处理效率的降低。 对于从网页内容中提取关注信息这一任务而言,直接把 html 编码送给 llm 并不是一个好主意,这会极大的增加提取任务的复杂度,引入更多干扰,并且产生额外(非常大量)的 token 消耗和处理效率的降低。
将 html 转为易于意思理解的 markdown 是目前领域内比较通用的做法,这方面 Crawl4ai 提供了比较成熟的解决方案。
@ -12,50 +12,24 @@
简单的说,解析器的作用就是将 html 编码转为 markdown 文本,并在这个过程中尽量过滤不必要信息(因为后一步是通过 llm 进行提炼,所以这一步要求不高),但也尽可能的保留 html 版面布局信息(这很重要)。
### deep_scraper 你并不需要通过解析器完成最终的信息提取,这个工作最终还是会使用 llm 完成——甚至在这之前我们还有一个被称为pre-process的步骤它的主要功能是将待处理的文章 markdown 合理切块并将 url 和图片等进行合理的转化,事实上,这个模块是本项目的一大创新点——解析器只需要提供适合 pre-process 的 markdown(我们称为 raw_markdown)和有价值的图片列表即可。
我们进一步发现,直接将 markdown 全文送入 llm 解析也存在缺陷。 ## 自定义解析器
我在这里仅举一个例子 scraper 输入的 fetch_result 为一个 dict 或者是 crawl4ai 的 CrawlResult 对象,它包含如下字段
*很多网站喜欢在文章页面底部或者侧边栏加入推荐阅读板块,如果说这些推荐阅读只是链接列表还好,但事实上,很多时候他们还包括内容简介,这些简介的长度并不短,甚至有可能跟页面主体正文长度相当。这个时候如果我们将 markdown 整体扔给 llm就会发现很难为llm 指定明确的工作策略——如果直接舍弃这些推荐阅读内容(先不说很难指定清晰的舍弃策略),但我们不能保证这里面不包含关注点内容;而如果保留这些内容,那么很可能 llm 就无法聚焦该页面的核心内容。或者 llm 会从这些简介中进行信息提取,但是这些简介对应额外的链接,这些后续的链接也会在后面进行爬取,这就可能带来提取出大量重复信息的情况。* - url: str, 网页的 url
- html: str, 网页的 html 编码
- cleaned_html: str, 经过清洗的 html 编码
- markdown: str, 经过清洗的 markdown 编码
- media: dict, 包含图片、视频、音频等媒体信息
- metadata: dict, 包含网页的元数据,如标题、作者、发布时间等
事实上,这里我们需要做的工作是分块,这有点类似 RAG 系统中的 chunk ,但不同的是,这里我们不需要考虑 chunk 的粒度,而是需要考虑页面布局的粒度。因为我们面对的是 html 页面,而不是 pdf、word…… scraper 的输出为 ScraperResultData具体见 `core/scrapers/scraper_data.py`
这一点很重要,我们需要按 html 的页面布局进行分块,而不是按语义逻辑分块!因为这影响了后续我们如何判断对不同的块采用合同提取策略。这也就是 wiseflow 为何不使用已有的文档智能工具,而是自写了 deep_scraper 的原因。 ## 注册自定义解析器
当然,另一个选择是直接使用视觉大模型进行 layout 的识别,但实践中我们也发现,这需要能够获取不受干扰的网页截图,但这个操作会极大增加系统复杂度以及降低处理速度,且效果并不稳定(比如对于页面弹窗的处理……)。 编写好 scraper 后,在 `core/scrapers/__init__.py` 中注册,参考:
另一个不使用文档智能和视觉大模型的原因,是因为相比于 pdf、word 这种完全的非结构数据, html 编码本身就已经包含了全部 layout 信息,转化为 markdown 的过程实际上也保留了这些信息(通过\n # 这些符号),所以直接通过一定的规则对 markdown 进行分块并分别处理是可行的。
这就是 wiseflow deep_scraper 的主要功能归纳起来1、按布局信息对markdown进行分块2、分析每个块的类型并按不同策略进行预处理便于最终 llm 的提取。
### 注册自定义解析器
wiseflow 的默认工作流程是:
*crawl4ai 获取 html并初步转化为raw_markdown此过程应用默认的 config --> deep_scraper 进行分块处理 --> 分块后的内容 送入 llm 进行信息提取。*
如前所言,如果需要为特定信源配置特殊的 crawl4ai 获取策略(包括 raw_markdown 的转化策略),可以在 `core/scrapers/__init__.py` 中注册自定义的crawler_config
同时也可以为特定信源配置自定义的 scraper自定义 scraper 的输入为crawl4ai的fetching_result输出为将要被送入 llm 进行分析的链接字典和文本块列表。使用自定义 scraper 时wiseflow 的处理流程为:
*crawl4ai 获取 html并初步转化为raw_markdown此过程应用默认的 config或指定 config --> 自定义 scraper 进行分块处理 --> 分块后的内容 送入 llm 进行信息提取。*
自定义 scraper 可以内部调用deep_scraper作为后处理流程如mp_scraper也可以完全自定义全部流程。
scraper 输入的 fetch_result 为一个 dict格式如下
输出为 ScraperResultData包含 url、content、links、images 四个字段。
`core/scrapers/__init__.py` 中注册,参考:
```python ```python
from .mp import mp_scarper from .mp import mp_scarper

View File

@ -1,15 +1,43 @@
# Configure Custom Crawl4ai Fetching Config
If a source requires special fetching configuration, you can edit the corresponding crawler_config in `core/scrapers/__init__.py` and register it in `custom_fetching_configs`.

# Scraper
For the task of extracting focused information from web content, directly feeding HTML code to the LLM is not a good idea. It greatly increases the complexity of the extraction task, introduces more interference, and results in additional (very large) token consumption and reduced processing efficiency.

Converting HTML to markdown that is easy to understand semantically is common practice in the field, and Crawl4ai provides a relatively mature solution for this.

However, that covers the general case; there is no one-size-fits-all solution. For certain sources, Crawl4ai's default parser may not work well, such as WeChat public account articles. In these cases, we need to customize scrapers for those sources.

Simply put, the scraper's role is to convert HTML code into markdown text, filtering out unnecessary information in the process (the next step is refinement through the LLM, so requirements here are not high) while preserving the HTML layout information as much as possible (this is important).

You don't need to complete the final information extraction in the scraper. That work is ultimately done by the LLM; before it there is also a step called pre-process, whose main job is to segment the article markdown sensibly and transform URLs and images appropriately. This module is in fact a major innovation point of this project. The scraper only needs to provide raw_markdown suitable for pre-process and a list of valuable images.

## Custom Scraper
The fetch_result input to the scraper is either a dict or a Crawl4ai CrawlResult object containing the following fields:

- url: str, the webpage URL
- html: str, the webpage HTML code
- cleaned_html: str, cleaned HTML code
- markdown: str, cleaned markdown code
- media: dict, media information such as images, videos, and audio
- metadata: dict, webpage metadata such as title, author, and publish time

The scraper output is a ScraperResultData; see `core/scrapers/scraper_data.py` for details.
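Since `scraper_data.py` itself is not shown here, the following is only a rough sketch of what a ScraperResultData along these lines might look like, assuming the fields the pipeline reads (`content`, `images`, `title`, `base`, `author`, `publish_date`) plus `url` and `links`; the real definition may differ:

```python
from dataclasses import dataclass, field

@dataclass
class ScraperResultData:
    # Hypothetical sketch; the actual definition lives in core/scrapers/scraper_data.py.
    url: str = ""                                 # the page URL
    content: str = ""                             # raw_markdown suitable for pre-process
    links: dict = field(default_factory=dict)     # link markers -> URLs, if the scraper resolves them itself
    images: list = field(default_factory=list)    # list of valuable image URLs
    title: str = ""                               # optional metadata the pipeline reads when present
    base: str = ""
    author: str = ""
    publish_date: str = ""
```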
## Register Custom Scraper
After writing the scraper, register it in `core/scrapers/__init__.py`, for example:

```python
from .mp import mp_scraper
custom_scrapers = {'mp.weixin.qq.com': mp_scraper}
```

Note that the key uses the domain name, which can be obtained using `urllib.parse`:

```python
from urllib.parse import urlparse

View File

@ -1,7 +1,8 @@
from crawl4ai import CrawlerRunConfig
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from .mp_scraper import mp_scraper
custom_scrapers = {'mp.weixin.qq.com': mp_scraper}
custom_fetching_configs = {}
md_generator = DefaultMarkdownGenerator(
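To illustrate the per-domain registration described in the scraper README, here is a hedged sketch of adding a custom fetching config; the domain and values are illustrative, and the parameters mirror options already used elsewhere in this commit (`delay_before_return_html`, `wait_until`, `scan_full_page`, `magic`, `DefaultMarkdownGenerator`):

```python
from crawl4ai import CrawlerRunConfig
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

# A slower, full-page config for a hypothetical JS-heavy source.
example_config = CrawlerRunConfig(
    markdown_generator=DefaultMarkdownGenerator(),
    delay_before_return_html=2.0,
    wait_until='commit',
    scan_full_page=True,
    magic=True,
)

# Keyed by domain; general_process.py looks this up before calling crawler.arun(url, config=...).
custom_fetching_configs = {'example.com': example_config}
```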

View File

@ -1,225 +0,0 @@
# -*- coding: utf-8 -*-
# This program requires HTML to be first converted to properly formatted text while preserving link positions and structural information (like crawl4ai's html2text work);
# The complete media list from the webpage needs to be extracted beforehand
# Currently this script only handles images and links, other elements like downloads and videos are not processed yet, todo: process according to media list
# action_dict needs to be extracted from raw html, which is not covered by this script
import re
from urllib.parse import urljoin
common_file_exts = [
'jpg', 'jpeg', 'png', 'gif', 'pdf', 'doc', 'docx', 'svg', 'm3u8',
'mp4', 'mp3', 'wav', 'avi', 'mov', 'wmv', 'flv', 'webp', 'webm',
'zip', 'rar', '7z', 'tar', 'gz', 'bz2',
'txt', 'csv', 'xls', 'xlsx', 'ppt', 'pptx',
'json', 'xml', 'yaml', 'yml', 'css', 'js', 'php', 'asp', 'jsp'
]
common_tlds = [
'.com', '.cn', '.net', '.org', '.edu', '.gov', '.io', '.co',
'.info', '.biz', '.me', '.tv', '.cc', '.xyz', '.app', '.dev',
'.cloud', '.ai', '.tech', '.online', '.store', '.shop', '.site',
'.top', '.vip', '.pro', '.ltd', '.group', '.team', '.work'
]
common_chars = ',.!;:,;:、一二三四五六七八九十#*@% \t\n\r|*-_…>#'
def normalize_url(url: str, base_url: str) -> str:
url = url.strip()
if url.startswith(('www.', 'WWW.')):
_url = f"https://{url}"
elif url.startswith('/www.'):
_url = f"https:/{url}"
elif url.startswith("//"):
_url = f"https:{url}"
elif url.startswith(('http://', 'https://')):
_url = url
elif url.startswith('http:/'):
_url = f"http://{url[6:]}"
elif url.startswith('https:/'):
_url = f"https://{url[7:]}"
else:
_url = urljoin(base_url, url)
_ss = _url.split('//')
if len(_ss) == 2:
return '//'.join(_ss)
else:
return _ss[0] + '//' + '/'.join(_ss[1:])
def deep_scraper(raw_markdown: str, base_url: str, used_img: list[str]) -> tuple[dict, list[str], list[str]]:
link_dict = {}
to_be_recognized_by_visual_llm = {}
# for special url formate from crawl4ai 0.4.247
raw_markdown = re.sub(r'<javascript:.*?>', '<javascript:>', raw_markdown).strip()
# 处理图片标记 ![alt](src)
i_pattern = r'(!\[(.*?)\]\((.*?)\))'
matches = re.findall(i_pattern, raw_markdown, re.DOTALL)
for _sec, alt, src in matches:
# 替换为新格式 §alt||src§
raw_markdown = raw_markdown.replace(_sec, f'§{alt}||{src}§', 1)
def check_url_text(text) -> tuple[int, str]:
score = 0
_valid_len = len(text.strip())
# 找到所有[part0](part1)格式的片段
link_pattern = r'(\[(.*?)\]\((.*?)\))'
matches = re.findall(link_pattern, text, re.DOTALL)
for _sec, link_text, link_url in matches:
# 处理 \"***\" 格式的片段
quote_pattern = r'\"(.*?)\"'
# 提取所有引号包裹的内容
_title = ''.join(re.findall(quote_pattern, link_url, re.DOTALL))
# 分离§§内的内容和后面的内容
img_marker_pattern = r'§(.*?)\|\|(.*?)§'
inner_matches = re.findall(img_marker_pattern, link_text, re.DOTALL)
for alt, src in inner_matches:
link_text = link_text.replace(f'§{alt}||{src}§', '')
link_text = link_text.strip()
if _title not in link_text:
link_text = f"{_title} - {link_text}"
link_text = link_text.strip()
if not link_text and inner_matches:
img_alt = inner_matches[0][0].strip()
img_src = inner_matches[0][1].strip()
if img_src and not img_src.startswith('#'):
img_src = normalize_url(img_src, base_url)
if not img_src:
link_text = img_alt
elif len(img_alt) > 2:
_key = f"[img{len(link_dict)+1}]"
link_dict[_key] = img_src
link_text = img_alt + _key
elif any(img_src.endswith(tld) or img_src.endswith(tld + '/') for tld in common_tlds):
_key = f"[img{len(link_dict)+1}]"
link_dict[_key] = img_src
link_text = img_alt + _key
elif any(img_src.endswith(ext) for ext in common_file_exts if ext not in ['jpg', 'jpeg', 'png']):
_key = f"[img{len(link_dict)+1}]"
link_dict[_key] = img_src
link_text = img_alt + _key
else:
if img_src not in to_be_recognized_by_visual_llm:
to_be_recognized_by_visual_llm[img_src] = f"§{len(to_be_recognized_by_visual_llm)+1}§"
_key = f"[img{len(link_dict)+1}]"
link_dict[_key] = img_src
link_text = to_be_recognized_by_visual_llm[img_src] + _key
else:
link_text = img_alt
real_url_pattern = r'<(.*?)>'
real_url = re.search(real_url_pattern, link_url, re.DOTALL)
if real_url:
_url = real_url.group(1).strip()
else:
_url = re.sub(quote_pattern, '', link_url, re.DOTALL).strip()
if not _url or _url.startswith(('#', 'javascript:')):
text = text.replace(_sec, link_text, 1)
continue
score += 1
_valid_len = _valid_len - len(_sec)
url = normalize_url(_url, base_url)
_key = f"[{len(link_dict)+1}]"
link_dict[_key] = url
text = text.replace(_sec, link_text + _key, 1)
# 检查链接是否是常见文件类型或顶级域名
# todo: 最后提取是否添加到 more_link时或者主流程时再处理
"""
has_common_ext = any(url.endswith(ext) for ext in common_file_exts)
has_common_tld = any(url.endswith(tld) or url.endswith(tld + '/') for tld in common_tlds)
if has_common_ext or has_common_tld:
continue
"""
# 处理文本中的其他图片标记
img_pattern = r'(§(.*?)\|\|(.*?)§)'
matches = re.findall(img_pattern, text, re.DOTALL)
remained_text = re.sub(img_pattern, '', text, re.DOTALL).strip()
remained_text_len = len(remained_text)
for _sec, alt, src in matches:
if not src or src.startswith('#') or src not in used_img:
text = text.replace(_sec, alt, 1)
continue
img_src = normalize_url(src, base_url)
if not img_src:
text = text.replace(_sec, alt, 1)
elif remained_text_len > 5 or len(alt) > 2:
_key = f"[img{len(link_dict)+1}]"
link_dict[_key] = img_src
text = text.replace(_sec, alt + _key, 1)
elif any(img_src.endswith(tld) or img_src.endswith(tld + '/') for tld in common_tlds):
_key = f"[img{len(link_dict)+1}]"
link_dict[_key] = img_src
text = text.replace(_sec, alt + _key, 1)
elif any(img_src.endswith(ext) for ext in common_file_exts if ext not in ['jpg', 'jpeg', 'png']):
_key = f"[img{len(link_dict)+1}]"
link_dict[_key] = img_src
text = text.replace(_sec, alt + _key, 1)
else:
if img_src not in to_be_recognized_by_visual_llm:
to_be_recognized_by_visual_llm[img_src] = f"§{len(to_be_recognized_by_visual_llm)+1}§"
_key = f"[img{len(link_dict)+1}]"
link_dict[_key] = img_src
text = text.replace(_sec, to_be_recognized_by_visual_llm[img_src] + _key, 1)
# 处理文本中的"野 url"
url_pattern = r'((?:https?://|www\.)[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[-A-Za-z0-9+&@#/%=~_|])'
matches = re.findall(url_pattern, text)
for url in matches:
url = normalize_url(url, base_url)
_key = f"[{len(link_dict)+1}]"
link_dict[_key] = url
text = text.replace(url, _key, 1)
score += 1
_valid_len = _valid_len - len(url)
# 统计换行符数量
newline_count = text.count(' * ')
score += newline_count
ratio = _valid_len/score if score != 0 else 999
return ratio, text
sections = raw_markdown.split('# ') # use '# ' to avoid # in url
if len(sections) > 2:
_sec = sections[0]
section_remain = re.sub(r'\[.*?]\(.*?\)', '', _sec, re.DOTALL).strip()
section_remain_len = len(section_remain)
total_links = len(re.findall(r'\[.*?]\(.*?\)', _sec, re.DOTALL))
ratio = total_links / section_remain_len if section_remain_len != 0 else 1
if ratio > 0.05:
print('this is a navigation section, will be removed')
print(ratio)
print(section_remain)
print('-' * 50)
sections = sections[1:]
_sec = sections[-1]
section_remain = re.sub(r'\[.*?]\(.*?\)', '', _sec, re.DOTALL).strip()
section_remain_len = len(section_remain)
if section_remain_len < 198:
print('this is a footer section, will be removed')
print(section_remain_len)
print(section_remain)
print('-' * 50)
sections = sections[:-1]
links_parts = []
contents = []
for section in sections:
ratio, text = check_url_text(section)
if ratio < 70:
print('this is a links part')
print(ratio)
print(text)
print('-' * 50)
links_parts.append(text)
else:
print('this is a content part')
print(ratio)
print(text)
print('-' * 50)
contents.append(text)
return link_dict, links_parts, contents

View File

@ -1,10 +1,34 @@
from urllib.parse import urlparse, urljoin
import os
import re
# import jieba
from loguru import logger
def normalize_url(url: str, base_url: str) -> str:
url = url.strip()
if url.startswith(('www.', 'WWW.')):
_url = f"https://{url}"
elif url.startswith('/www.'):
_url = f"https:/{url}"
elif url.startswith("//"):
_url = f"https:{url}"
elif url.startswith(('http://', 'https://')):
_url = url
elif url.startswith('http:/'):
_url = f"http://{url[6:]}"
elif url.startswith('https:/'):
_url = f"https://{url[7:]}"
else:
_url = urljoin(base_url, url)
_ss = _url.split('//')
if len(_ss) == 2:
return '//'.join(_ss)
else:
return _ss[0] + '//' + '/'.join(_ss[1:])
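A few illustrative cases of the intended behavior (the URLs are made up): protocol-relative and schemeless links gain an https scheme, relative paths resolve against the base URL, and stray double slashes after the netloc are collapsed.

```python
# Assumes running from core/, where the helper lives in utils/general_utils.py.
from utils.general_utils import normalize_url

# Protocol-relative and schemeless links get an https scheme.
assert normalize_url('//cdn.example.com/a.png', 'https://news.example.com/post/1') == 'https://cdn.example.com/a.png'
assert normalize_url('www.example.com/page', 'https://news.example.com/post/1') == 'https://www.example.com/page'

# Relative paths are resolved against the base URL.
assert normalize_url('/img/logo.png', 'https://news.example.com/post/1') == 'https://news.example.com/img/logo.png'

# Accidental double slashes after the netloc are collapsed.
assert normalize_url('https://example.com/a//b', 'https://example.com/') == 'https://example.com/a/b'
```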
def isURL(string):
if string.startswith("www."):
string = f"https://{string}"

View File

@ -1,8 +1,9 @@
export LLM_API_KEY=""
export LLM_API_BASE="https://api.siliconflow.cn/v1"
export PRIMARY_MODEL="Qwen/Qwen2.5-32B-Instruct"
#If your source pages are relatively simple with small amounts of information per page, considering cost and time (mainly time), Qwen2.5-32B-Instruct is recommended
#If your source pages contain more links, have complex layouts, and you don't want to miss any information, DeepSeek-V2.5 is recommended
export SECONDARY_MODEL="Qwen/Qwen2.5-7B-Instruct"
#use a secondary model to execute the filtering task for cost saving
#if not set, the primary model will be used to execute the filtering task
export VL_MODEL="OpenGVLab/InternVL2-26B"
export PB_API_AUTH="test@example.com|1234567890" ##your pb superuser account and password

View File

@@ -1,92 +1,90 @@
# -*- coding: utf-8 -*-
-import os, re, sys
+import os, sys
import json
import asyncio
import time
-from prompts import *
from datetime import datetime
-current_dir = os.path.dirname(os.path.abspath(__file__))
-project_root = os.path.dirname(current_dir)  # get parent dir
-sys.path.append(project_root)
-from core.llms.openai_wrapper import openai_llm as llm
+# add the core directory to the Python path
+core_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'core')
+sys.path.append(core_path)
+# modules can now be imported directly because the core directory is on the Python path
+from scrapers import *
+from agents.get_info import pre_process
+from utils.general_utils import is_chinese
+from agents.get_info import get_author_and_publish_date, get_info, get_more_related_urls
+from agents.get_info_prompts import *
benchmark_model = 'Qwen/Qwen2.5-72B-Instruct'
-models = ['Qwen/Qwen2.5-7B-Instruct', 'Qwen/Qwen2.5-14B-Instruct', 'Qwen/Qwen2.5-32B-Instruct', 'deepseek-ai/DeepSeek-V2.5', 'internlm/internlm2_5-20b-chat']
+models = ['Qwen/Qwen2.5-7B-Instruct', 'Qwen/Qwen2.5-14B-Instruct', 'Qwen/Qwen2.5-32B-Instruct', 'deepseek-ai/DeepSeek-V2.5']
-async def main(texts: list[str], link_dict: dict, record_file: str, sys_prompt: str, focus_points: list):
-    # first get more links
-    print(f'sys_prompt: \n{sys_prompt}')
-    benchmark_result = None
+async def main(sample: dict, include_ap: bool, prompts: list, focus_dict: dict, record_file: str):
+    link_dict, links_parts, contents = sample['link_dict'], sample['links_part'], sample['contents']
+    get_link_sys_prompt, get_link_suffix_prompt, get_info_sys_prompt, get_info_suffix_prompt = prompts
    for model in [benchmark_model] + models:
-        _texts = []
-        for text in texts:
-            _texts.extend(text.split('\n\n'))
+        links_texts = []
+        for _parts in links_parts:
+            links_texts.extend(_parts.split('\n\n'))
+        contents = sample['contents'].copy()
        print(f"running {model} ...")
        start_time = time.time()
-        hallucination_times = 0
-        text_batch = ''
-        cache = set()
-        while _texts:
-            t = _texts.pop(0)
-            text_batch = f'{text_batch}{t}\n\n'
-            if len(text_batch) > 512 or len(_texts) == 0:
-                content = f'<text>\n{text_batch}</text>\n\n{get_info_suffix}'
-                result = await llm(
-                    [{'role': 'system', 'content': sys_prompt}, {'role': 'user', 'content': content}],
-                    model=model, temperature=0.1)
-                print(f"llm output\n{result}\n")
-                result = re.findall(r'\"\"\"(.*?)\"\"\"', result, re.DOTALL)
-                if result:
-                    # find all citation markers like [4] in result[-1]
-                    links = re.findall(r'\[\d+\]', result[-1])
-                    for link in links:
-                        if link not in text_batch:
-                            hallucination_times += 1
-                            print(f'\n**not in text_batch: {link}**\n')
-                            continue
-                        cache.add(link)
-                text_batch = ''
-        t1 = time.time()
-        get_infos_time = t1 - start_time
-        print(f"get more infos time: {get_infos_time}")
-        print("*" * 12)
-        print('\n\n')
-        for link in cache:
-            if link not in link_dict:
-                print(f'\n**not in link_dict: {link}**\n')
-        if model == benchmark_model:
-            benchmark_result = cache.copy()
-            diff = 'benchmark'
+        if include_ap:
+            author, publish_date = await get_author_and_publish_date(contents[0], model, test_mode=True)
+            get_ap_time = time.time() - start_time
+            print(f"get author and publish date time: {get_ap_time}")
        else:
-            # compute the difference between the current cache and the benchmark
-            missing_in_cache = len(benchmark_result - cache)  # in benchmark but not in cache
-            extra_in_cache = len(cache - benchmark_result)  # in cache but not in benchmark
+            author, publish_date = '', ''
+            get_ap_time = 0
+        start_time = time.time()
+        more_url = await get_more_related_urls(links_texts, link_dict, [get_link_sys_prompt, get_link_suffix_prompt, model], test_mode=True)
+        get_more_url_time = time.time() - start_time
+        print(f"get more related urls time: {get_more_url_time}")
+        start_time = time.time()
+        infos = await get_info(contents, link_dict, [get_info_sys_prompt, get_info_suffix_prompt, model], focus_dict, author, publish_date, test_mode=True)
+        get_info_time = time.time() - start_time
+        print(f"get info time: {get_info_time}")
+        if model == benchmark_model:
+            benchmark_result = more_url.copy()
+            diff = f'benchmark: {len(benchmark_result)} results'
+        else:
+            missing_in_cache = len(benchmark_result - more_url)  # in benchmark but not in this run
+            extra_in_cache = len(more_url - benchmark_result)  # in this run but not in benchmark
            total_diff = missing_in_cache + extra_in_cache
            diff = f'差异{total_diff}个(遗漏{missing_in_cache}个,多出{extra_in_cache}个)'
-        infos_to_record = '\n'.join(list(set(link_dict[link] for link in cache)))
+        related_urls_to_record = '\n'.join(more_url)
+        infos_to_record = [f"{fi['tag']}: {fi['content']}" for fi in infos]
+        infos_to_record = '\n'.join(infos_to_record)
        with open(record_file, 'a') as f:
-            f.write(f"llm model: {model}\n")
-            f.write(f"process time: {get_infos_time} s\n")
-            f.write(f"bad generate times: {hallucination_times}\n")
+            f.write(f"model: {model}\n")
+            if include_ap:
+                f.write(f"get author and publish date time: {get_ap_time}\n")
+                f.write(f"author: {author}\n")
+                f.write(f"publish date: {publish_date}\n")
+            f.write(f"get more related urls time: {get_more_url_time}\n")
            f.write(f"diff from benchmark: {diff}\n")
-            f.write(f"segments: \n{infos_to_record}\n")
-            f.write("*" * 12)
+            f.write(f"get info time: {get_info_time}\n")
+            f.write(f"related urls: \n{related_urls_to_record}\n")
+            f.write(f"final result: \n{infos_to_record}\n")
            f.write('\n\n')
+        print('\n\n')
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--sample_dir', '-D', type=str, default='')
+    parser.add_argument('--include_ap', '-I', type=bool, default=False)
    args = parser.parse_args()
    sample_dir = args.sample_dir
+    include_ap = args.include_ap
    if not os.path.exists(os.path.join(sample_dir, 'focus_point.json')):
        raise ValueError(f'{sample_dir} focus_point.json not found')
@@ -97,27 +95,43 @@ if __name__ == '__main__':
        expl = item["explanation"]
        focus_statement = f"{focus_statement}//{tag}//\n"
        if expl:
-            focus_statement = f"{focus_statement}解释:{expl}\n"
+            if is_chinese(expl):
+                focus_statement = f"{focus_statement}解释:{expl}\n"
+            else:
+                focus_statement = f"{focus_statement}Explanation: {expl}\n"
+    focus_dict = {item["focuspoint"]: item["focuspoint"] for item in focus_points}
+    date_stamp = datetime.now().strftime('%Y-%m-%d')
+    if is_chinese(focus_statement):
+        get_link_sys_prompt = get_link_system.replace('{focus_statement}', focus_statement)
+        get_link_sys_prompt = f"今天的日期是{date_stamp}{get_link_sys_prompt}"
+        get_link_suffix_prompt = get_link_suffix
+        get_info_sys_prompt = get_info_system.replace('{focus_statement}', focus_statement)
+        get_info_sys_prompt = f"今天的日期是{date_stamp}{get_info_sys_prompt}"
+        get_info_suffix_prompt = get_info_suffix
+    else:
+        get_link_sys_prompt = get_link_system_en.replace('{focus_statement}', focus_statement)
+        get_link_sys_prompt = f"today is {date_stamp}, {get_link_sys_prompt}"
+        get_link_suffix_prompt = get_link_suffix_en
+        get_info_sys_prompt = get_info_system_en.replace('{focus_statement}', focus_statement)
+        get_info_sys_prompt = f"today is {date_stamp}, {get_info_sys_prompt}"
+        get_info_suffix_prompt = get_info_suffix_en
-    get_info_system = get_info_system.replace('{focus_statement}', focus_statement)
-    system_prompt = f"今天的日期是{datetime.now().strftime('%Y-%m-%d')}{get_info_system}"
-    focus_points = [item["focuspoint"] for item in focus_points]
+    prompts = [get_link_sys_prompt, get_link_suffix_prompt, get_info_sys_prompt, get_info_suffix_prompt]
    time_stamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())
    record_file = os.path.join(sample_dir, f'record-{time_stamp}.txt')
    with open(record_file, 'w') as f:
        f.write(f"focus statement: \n{focus_statement}\n\n")
-    for dirs in os.listdir(sample_dir):
-        if not os.path.isdir(os.path.join(sample_dir, dirs)):
+    for file in os.listdir(sample_dir):
+        if not file.endswith('_processed.json'):
            continue
-        _path = os.path.join(sample_dir, dirs)
-        print(f'start testing {_path}')
-        if 'sample.json' not in os.listdir(_path):
-            print(f'{dirs} sample.json not found, skip')
+        sample = json.load(open(os.path.join(sample_dir, file), 'r'))
+        if 'links_part' not in sample or 'link_dict' not in sample or 'contents' not in sample:
+            print(f'{file} not valid sample, skip')
            continue
-        sample = json.load(open(os.path.join(_path, 'sample.json'), 'r'))
        with open(record_file, 'a') as f:
-            f.write(f"raw materials in: {dirs}\n\n")
-        asyncio.run(main(sample['links_part'], sample['link_dict'], record_file, system_prompt, focus_points))
+            f.write(f"raw materials: {file}\n\n")
+        print(f'start testing {file}')
+        asyncio.run(main(sample, include_ap, prompts, focus_dict, record_file))
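
For context, the loop above only picks up files named *_processed.json (the output of the pre-process test further below) and expects each sample to carry link_dict, links_part and contents, plus a focus_point.json in the same directory. A hypothetical minimal sample, just to illustrate the expected shape:

```python
import json

# hypothetical minimal sample in the shape the benchmark loop expects
sample = {
    "link_dict": {"[1]": "https://example.com/news/1.html", "[2]": "https://example.com/news/2.html"},
    "links_part": ["[1]Example headline one\n\n[2]Example headline two"],
    "contents": ["Example article body used for the author/date and get_info steps."]
}
with open("demo_processed.json", "w", encoding="utf-8") as f:
    json.dump(sample, f, indent=4, ensure_ascii=False)
```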

View File

@@ -2,14 +2,16 @@ import os
import sys
import re
-current_dir = os.path.dirname(os.path.abspath(__file__))
-project_root = os.path.dirname(current_dir)  # get the parent directory
-sys.path.append(project_root)
-from core.scrapers.deep_scraper import deep_scraper, common_chars
-from core.scrapers.mp_scraper import mp_scraper
+# add the core directory to the Python path
+core_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'core')
+sys.path.append(core_path)
+# modules can now be imported directly because the core directory is on the Python path
+from scrapers import *
+from agents.get_info import pre_process
def check_url_text(text):
+    common_chars = ',.!;:,;:、一二三四五六七八九十#*@% \t\n\r|*-_…>#'
    print(f"processing: {text}")
    left_bracket = text.find('[')
    right_paren = text.rfind(')')
@@ -56,25 +58,75 @@ def check_url_text(text):
    for match in matches:
        print(match)
+async def main(html_sample, record_file):
+    recognized_img_cache = {}
+    parsed_url = urlparse(html_sample['url'])
+    domain = parsed_url.netloc
+    if domain in custom_scrapers:
+        result = custom_scrapers[domain](html_sample)
+        raw_markdown = result.content
+        used_img = result.images
+        title = result.title
+        base_url = result.base
+        author = result.author
+        publish_date = result.publish_date
+    else:
+        raw_markdown = html_sample['markdown']
+        media_dict = html_sample['media'] if html_sample['media'] else {}
+        used_img = [d['src'] for d in media_dict.get('images', [])]
+        title = ''
+        base_url = ''
+        author = ''
+        publish_date = ''
+    if not raw_markdown:
+        print(f"no raw_markdown for {file}")
+        return
+    if not title:
+        title = html_sample.get('title', '')
+    if not base_url:
+        base_url = html_sample.get('base', '')
+    if not base_url:
+        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
+        if not base_url.endswith('/'):
+            base_url = base_url.rsplit('/', 1)[0] + '/'
+    if not author:
+        author = html_sample.get('author', '')
+    if not publish_date:
+        publish_date = html_sample.get('publish_date', '')
+    link_dict, links_parts, contents, recognized_img_cache = await pre_process(raw_markdown, base_url, used_img, recognized_img_cache, test_mode=True)
+    result = {
+        "link_dict": link_dict,
+        "links_part": links_parts,
+        "contents": contents,
+    }
+    with open(record_file, 'w', encoding='utf-8') as f:
+        json.dump(result, f, indent=4, ensure_ascii=False)
+    print(f"pre process done, saved to {record_file}")
if __name__ == '__main__':
    import argparse
-    import time
    import json
    from urllib.parse import urlparse
+    import asyncio
    parser = argparse.ArgumentParser()
    parser.add_argument('--test_file', '-F', type=str, default='')
    parser.add_argument('--sample_dir', '-D', type=str, default='')
-    parser.add_argument('--test_string', '-T', type=str, default='')
+    parser.add_argument('--record_folder', '-R', type=str, default='')
    args = parser.parse_args()
-    if args.test_string:
-        check_url_text(args.test_string)
-        exit()
    test_file = args.test_file
    sample_dir = args.sample_dir
+    record_folder = args.record_folder
+    if record_folder:
+        os.makedirs(record_folder, exist_ok=True)
    files = []
    if test_file:
        files.append(test_file)
@@ -84,43 +136,9 @@ if __name__ == '__main__':
    for file in files:
        if not file.endswith('.json'): continue
        print(f"processing {file} ...")
-        try:
-            with open(file, 'r') as f:
-                html_sample = json.load(f)
-            _url = html_sample['url']
-            if _url.startswith('https://mp.weixin.qq.com'):
-                result = mp_scraper(html_sample)
-                raw_markdown = result.content
-                used_img = result.images
-            else:
-                raw_markdown = html_sample['markdown']
-                used_img = [d['src'] for d in html_sample['media']['images']]
-        except Exception as e:
-            print('sample format error, try to use craw4ai_fething.py to get sample')
-            print(f"error: {e}")
-            continue
-        parsed_url = urlparse(_url)
-        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
-        if not base_url.endswith('/'):
-            # if the path does not end with /, drop the last path segment
-            base_url = base_url.rsplit('/', 1)[0] + '/'
-        time_start = time.time()
-        link_dict, links_part, contents = deep_scraper(raw_markdown, base_url, used_img)
-        time_end = time.time()
-        #print(f"time cost for html: {time_end - time_start}s")
-        result = {
-            "link_dict": link_dict,
-            "links_part": links_part,
-            "contents": contents,
-        }
-        record_folder = file.replace('.json', '')
-        os.makedirs(record_folder, exist_ok=True)
-        with open(os.path.join(record_folder, 'sample.json'), 'w', encoding='utf-8') as f:
-            json.dump(result, f, indent=4, ensure_ascii=False)
-        #print("done")
-        #print("*" * 12)
+        with open(file, 'r') as f:
+            html_sample = json.load(f)
+        record_file = os.path.join(record_folder, f'{os.path.basename(file)}_processed.json')
+        asyncio.run(main(html_sample, record_file))

View File

@@ -1,94 +0,0 @@
get_info_system = '''你将被给到一段使用<text></text>标签包裹的网页文本,你的任务是从前到后仔细阅读文本,提取出与如下任一关注点相关的原文片段。关注点及其解释如下:
{focus_statement}\n
在进行提取时请遵循以下原则
- 理解关注点的含义以及进一步的解释如有确保提取的内容与关注点强相关并符合解释如有的范围
- 在满足上面原则的前提下提取出全部可能相关的片段
- 提取出的原文片段务必保留类似"[3]"这样的引用标记后续的处理需要用到这些引用标记'''
get_info_suffix = '''请逐条输出提取的原文片段,并整体用三引号包裹。三引号内除了提取出的原文片段外不要有其他内容,如果文本中不包含任何与关注点相关的内容则保持三引号内为空。
如下是输出格式示例
"""
原文片段1
原文片段2
...
"""'''
text_info_system = '''你将被给到一段使用<text></text>标签包裹的网页文本,请分别按如下关注点对网页文本提炼摘要。关注点列表及其解释如下:
{focus_statement}\n
在提炼摘要时请遵循以下原则
- 理解每个关注点的含义以及进一步的解释如有确保摘要与关注点强相关并符合解释如有的范围
- 摘要应当详实充分
- 摘要信息务必忠于原文'''
text_info_suffix = '''请对关注点逐一生成摘要,不要遗漏任何关注点,如果网页文本与关注点无关,可以对应输出"NA"。输出结果整体用三引号包裹,三引号内不要有其他内容。如下是输出格式示例:
"""
//关注点1//
摘要1
//关注点2//
摘要2
//关注点3//
NA
...
"""'''
text_link_system = '''你将被给到数行格式为"<编号>//内容//"的文本,你的任务是逐条分析这些文本,并分别与如下关注点之一相关联。关注点列表及其解释如下:
{focus_statement}\n
在进行关联分析时请遵循以下原则
- 理解每个关注点的含义
- 如果关注点有进一步的解释确保提取的内容符合这些解释的范围'''
text_link_suffix = '''请分行逐条输出结果,每一条的输出格式为"<编号>//关注点名称//",如果某条内容不与任何关注点相关,请输出"<编号>//NA//"。输出结果整体用三引号包裹,三引号内不要有其他内容。如下是输出格式示例:
"""
<t1>//关注点1名称//
<t2>//关注点2名称//
<t3>//NA//
...
"""'''
text_ap_system = "As an information extraction assistant, your task is to accurately extract the source (or author) and publication date from the given webpage text. It is important to adhere to extracting the information directly from the original text. If the original text does not contain a particular piece of information, please replace it with NA"
text_ap_suffix = '''Please output the extracted information in the following format(output only the result, no other content):
"""source or article author (use "NA" if this information cannot be extracted)//extracted publication date (keep only the year, month, and day; use "NA" if this information cannot be extracted)"""'''
verified_system = '''判断给定的信息是否与网页文本相符。信息将用标签<info></info>包裹,网页文本则用<text></text>包裹。请遵循如下工作流程:
1尝试找出网页文本中所有与信息对应的文本片段可能有多处
2基于这些片段给出是否相符的最终结论最终结论仅为'''
verified_suffix = '先输出找到的所有文本片段,再输出最终结论(仅为是或否)'
image_info_system = '''作为信息提取助手,你的任务是从给定的网页截屏中提取与以下用户兴趣点相关的内容。兴趣点列表及其解释如下:
{focus_statement}\n
在进行信息提取时请遵循以下原则
- 理解每个兴趣点的含义确保提取的内容与之相关
- 如果兴趣点有进一步的解释确保提取的内容符合这些解释的范围
- 忠于原文你的任务是从网页截屏中识别和提取与各个兴趣点相关的信息并不是总结和提炼'''
image_info_suffix = '''如果网页截屏中包含兴趣点相关的内容请按照以下json格式输出提取的信息文本中可能包含多条有用信息请不要遗漏
[{"focus": 兴趣点名称, "content": 提取的内容}]
示例
[{"focus": "旅游景点", "content": "北京故宫地址北京市东城区景山前街4号开放时间8:30-17:00"}, {"focus": "美食推荐", "content": "来王府井小吃街必吃北京烤鸭、炸酱面"}]
如果截屏中不包含任何与兴趣点相关的信息或者你判断这是一个文章列表页面请仅输出[]'''
image_link_system = "作为一位高效的信息筛选助手,你的任务是根据给定的兴趣点,从给定的网页截屏中挑选出最值得关注的链接推荐给用户进一步点击查看。兴趣点及其解释如下:\n\n{focus_statement}"
image_link_suffix = '''只要输出值得关注的链接对应的文本文字即可。按一行一条的格式输出,最终输出的列表整体用三引号包裹,三引号内不要有其他内容,如下是输出格式示例:
"""
链接文字1
链接文字2
...
"""'''
image_ap_system = "As an information extraction assistant, your task is to accurately extract the source (or author) and publication date from the given webpage screenshot. If the screenshot does not contain a particular piece of information, please replace it with NA"
image_ap_suffix = '''Please output the extracted information in the following JSON format:
{"source": source or article author (use "NA" if this information cannot be found), "publish_date": publication date (keep only the year, month, and day; use "NA" if this information cannot be found)}'''
image_system = "提取图片中的所有文字如果图片不包含文字或者文字很少或者你判断图片仅是网站logo、商标、图标等则输出NA。注意请仅输出提取出的文字不要输出别的任何内容。"
image_system_en = "Extract all text from the image. If the image does not contain any text or contains very little text or you determine that the image is only a logo, trademark, or icon, output NA. Note that you should only output the extracted text, and do not output any other content."