From 77c3914d127aac13c90103f11c7a499286aef34c Mon Sep 17 00:00:00 2001
From: bigbrother666sh
Date: Thu, 16 Jan 2025 10:56:57 +0800
Subject: [PATCH] method to separate links area from content

---
 core/llms/openai_wrapper.py   |   2 +-
 core/scrapers/deep_scraper.py | 108 +++++++++++++-------
 core/scrapers/mp_scraper.py   |  24 ++++-
 test/deep_scraper_test.py     |  18 +---
 test/get_info_test.py         | 185 +++++++---------------------
 test/prompts.py               |  20 ++--
 6 files changed, 143 insertions(+), 214 deletions(-)

diff --git a/core/llms/openai_wrapper.py b/core/llms/openai_wrapper.py
index c632c1e..4b40cf5 100644
--- a/core/llms/openai_wrapper.py
+++ b/core/llms/openai_wrapper.py
@@ -54,7 +54,7 @@ async def openai_llm(messages: list, model: str, logger=None, **kwargs) -> str:
     finally:
         semaphore.release()

-    if logger:
+    if logger and resp:
         logger.debug(f'result:\n {response.choices[0]}')
         logger.debug(f'usage:\n {response.usage}')
     return resp
diff --git a/core/scrapers/deep_scraper.py b/core/scrapers/deep_scraper.py
index b9135b6..a55bac7 100644
--- a/core/scrapers/deep_scraper.py
+++ b/core/scrapers/deep_scraper.py
@@ -49,34 +49,34 @@ def normalize_url(url: str, base_url: str) -> str:
         return _ss[0] + '//' + '/'.join(_ss[1:])


-def deep_scraper(raw_markdown: str, base_url: str, used_img: list[str]) -> tuple[dict, list[str], dict]:
+def deep_scraper(raw_markdown: str, base_url: str, used_img: list[str]) -> tuple[dict, list[str], list[str]]:
     link_dict = {}
     to_be_recognized_by_visual_llm = {}

-    def check_url_text(text):
-        # text = text.strip()
-        # for special url formate from crawl4ai 0.4.247
-        text = re.sub(r'', '', text).strip()
+    # for special url formate from crawl4ai 0.4.247
+    raw_markdown = re.sub(r'', '', raw_markdown).strip()

-        # 处理图片标记 ![alt](src)
-        img_pattern = r'(!\[(.*?)\]\((.*?)\))'
-        matches = re.findall(img_pattern, text)
-        for _sec,alt, src in matches:
-            # 替换为新格式 §alt||src§
-            text = text.replace(_sec, f'§{alt}||{src}§', 1)
-
+    # 处理图片标记 ![alt](src)
+    i_pattern = r'(!\[(.*?)\]\((.*?)\))'
+    matches = re.findall(i_pattern, raw_markdown, re.DOTALL)
+    for _sec, alt, src in matches:
+        # 替换为新格式 §alt||src§
+        raw_markdown = raw_markdown.replace(_sec, f'§{alt}||{src}§', 1)
+
+    def check_url_text(text) -> tuple[int, str]:
+        score = 0
+        _valid_len = len(text.strip())
         # 找到所有[part0](part1)格式的片段
         link_pattern = r'(\[(.*?)\]\((.*?)\))'
-        matches = re.findall(link_pattern, text)
+        matches = re.findall(link_pattern, text, re.DOTALL)
         for _sec, link_text, link_url in matches:
-            print("found link sec:", _sec)
             # 处理 \"***\" 格式的片段
             quote_pattern = r'\"(.*?)\"'
             # 提取所有引号包裹的内容
-            _title = ''.join(re.findall(quote_pattern, link_url))
+            _title = ''.join(re.findall(quote_pattern, link_url, re.DOTALL))
             # 分离§§内的内容和后面的内容
             img_marker_pattern = r'§(.*?)\|\|(.*?)§'
-            inner_matches = re.findall(img_marker_pattern, link_text)
+            inner_matches = re.findall(img_marker_pattern, link_text, re.DOTALL)
             for alt, src in inner_matches:
                 link_text = link_text.replace(f'§{alt}||{src}§', '')
             link_text = link_text.strip()
@@ -113,20 +113,21 @@ def deep_scraper(raw_markdown: str, base_url: str, used_img: list[str]) -> tuple
                 link_text = img_alt

             real_url_pattern = r'<(.*?)>'
-            real_url = re.search(real_url_pattern, link_url)
+            real_url = re.search(real_url_pattern, link_url, re.DOTALL)
             if real_url:
                 _url = real_url.group(1).strip()
             else:
-                _url = re.sub(quote_pattern, '', link_url).strip()
+                _url = re.sub(quote_pattern, '', link_url, re.DOTALL).strip()

             if not _url or _url.startswith(('#', 'javascript:')):
                 text = text.replace(_sec, link_text, 1)
                 continue
+            score += 1
+            _valid_len = _valid_len - len(_sec)
             url = normalize_url(_url, base_url)
             _key = f"[{len(link_dict)+1}]"
             link_dict[_key] = url
             text = text.replace(_sec, link_text + _key, 1)
-
             # 检查链接是否是常见文件类型或顶级域名
             # todo: 最后提取是否添加到 more_link时或者主流程时再处理
             """
@@ -137,17 +138,17 @@ def deep_scraper(raw_markdown: str, base_url: str, used_img: list[str]) -> tuple
             """
         # 处理文本中的其他图片标记
         img_pattern = r'(§(.*?)\|\|(.*?)§)'
-        matches = re.findall(img_pattern, text)
-        remained_text = re.sub(img_pattern, '', text).strip()
-        remained_text_len = len(remained_text )
+        matches = re.findall(img_pattern, text, re.DOTALL)
+        remained_text = re.sub(img_pattern, '', text, re.DOTALL).strip()
+        remained_text_len = len(remained_text)
         for _sec, alt, src in matches:
-            if not src or src.startswith('#'):
+            if not src or src.startswith('#') or src not in used_img:
                 text = text.replace(_sec, alt, 1)
                 continue
             img_src = normalize_url(src, base_url)
             if not img_src:
                 text = text.replace(_sec, alt, 1)
-            elif src not in used_img or remained_text_len > 5 or len(alt) > 2:
+            elif remained_text_len > 5 or len(alt) > 2:
                 _key = f"[img{len(link_dict)+1}]"
                 link_dict[_key] = img_src
                 text = text.replace(_sec, alt + _key, 1)
@@ -165,7 +166,6 @@ def deep_scraper(raw_markdown: str, base_url: str, used_img: list[str]) -> tuple
                 _key = f"[img{len(link_dict)+1}]"
                 link_dict[_key] = img_src
                 text = text.replace(_sec, to_be_recognized_by_visual_llm[img_src] + _key, 1)
-
         # 处理文本中的"野 url"
         url_pattern = r'((?:https?://|www\.)[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[-A-Za-z0-9+&@#/%=~_|])'
         matches = re.findall(url_pattern, text)
@@ -174,22 +174,52 @@ def deep_scraper(raw_markdown: str, base_url: str, used_img: list[str]) -> tuple
             _key = f"[{len(link_dict)+1}]"
             link_dict[_key] = url
             text = text.replace(url, _key, 1)
+            score += 1
+            _valid_len = _valid_len - len(url)
+        # 统计换行符数量
+        newline_count = text.count(' * ')
+        score += newline_count
+        ratio = _valid_len/score if score != 0 else 999

-        return text
+        return ratio, text

     sections = raw_markdown.split('# ')  # use '# ' to avoid # in url
-    texts = []
-    for i, section in enumerate(sections):
-        # filter the possible navigate section and footer section
-        section_remain = re.sub(r'\[.*?]\(.*?\)', '', section).strip()
+    if len(sections) > 2:
+        _sec = sections[0]
+        section_remain = re.sub(r'\[.*?]\(.*?\)', '', _sec, re.DOTALL).strip()
         section_remain_len = len(section_remain)
-        total_links = len(re.findall(r'\[.*?]\(.*?\)', section))
-        print(f"section {i}")
-        print(f"ratio: {total_links/section_remain_len}")
+        total_links = len(re.findall(r'\[.*?]\(.*?\)', _sec, re.DOTALL))
+        ratio = total_links / section_remain_len if section_remain_len != 0 else 1
+        if ratio > 0.05:
+            print('this is a navigation section, will be removed')
+            print(ratio)
+            print(section_remain)
+            print('-' * 50)
+            sections = sections[1:]

+        _sec = sections[-1]
+        section_remain = re.sub(r'\[.*?]\(.*?\)', '', _sec, re.DOTALL).strip()
+        section_remain_len = len(section_remain)
+        if section_remain_len < 198:
+            print('this is a footer section, will be removed')
+            print(section_remain_len)
+            print(section_remain)
+            print('-' * 50)
+            sections = sections[:-1]
-        processed_p = [check_url_text(p) for p in section.split('\n\n')]
-        processed_p = [p for p in processed_p if p.strip()]
-        texts.append('\n\n'.join(processed_p))
-
-    return link_dict, texts, to_be_recognized_by_visual_llm
-
\ No newline at end of file
+    links_parts = []
+    contents = []
+    for section in sections:
+        ratio, text = check_url_text(section)
+        if ratio < 70:
+            print('this is a links part')
+            print(ratio)
+            print(text)
+            print('-' * 50)
+            links_parts.append(text)
+        else:
+            print('this is a content part')
+            print(ratio)
+            print(text)
+            print('-' * 50)
+            contents.append(text)
+    return link_dict, links_parts, contents
diff --git a/core/scrapers/mp_scraper.py b/core/scrapers/mp_scraper.py
index 8957a39..0dfff67 100644
--- a/core/scrapers/mp_scraper.py
+++ b/core/scrapers/mp_scraper.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 from bs4 import BeautifulSoup
 import re
 from crawl4ai import CrawlResult
@@ -12,10 +14,21 @@ text_elements = {
 }


-def mp_scraper(fetch_result: CrawlResult) -> ScraperResultData:
-    url = fetch_result.url
-    raw_html = fetch_result.html
-    cleaned_html = fetch_result.cleaned_html
+def mp_scraper(fetch_result: CrawlResult | dict) -> ScraperResultData:
+    if isinstance(fetch_result, dict):
+        url = fetch_result['url']
+        raw_html = fetch_result['html']
+        cleaned_html = fetch_result['cleaned_html']
+        raw_markdown = fetch_result['markdown']
+        media = fetch_result['media']['images']
+    elif isinstance(fetch_result, CrawlResult):
+        url = fetch_result.url
+        raw_html = fetch_result.html
+        cleaned_html = fetch_result.cleaned_html
+        raw_markdown = fetch_result.markdown
+        media = fetch_result.media['images']
+    else:
+        raise TypeError('fetch_result must be a CrawlResult or a dict')

     content = ''
     images = []
@@ -232,7 +245,8 @@ def mp_scraper(fetch_result: CrawlResult) -> ScraperResultData:
         else:
             author = None
             publish_date = None
-            content = fetch_result['markdown']
+            content = raw_markdown
+            images = [d['src'] for d in media]

     elif num_sub_divs >= 2:
         # 2.2 如果包含两个及以上子块
diff --git a/test/deep_scraper_test.py b/test/deep_scraper_test.py
index 8a4435e..d7cb1f0 100644
--- a/test/deep_scraper_test.py
+++ b/test/deep_scraper_test.py
@@ -85,26 +85,18 @@ if __name__ == '__main__':
     for file in files:
         if not file.endswith('.json'):
             continue
-        #print(f"processing {file} ...")
+        print(f"processing {file} ...")
         try:
             with open(file, 'r') as f:
                 html_sample = json.load(f)
             _url = html_sample['url']
             if _url.startswith('https://mp.weixin.qq.com'):
                 result = mp_scraper(html_sample)
-                #print(f'url: {result.url}')
-                #print(f'content: {result.content}')
-                #print(f'links: {result.links}')
-                #print(f'author: {result.author}')
-                #print(f'publish_date: {result.publish_date}')
-                #print(f'images: {len(result.images)}')
-                #for img in result.images:
-                #    print(img)
                 raw_markdown = result.content
                 used_img = result.images
             else:
                 raw_markdown = html_sample['markdown']
-                used_img = {d['src']: d['alt'] for d in html_sample['media']['images']}
+                used_img = [d['src'] for d in html_sample['media']['images']]
         except Exception as e:
             print('sample format error, try to use craw4ai_fething.py to get sample')
             print(f"error: {e}")
@@ -117,14 +109,14 @@ if __name__ == '__main__':
             base_url = base_url.rsplit('/', 1)[0] + '/'

         time_start = time.time()
-        link_dict, texts, to_be_recognized_by_visual_llm = deep_scraper(raw_markdown, base_url, used_img)
+        link_dict, links_part, contents = deep_scraper(raw_markdown, base_url, used_img)
         time_end = time.time()
         #print(f"time cost for html: {time_end - time_start}s")

         result = {
             "link_dict": link_dict,
-            "texts": texts,
-            "to_be_recognized_by_visual_llm": to_be_recognized_by_visual_llm,
+            "links_part": links_part,
+            "contents": contents,
         }
         record_folder = file.replace('.json', '')
         os.makedirs(record_folder, exist_ok=True)
diff --git a/test/get_info_test.py b/test/get_info_test.py
index f82d073..a3bec50 100644
--- a/test/get_info_test.py
+++ b/test/get_info_test.py
@@ -4,168 +4,62 @@
 import json
 import asyncio
 import time
 from prompts import *
-# prompt 要加上今天是…………
+from datetime import datetime
+
 current_dir = os.path.dirname(os.path.abspath(__file__))
 project_root = os.path.dirname(current_dir)  # get parent dir
 sys.path.append(project_root)

 from core.llms.openai_wrapper import openai_llm as llm

-models = ['Qwen/Qwen2.5-14B-Instruct', 'Qwen/Qwen2.5-32B-Instruct', 'deepseek-ai/DeepSeek-V2.5', 'Qwen/Qwen2.5-72B-Instruct']
+models = ['Qwen/Qwen2.5-7B-Instruct', 'Qwen/Qwen2.5-14B-Instruct', 'Qwen/Qwen2.5-32B-Instruct', 'deepseek-ai/DeepSeek-V2.5']

-async def main(link_dict: dict, text: str, record_file: str, prompts: list, focus_points: list):
+async def main(texts: list[str], record_file: str, sys_prompt: str, focus_points: list):
     # first get more links
-    _to_be_processed = []
-    link_map = {}
-    for i, (url, des) in enumerate(link_dict.items()):
-        des = des.replace('\n', ' ')
-        _to_be_processed.append(f'//{des}//')
-        link_map[f' 2048:
-            content = f'\n{text_batch}\n\n{text_link_suffix}'
+    cache = []
+    while _texts:
+        t = _texts.pop(0)
+        text_batch = f'{text_batch}{t}# '
+        if len(text_batch) > 100 or len(_texts) == 0:
+            content = f'\n{text_batch}\n\n{get_info_suffix}'
             result = await llm(
-                [{'role': 'system', 'content': prompts[0]}, {'role': 'user', 'content': content}],
+                [{'role': 'system', 'content': sys_prompt}, {'role': 'user', 'content': content}],
                 model=model, temperature=0.1)
-            print(f"llm output\n{result}")
+            #print(f"llm output\n{result}")
             text_batch = ''
             result = re.findall(r'\"\"\"(.*?)\"\"\"', result, re.DOTALL)
-            result = result[-1]
-            for item in result.split('\n'):
-                if not item:
-                    continue
-                segs = item.split('>')
-                if len(segs) != 2:
-                    get_more_links_hallucination_times += 1
-                    continue
-                _index, focus = segs
-                _index = _index.strip()
-                focus = focus.strip().strip('//')
-                if focus == 'NA':
-                    continue
-                if focus not in focus_points or _index not in link_map:
-                    get_more_links_hallucination_times += 1
-                    continue
-                more_links.add(link_map[_index])
-
-    if text_batch:
-        content = f'\n{text_batch}\n\n{text_link_suffix}'
-        result = await llm(
-            [{'role': 'system', 'content': prompts[0]}, {'role': 'user', 'content': content}],
-            model=model, temperature=0.1)
-        print(f"llm output\n{result}")
-        result = re.findall(r'\"\"\"(.*?)\"\"\"', result, re.DOTALL)
-        result = result[-1]
-        for item in result.split('\n'):
-            if not item:
-                continue
-            segs = item.split('>')
-            if len(segs) != 2:
-                get_more_links_hallucination_times += 1
-                continue
-            _index, focus = segs
-            _index = _index.strip()
-            focus = focus.strip().strip('//')
-            if focus == 'NA':
-                continue
-            if focus not in focus_points or _index not in link_map:
-                get_more_links_hallucination_times += 1
-                continue
-            more_links.add(link_map[_index])
+            if result: cache.append(result[-1])

-    t1 = time.time()
-    get_more_links_time = t1 - start_time
-    print(f"get more links time: {get_more_links_time}")
-
-    # second get more infos
-    lines = text.split('\n')
-    cache = set()
-    text_batch = ''
-    for line in lines:
-        text_batch = f'{text_batch}{line}\n'
-        if len(text_batch) > 5000:
-            #print(f"text_batch\n{text_batch}")
-            content = f'\n{text_batch}\n\n{text_info_suffix}'
-            result = await llm(
-                [{'role': 'system', 'content': prompts[1]}, {'role': 'user', 'content': content}],
-                model=model, temperature=0.1)
-            print(f"llm output\n{result}")
-            result = re.findall(r'\"\"\"(.*?)\"\"\"', result, re.DOTALL)
-            cache.add(result[-1])
-
-    get_infos_hallucination_times = 0
     infos = []
     for item in cache:
         segs = item.split('//')
-        i = 0
-        while i < len(segs) - 1:
-            focus = segs[i].strip()
-            if not focus:
-                i += 1
-                continue
-            if focus not in focus_points:
-                get_infos_hallucination_times += 1
-                i += 1
-                continue
-            content = segs[i+1].strip().strip('摘要').strip(':').strip(':')
-            i += 2
-            if content and content != 'NA':
-                infos.append(f'{focus}: {content}')
-    """
-    maybe can use embedding retrieval to judge
-    """
-    t2 = time.time()
-    get_infos_time = t2 - t1
+        infos.extend([s.strip() for s in segs if s.strip()])
+    for content in infos:
+        if content not in judge_text:
+            print(f'not in raw content:\n{content}')
+            hallucination_times += 1
+
+    t1 = time.time()
+    get_infos_time = t1 - start_time
     print(f"get more infos time: {get_infos_time}")
-
-    # get author and publish date from text
-    if len(text) > 1024:
-        usetext = f'{text[:500]}......{text[-500:]}'
-    else:
-        usetext = text
-    content = f'\n{usetext}\n\n\n{text_ap_suffix}'
-    llm_output = await llm([{'role': 'system', 'content': text_ap_system}, {'role': 'user', 'content': content}],
-                           model=model, max_tokens=50, temperature=0.1)
-    print(f"llm output: {llm_output}")
-    ap_ = llm_output.strip().strip('"')
-
     print("*" * 12)
     print('\n\n')

-    more_links_to_record = [f'{link_dict[link]}:{link}' for link in more_links]
-    more_links_to_record = '\n'.join(more_links_to_record)
     infos_to_record = '\n'.join(infos)
     with open(record_file, 'a') as f:
         f.write(f"llm model: {model}\n")
-        f.write(f"get more links time: {get_more_links_time} s\n")
-        f.write(f"bad generate times during get more links: {get_more_links_hallucination_times}\n")
-        f.write(f"get more infos time: {get_infos_time} s\n")
-        f.write(f"bad generate times during get more infos: {get_infos_hallucination_times}\n")
-        f.write(f"total more links: {len(more_links)}\n")
-        f.write(f"total infos: {len(infos)}\n")
-        f.write(f"author and publish time: {ap_}\n")
-        f.write(f"infos: \n{infos_to_record}\n")
-        f.write(f"more links: \n{more_links_to_record}\n")
+        f.write(f"process time: {get_infos_time} s\n")
+        f.write(f"bad generate times: {hallucination_times}\n")
+        f.write(f"total segments: {len(infos)}\n")
+        f.write(f"segments: \n{infos_to_record}\n")
         f.write("*" * 12)
         f.write('\n\n')

@@ -190,9 +84,8 @@ if __name__ == '__main__':
         if expl:
             focus_statement = f"{focus_statement}解释:{expl}\n"

-    get_info_system = text_info_system.replace('{focus_statement}', focus_statement)
-    get_link_system = text_link_system.replace('{focus_statement}', focus_statement)
-    prompts = [get_link_system, get_info_system]
+    get_info_system = get_info_system.replace('{focus_statement}', focus_statement)
+    system_prompt = f"今天的日期是{datetime.now().strftime('%Y-%m-%d')},{get_info_system}"

     focus_points = [item["focuspoint"] for item in focus_points]
     time_stamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())
@@ -205,17 +98,11 @@ if __name__ == '__main__':
             continue
         _path = os.path.join(sample_dir, dirs)
         print(f'start testing {_path}')
-        if 'sample_recognized.json' not in os.listdir(_path):
-            print(f'{dirs} sample_recognized.json not found, use sample.json instead')
-            if 'sample.json' not in os.listdir(_path):
-                print(f'{dirs} sample.json not found, skip')
-                continue
-            sample_recognized = json.load(open(os.path.join(_path, 'sample.json'), 'r'))
-        else:
-            sample_recognized = json.load(open(os.path.join(_path, 'sample_recognized.json'), 'r'))
-
-        link_dict = sample_recognized['link_dict']
-        text = sample_recognized['text']
+        if 'sample.json' not in os.listdir(_path):
+            print(f'{dirs} sample.json not found, skip')
+            continue
+        sample = json.load(open(os.path.join(_path, 'sample.json'), 'r'))
+
         with open(record_file, 'a') as f:
             f.write(f"raw materials in: {dirs}\n\n")
-        asyncio.run(main(link_dict, text, record_file, prompts, focus_points))
+        asyncio.run(main(sample['texts'], record_file, system_prompt, focus_points))
diff --git a/test/prompts.py b/test/prompts.py
index 4cc8caa..0622f3c 100644
--- a/test/prompts.py
+++ b/test/prompts.py
@@ -1,15 +1,21 @@
-get_info_system = '''你将被给到一段使用标签包裹的网页文本,你的任务是从前到后仔细阅读文本,并提取出所有与如下关注点之一相关的部分。关注点列表及其解释如下:
+get_info_system = '''你将被给到一段使用标签包裹的网页文本,你的任务是从前到后仔细阅读文本,并摘抄与如下关注点相关的原文片段。关注点及其解释如下:
 {focus_statement}\n
 在进行提取时,请遵循以下原则:
-- 理解每个关注点的含义以及进一步的解释(如有),确保提取的内容与关注点强相关并符合解释(如有)的范围
-- 有必要的话,可以连同相关的上下文一并提取,从而保证提取出的内容信息完备、意思完整'''
+- 理解关注点的含义以及进一步的解释(如有),确保提取的内容与关注点强相关并符合解释(如有)的范围
+- 在满足上面原则的前提下,摘抄出全部相关片段
+- 摘抄出的原文片段务必保持原文原样,包括标点符号都不要更改,尤其注意保留类似"[3]"这样的引用标记'''

-get_info_suffix = '''如果网页文本中包含关注点相关的部分,请按照以下json格式输出:
-"""{"focus": 关注点, "content": 提取的内容}"""
-
-如果有多个相关部分,请逐条输出,每一条都用三引号包裹,三引号内不要有其他内容。'''
+get_info_suffix = '''请将摘抄出的原文片段用"//"分隔,并整体用三引号包裹后输出。三引号内不要有其他内容,如果文本中不包含任何与关注点相关的内容则保持三引号内为空。
+如下是输出格式示例::
+"""
+原文片段1
+//
+原文片段2
+//
+...
+"""'''

 text_info_system = '''你将被给到一段使用标签包裹的网页文本,请分别按如下关注点对网页文本提炼摘要。关注点列表及其解释如下: