deep scraper

This commit is contained in:
bigbrother666sh 2025-01-02 10:14:33 +08:00
parent ae7b5d7f65
commit dc8391c357
32 changed files with 5840 additions and 742 deletions

View File

@ -263,6 +263,9 @@ text2
return cache
async def _extract_info_from_img(self, text, link_dict) -> str:
if not self.vl_model:
self.logger.warning("vl model not found, skip extracting info from img")
return text
cache = {}
pattern = r'<img>\[url\d+\]'
matches = re.findall(pattern, text)
@ -294,7 +297,6 @@ text2
final_result = set()
for item in raw_result:
if '[url' not in item:
self.logger.warning(f"bad generate result: {item}")
continue
url_tags = re.findall(r'\[url\d+]', item)
if not url_tags:

View File

@ -60,7 +60,8 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
context.page.on('dialog', handle_dialog)
context.log.info('successfully finish fetching')
wiseflow_logger.info(context.request.url)
html = await context.page.inner_html('head')
soup = BeautifulSoup(html, 'html.parser')
web_title = soup.find('title')

View File

@ -5,4 +5,4 @@ pydantic
json_repair==0.*
beautifulsoup4
requests
crawlee[playwright]
crawl4ai

View File

@ -2,7 +2,18 @@
## 概述
wiseflow 致力于利用大模型的理解和分析能力自动化爬虫与信息提取流程,即所谓的“爬查一体”架构, 因此 scraper 在 wiseflow 产品语境内与传统定位稍有不同,在这里它仅是指将爬虫获取的已渲染的 html 转化为便于大模型“阅读和理解”的数据前处理过程。
scraper 在 wiseflow 产品语境中指从已渲染的 html 提取出便于大模型“阅读和理解”的部分的过程。虽然 wiseflow 的产品逻辑是“爬查一体”,即致力于用大模型的理解和分析能力直接从 html 编码中提取出用户需要的信息,但实际操作中我们发现:
- 直接把 html 编码送给大模型哪怕是已经去除了不必要的标签依然会带有大量的干扰信息比如列表页面中的summary容易被 llm 提取为 info这样后续再进入详情页面就可能造成重复提取再比如文章页面中的链接 list也很容易被 llm 提取为 info。而判定一个页面是列表页面还是文章页面这非常困难且很多新闻类网站喜欢一半是文章正文一半是“阅读更多”的列表页面这样的布局这种情况下无论是舍弃正文还是舍弃列表都是比较粗糙的方案
- 对于使用视觉大模型进行布局分析,虽然理论上可以解决上述问题,但实际操作中,获取无干扰的网页截图并不容易,需要考虑各种情况,比如 cookie 警告等(很多 cookie 警告并不是简单的弹窗)、窗口大小设定以及因此带来的可能的多轮处理等;
另外,在送入大模型之前,先进行一道“提纯”处理还能够带来处理速度以及 token 消耗费用上的好处。因此 scraper 是十分有必要的。
它的输入是已渲染的 html 编码(配合网站的 base_url),输出是三个值:

View File

@ -1,290 +0,0 @@
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import os
def general_scraper(html: str, base_url: str) -> tuple[dict, dict, str]:
    """Convert rendered HTML into an LLM-friendly representation.

    Args:
        html: rendered HTML of the page body.
        base_url: base URL of the page, used to absolutize relative references.

    Returns a 3-tuple:
        - action_dict: interactive elements (forms, stray inputs, buttons,
          commands) keyed by name/id, each described as {"type": ..., "values": [...]}
        - link_dict: "urlN" -> absolute URL for every link/image/media found
        - html_text: remaining page text with tags replaced by "[urlN]" markers
    """
    soup = BeautifulSoup(html, 'html.parser')
    # remove common boilerplate elements (navigation, header, footer)
    for selector in ['div#nav', 'div.header', 'div#footer', 'nav', 'header', 'footer']:
        elements = soup.select(selector)
        for element in elements:
            element.decompose()
    action_dict = {}
    # handle form elements
    for form in soup.find_all('form', recursive=True):
        form_dict = {}
        for input_elem in form.find_all('input'):
            input_type = input_elem.get('type', 'text')
            # unnamed inputs get a synthetic key; NOTE(review): len(action_dict)
            # (not len(form_dict)) may collide for several unnamed inputs in one form
            input_name = input_elem.get('name', f'input_{len(action_dict)}')
            # fold every other attribute into one descriptive string
            input_value = ' '.join([f"{k}={v}" for k, v in input_elem.attrs.items() if k not in ['type', 'name', 'form']])
            input_dict = {
                "type": input_type,
                "values": [input_value] if input_value else []
            }
            # handle datalist: surface the suggestion list instead of raw attrs
            if input_elem.get('list'):
                datalist = soup.find('datalist', id=input_elem['list'])
                if datalist:
                    options = [opt.get('value', opt.text.strip()) for opt in datalist.find_all('option')]
                    input_dict = {
                        "type": "text",
                        "values": [f"one of followings: {options}"]
                    }
            form_dict[input_name] = input_dict
        for select in form.find_all('select'):
            select_name = select.get('name', f'select_{len(form_dict)}')
            options = [opt.get('value', opt.text.strip()) for opt in select.find_all('option')]
            form_dict[select_name] = {
                "type": "select",
                "values": options
            }
        for textarea in form.find_all('textarea'):
            textarea_name = textarea.get('name', f'textarea_{len(form_dict)}')
            form_dict[textarea_name] = {
                "type": "textarea",
                "values": [textarea.text.strip()]
            }
        if form_dict:
            form_id = form.get('id', f'form_{len(action_dict)}')
            action_dict[form_id] = form_dict
        # the whole form (and its children) is removed from the text output
        form.decompose()
    # handle input elements that are not in any form
    for input_elem in soup.find_all('input', recursive=True):
        if input_elem.find_parent('form') is None:
            # check if the input is associated with a form by form attribute
            form_ids = input_elem.get('form', '').split()
            # handle input element
            input_type = input_elem.get('type', 'text')
            input_name = input_elem.get('name', f'input_{len(action_dict)}')
            input_value = ' '.join([f"{k}={v}" for k, v in input_elem.attrs.items() if k not in ['type', 'name', 'form']])
            input_dict = {
                "type": input_type,
                "values": [input_value] if input_value else []
            }
            # handle datalist
            if input_elem.get('list'):
                datalist = soup.find('datalist', id=input_elem['list'])
                if datalist:
                    options = [opt.get('value', opt.text.strip()) for opt in datalist.find_all('option')]
                    input_dict = {
                        "type": "text",
                        "values": [f"one of followings: {options}"]
                    }
            # decide the placement of the input element based on form attribute
            if form_ids:
                for form_id in form_ids:
                    if form_id in action_dict:
                        action_dict[form_id][input_name] = input_dict
                    else:
                        action_dict[form_id] = {input_name: input_dict}
            else:
                action_dict[input_name] = {"input": input_dict}
            input_elem.decompose()
    # NOTE(review): find_all() matches tag *names*, not CSS selectors — the
    # 'input[type=...]' entries never match anything; only <button> tags are
    # collected here (all <input>s were decomposed above anyway).
    for button in soup.find_all(['button', 'input[type="button"]', 'input[type="submit"]'], recursive=True):
        button_name = button.get('name', '') or button.get('id', '') or button.text.strip()
        if not button_name:
            button_name = f'button_{len(action_dict)}'
        button_type = button.get('type', 'button')
        button_value = button.get('value', button.text.strip())
        action_dict[button_name] = {
            "button": {
                "type": button_type,
                "values": [button_value] if button_value else []
            }
        }
        button.decompose()
    # handle command elements
    for command in soup.find_all('command', recursive=True):
        command_name = command.get('name', '') or command.get('id', '') or command.text.strip()
        if not command_name:
            command_name = f'command_{len(action_dict)}'
        command_type = command.get('type', 'command')
        command_value = command.get('value', command.text.strip())
        action_dict[command_name] = {
            "command": {
                "type": command_type,
                "values": [command_value] if command_value else []
            }
        }
        command.decompose()
    link_dict = {}
    # images: register src as "urlN" and replace the tag with "-alt<img>[urlN]"
    for img in soup.find_all('img', src=True, recursive=True):
        src = img.get('src')
        if src.startswith('#') or src.startswith('about:blank'):
            src = None
        text = img.get('alt', '').strip()
        if src:
            if not src.startswith(('http://', 'https://')):
                src = urljoin(base_url, src)
            key = f"url{len(link_dict)}"
            link_dict[key] = src
            text = f"{text}<img>[{key}]"
        # find all area urls related to this img (client-side image map)
        area_urls = set()
        if img.get('usemap'):
            # remove the # at the beginning of the map name
            map_name = img.get('usemap').lstrip('#')
            # find the map tag
            map_tag = soup.find('map', {'name': map_name})
            if map_tag:
                # get all area tags under the map
                for area in map_tag.find_all('area', href=True):
                    area_href = area.get('href')
                    if area_href.startswith('javascript:') or area_href.startswith('#') or area_href.startswith('mailto:') or area_href.startswith('data:') or area_href.startswith('about:blank'):
                        area_href = None
                    if area_href:
                        if not area_href.startswith(('http://', 'https://')):
                            area_href = urljoin(base_url, area_href)
                        area_urls.add(area_href)
                    area.decompose()
                # delete the whole map tag
                map_tag.decompose()
        for area_url in area_urls:
            key = f"url{len(link_dict)}"
            link_dict[key] = area_url
            text = f"{text}[{key}]"
        img.replace_with(f"-{text}")
    # other media: same registration, tag replaced by "-text<ext>[urlN]"
    for media in soup.find_all(['video', 'audio', 'source', 'embed', 'iframe', 'figure'], src=True, recursive=True):
        src = media.get('src')
        if src.startswith('javascript:') or src.startswith('#') or src.startswith('mailto:') or src.startswith('data:') or src.startswith('about:blank'):
            src = None
        text = media.get('alt', '').strip() or media.get_text().strip()
        if src:
            # convert relative path to full url
            if not src.startswith(('http://', 'https://')):
                src = urljoin(base_url, src)
            key = f"url{len(link_dict)}"
            link_dict[key] = src
            # file extension (or tag name as fallback) labels the media kind
            ext = os.path.splitext(src)[1].lstrip('.') or media.name
            text = f"{text}<{ext}>[{key}]"
        media.replace_with(f"-{text}")
    for obj in soup.find_all('object', data=True, recursive=True):
        data = obj.get('data')
        if data.startswith('javascript:') or data.startswith('#') or data.startswith('mailto:') or data.startswith('data:') or data.startswith('about:blank'):
            data = None
        text = obj.get('title', '').strip() or obj.get_text().strip()
        if data:
            # convert relative path to full url
            if not data.startswith(('http://', 'https://')):
                data = urljoin(base_url, data)
            key = f"url{len(link_dict)}"
            link_dict[key] = data
            ext = os.path.splitext(data)[1].lstrip('.') or 'object'
            text = f"{text}<{ext}>[{key}]"
        obj.replace_with(f"-{text}")
    # process links at last, so that we can keep the image and media info in the link
    for a in soup.find_all('a', href=True, recursive=True):
        href = a.get('href')
        if href.startswith('javascript:') or href.startswith('#') or href.startswith('mailto:') or href.startswith('data:') or href.startswith('about:blank'):
            href = None
        if href:
            text = a.get_text().strip() or '-'
            if not href.startswith(('http://', 'https://')):
                href = urljoin(base_url, href)
            key = f"url{len(link_dict)}"
            link_dict[key] = href
            a.replace_with(f"{text}[{key}]")
    # handle headings: h1..h6 become markdown-style "# ..." lines.
    # NOTE(review): recursive=False only converts headings that are direct
    # children of the document root; nested headings are left as-is — confirm intended
    for i in range(1, 7):
        for heading in soup.find_all(f'h{i}', recursive=False):
            text = heading.get_text().strip()
            heading.replace_with(f"{'#' * i} {text}\n")
    # replace all <br> and <br/> tags with newlines
    for br in soup.find_all(['br', 'br/', 'br /', 'hr', 'hr/', 'hr /', 'wbr'], recursive=True):
        br.replace_with('\n')
    # handle lists: flatten items to "1. ...\t2. ..."
    for list_tag in soup.find_all(['ul', 'ol'], recursive=True):
        list_text = []
        for idx, item in enumerate(list_tag.find_all('li')):
            list_text.append(f"{idx + 1}. {item.get_text().strip()}")
        list_text = '\t'.join(list_text)
        list_tag.replace_with(f"{list_text}\n")
    # handle spans - merge span text with surrounding text
    for span in soup.find_all('span', recursive=True):
        span.replace_with(span.get_text().strip())
    # handle strikethrough text
    for del_tag in soup.find_all(['del', 's'], recursive=True):
        del_text = del_tag.get_text().strip()
        if del_text:
            del_tag.replace_with(f"{del_text}(maybe_outdated)")
        else:
            del_tag.decompose()
    # handle tables: each cell becomes "firstCol-header-cell" on its own line
    for table in soup.find_all('table', recursive=True):
        table_text = []
        # handle caption
        caption = table.find('caption')
        if caption:
            table_text.append(caption.get_text().strip())
        # get headers
        headers = []
        for th in table.find_all('th'):
            headers.append(th.get_text().strip())
        # handle all rows (including tbody and tfoot)
        for row in table.find_all('tr'):
            # get the first cell value
            # try to find th as first_val
            first_cell = row.find(['th', 'td'])
            if not first_cell:
                continue
            first_val = first_cell.get_text().strip()
            cells = row.find_all('td')
            if not cells:
                continue
            # handle remaining cells
            for idx, cell in enumerate(cells):
                cell_text = cell.get_text().strip()
                if not cell_text or cell_text == first_val:
                    continue
                header_text = headers[idx] if idx < len(headers) else ''
                cell_str = f"{first_val}-{header_text}-{cell_text}"
                table_text.append(cell_str)
        # replace the table with the processed text
        table_text = '\n'.join(table_text)
        table.replace_with(f"\n{table_text}\n")
    html_text = soup.get_text(strip=False, separator='\n')
    return action_dict, link_dict, html_text

View File

@ -0,0 +1,123 @@
from bs4 import BeautifulSoup
def action_dict_scraper(raw_html: str) -> dict:
    """Extract interactive elements from raw HTML into an action dictionary.

    Args:
        raw_html: the page's raw (rendered) HTML.

    Returns:
        action_dict mapping an element name/id to a description of the actions
        it offers:
          - <form>: {field name: {"type": ..., "values": [...]}}
          - form-less <input>: {"input": {...}} (or attached to its form= target)
          - <button> / <command>: {"button"/"command": {"type": ..., "values": [...]}}
        Processed elements are decomposed so they cannot be matched twice.
    """
    soup = BeautifulSoup(raw_html, 'html.parser')
    action_dict = {}
    # handle form elements
    for form in soup.find_all('form', recursive=True):
        form_dict = {}
        for input_elem in form.find_all('input'):
            input_type = input_elem.get('type', 'text')
            input_name = input_elem.get('name', f'input_{len(action_dict)}')
            # fold every remaining attribute into one descriptive string
            input_value = ' '.join([f"{k}={v}" for k, v in input_elem.attrs.items() if k not in ['type', 'name', 'form']])
            input_dict = {
                "type": input_type,
                "values": [input_value] if input_value else []
            }
            # handle datalist: surface the suggestion list instead of raw attrs
            if input_elem.get('list'):
                datalist = soup.find('datalist', id=input_elem['list'])
                if datalist:
                    options = [opt.get('value', opt.text.strip()) for opt in datalist.find_all('option')]
                    input_dict = {
                        "type": "text",
                        "values": [f"one of followings: {options}"]
                    }
            form_dict[input_name] = input_dict
        for select in form.find_all('select'):
            select_name = select.get('name', f'select_{len(form_dict)}')
            options = [opt.get('value', opt.text.strip()) for opt in select.find_all('option')]
            form_dict[select_name] = {
                "type": "select",
                "values": options
            }
        for textarea in form.find_all('textarea'):
            textarea_name = textarea.get('name', f'textarea_{len(form_dict)}')
            form_dict[textarea_name] = {
                "type": "textarea",
                "values": [textarea.text.strip()]
            }
        if form_dict:
            form_id = form.get('id', f'form_{len(action_dict)}')
            action_dict[form_id] = form_dict
        form.decompose()
    # handle input elements that are not in any form
    for input_elem in soup.find_all('input', recursive=True):
        if input_elem.find_parent('form') is None:
            # check if the input is associated with a form by form attribute
            form_ids = input_elem.get('form', '').split()
            input_type = input_elem.get('type', 'text')
            input_name = input_elem.get('name', f'input_{len(action_dict)}')
            input_value = ' '.join([f"{k}={v}" for k, v in input_elem.attrs.items() if k not in ['type', 'name', 'form']])
            input_dict = {
                "type": input_type,
                "values": [input_value] if input_value else []
            }
            # handle datalist
            if input_elem.get('list'):
                datalist = soup.find('datalist', id=input_elem['list'])
                if datalist:
                    options = [opt.get('value', opt.text.strip()) for opt in datalist.find_all('option')]
                    input_dict = {
                        "type": "text",
                        "values": [f"one of followings: {options}"]
                    }
            # decide the placement of the input element based on form attribute
            if form_ids:
                for form_id in form_ids:
                    if form_id in action_dict:
                        action_dict[form_id][input_name] = input_dict
                    else:
                        action_dict[form_id] = {input_name: input_dict}
            else:
                action_dict[input_name] = {"input": input_dict}
            input_elem.decompose()
    # handle buttons.
    # Fix: find_all() matches tag *names*, not CSS selectors, so the former
    # 'input[type="button"]' / 'input[type="submit"]' list entries never
    # matched anything; every <input> has already been decomposed above, so
    # matching plain <button> tags is both correct and behavior-preserving.
    for button in soup.find_all('button', recursive=True):
        button_name = button.get('name', '') or button.get('id', '') or button.text.strip()
        if not button_name:
            button_name = f'button_{len(action_dict)}'
        button_type = button.get('type', 'button')
        button_value = button.get('value', button.text.strip())
        action_dict[button_name] = {
            "button": {
                "type": button_type,
                "values": [button_value] if button_value else []
            }
        }
        button.decompose()
    # handle command elements
    for command in soup.find_all('command', recursive=True):
        command_name = command.get('name', '') or command.get('id', '') or command.text.strip()
        if not command_name:
            command_name = f'command_{len(action_dict)}'
        command_type = command.get('type', 'command')
        command_value = command.get('value', command.text.strip())
        action_dict[command_name] = {
            "command": {
                "type": command_type,
                "values": [command_value] if command_value else []
            }
        }
        command.decompose()
    return action_dict

270
core/utils/deep_scraper.py Normal file
View File

@ -0,0 +1,270 @@
# -*- coding: utf-8 -*-
# This program requires HTML to be first converted to properly formatted text while preserving link positions and structural information (like crawl4ai's html2text work);
# The complete media list from the webpage needs to be extracted beforehand
# Currently this script only handles images and links, other elements like downloads and videos are not processed yet, todo: process according to media list
# action_dict needs to be extracted from raw html, which is not covered by this script
import os, re
import json
import time
from urllib.parse import urlparse, urljoin
# Extensions that mark a url as a static file / download rather than an
# article page; such links are skipped instead of being stored for crawling.
common_file_exts = [
    'jpg', 'jpeg', 'png', 'gif', 'pdf', 'doc', 'docx', 'svg', 'm3u8',
    'mp4', 'mp3', 'wav', 'avi', 'mov', 'wmv', 'flv', 'webp', 'webm',
    'zip', 'rar', '7z', 'tar', 'gz', 'bz2',
    'txt', 'csv', 'xls', 'xlsx', 'ppt', 'pptx',
    'json', 'xml', 'yaml', 'yml', 'css', 'js', 'php', 'asp', 'jsp'
]
# A url that *ends* with one of these is a bare site root (e.g. "https://x.com/")
# and is treated as carrying no extractable content.
common_tlds = [
    '.com', '.cn', '.net', '.org', '.edu', '.gov', '.io', '.co',
    '.info', '.biz', '.me', '.tv', '.cc', '.xyz', '.app', '.dev',
    '.cloud', '.ai', '.tech', '.online', '.store', '.shop', '.site',
    '.top', '.vip', '.pro', '.ltd', '.group', '.team', '.work'
]
# Separator/punctuation characters (ascii + CJK punctuation + Chinese numerals)
# that do not count as "meaningful" text when scoring a link's surroundings.
common_chars = ',.!;:,;:、一二三四五六七八九十#*@% \t\n\r|*-_…>#'
def normalize_url(url: str) -> str:
    """Normalize a url for storage/deduplication.

    - prefixes bare "www." urls with https://
    - returns '' when the url has no recognizable host
    - collapses duplicate slashes in the path
    - drops the fragment; defaults a missing scheme to https
    - bug fix: params and query are re-attached with their ';' / '?'
      separators — the previous f-string concatenation fused the query
      string straight onto the path (".../pagex=1" instead of ".../page?x=1").
    """
    if url.startswith("www."):
        url = f"https://{url}"
    parsed_url = urlparse(url)
    if not parsed_url.netloc:
        return ''
    # collapse runs of '/' in the path ("//a///b" -> "/a/b")
    path = re.sub(r'//+', '/', parsed_url.path)
    # no scheme: just try https
    scheme = parsed_url.scheme or 'https'
    # rebuild without the fragment, keeping the standard separators
    tail = ''
    if parsed_url.params:
        tail += f";{parsed_url.params}"
    if parsed_url.query:
        tail += f"?{parsed_url.query}"
    return f"{scheme}://{parsed_url.netloc}{path}{tail}"
def deep_scraper(raw_markdown: str, base_url: str, used_img: dict[str, str]) -> tuple[dict, tuple[str, dict]]:
    """Post-process crawl4ai-style markdown into LLM-friendly text.

    Args:
        raw_markdown: the rendered page converted to markdown (links kept as
            [text](url), images as ![alt](src)).
        base_url: base url of the page, used to absolutize relative links.
        used_img: src -> alt for images that actually appear on the page.

    Returns:
        (link_dict, (html_text, text_link_map)) where
        - link_dict: url -> accumulated anchor text, harvested from paragraphs
          judged to be "isolated link blocks" (list/navigation areas);
          image-only links get a "§to_be_recognized_by_visual_llm_...§"
          placeholder for later description by a visual model;
        - html_text: the remaining article text with inline links replaced by
          "[Ref_N]" markers;
        - text_link_map: "Ref_N" -> url for those inline markers.
    """
    link_dict = {}
    def check_url_text(text):
        # Decide whether a paragraph is an isolated link block (harvest its
        # links into link_dict and return '') or real article text (return it).
        text = text.strip()
        left_bracket = text.find('[')
        right_paren = text.rfind(')')
        # no [..](..) structure at all -> plain article text, keep it
        if -1 in [left_bracket, right_paren] or left_bracket > right_paren:
            return text
        # enough meaningful characters before the first '[' -> article text
        # NOTE(review): the original comment said 2 chars but the threshold is 50
        prefix = text[:left_bracket]
        pre_valid_chars = [c for c in prefix if not c.isdigit() and c not in common_chars]
        if len(pre_valid_chars) >= 50:
            return text
        suffix = text[right_paren+1:]
        suf_valid_chars = [c for c in suffix if c not in common_chars]
        if len(pre_valid_chars) >= 2 and len(suf_valid_chars) >= 1:
            return text
        if len(suf_valid_chars) >= 36:
            return text
        # from here on the paragraph is treated as a link block.
        # rewrite image markers ![alt](src) to the private form §alt||src§
        img_pattern = r'!\[(.*?)\]\((.*?)\)'
        matches = re.findall(img_pattern, text)
        for alt, src in matches:
            text = text.replace(f'![{alt}]({src})', f'§{alt}||{src}§')
        # collect all [text](url) fragments ...
        link_pattern = r'\[(.*?)\]\((.*?)\)'
        matches = re.findall(link_pattern, text)
        # ... then strip them out of the paragraph text
        for link_text, link_url in matches:
            text = text.replace(f'[{link_text}]({link_url})', '')
        # images left *outside* any link serve as a fallback illustration
        img_marker_pattern = r'§(.*?)\|\|(.*?)§'
        img_marker_matches = re.findall(img_marker_pattern, text)
        alt_img_alt = ""
        alt_img_src = ""
        if img_marker_matches:
            alt_img_alt = img_marker_matches[0][0]
            alt_img_src = img_marker_matches[0][1]
        for alt, src in img_marker_matches:
            text = text.replace(f'§{alt}||{src}§', '')
        text = text.strip()
        for link_text, link_url in matches:
            # markdown titles: [text](url "title") — pull the quoted title
            # into the anchor text and drop it from the url
            quote_pattern = r'\"(.*?)\"'
            link_alt = ''.join(re.findall(quote_pattern, link_url))
            if link_alt not in link_text:
                link_text = f"{link_text} {link_alt}"
            _url = re.sub(quote_pattern, '', link_url).strip()
            if not _url or _url.startswith('#'):
                continue
            if _url.startswith('//'):
                # protocol-relative url
                _url = f"https:{_url}"
            else:
                if _url.startswith('/'):
                    _url = _url[1:]
                _url = urljoin(base_url, _url)
            _url = normalize_url(_url)
            if not _url:
                continue
            url = _url.lower()
            # skip file downloads and bare-domain links
            has_common_ext = any(url.endswith(ext) for ext in common_file_exts)
            has_common_tld = any(url.endswith(tld) or url.endswith(tld + '/') for tld in common_tlds)
            if has_common_ext or has_common_tld:
                continue
            # split §...§ image markers out of the anchor text
            link_text = link_text.strip()
            inner_matches = re.findall(img_marker_pattern, link_text)
            for alt, src in inner_matches:
                link_text = link_text.replace(f'§{alt}||{src}§', '')
            link_text = link_text.strip()
            if text not in link_text:
                link_text = f"{link_text} {text}"
            # trim leading/trailing separators and digits
            link_text = link_text.strip(''.join(common_chars + '0123456789'))
            if len(link_text) >= 3:
                if url not in link_dict:
                    link_dict[url] = link_text
                else:
                    if link_dict[url].startswith("§to_be_recognized_by_visual_llm_"):
                        # real anchor text beats an image placeholder
                        link_dict[url] = link_text
                    else:
                        link_dict[url] = f"{link_dict[url]} {link_text}"
            if url in link_dict:
                continue
            # anchor text was useless: fall back to an image inside (or next
            # to) the link and let a visual LLM describe it later
            img_alt = ""
            img_src = ""
            if inner_matches:
                img_alt = inner_matches[0][0].strip()
                img_src = inner_matches[0][1].strip()
            if not img_src and alt_img_src:
                img_src = alt_img_src
                img_alt = alt_img_alt
            if not img_src:
                continue
            img_src = img_src.lower()
            if any(img_src.endswith(tld) or img_src.endswith(tld + '/') for tld in common_tlds):
                continue
            if any(img_src.endswith(ext) for ext in common_file_exts if ext not in ['jpg', 'jpeg', 'png']):
                continue
            if not img_src or img_src.startswith('#'):
                continue
            if img_src.startswith('//'):
                img_src = f"https:{img_src}"
            else:
                if img_src.startswith('/'):
                    img_src = img_src[1:]
                img_src = urljoin(base_url, img_src)
            img_src = normalize_url(img_src)
            if not img_src:
                continue
            link_dict[url] = f"{img_alt}§to_be_recognized_by_visual_llm_{img_src}§"
        # link blocks contribute nothing to the article text
        return ''
    # paragraph-wise triage: harvest link blocks, keep article text
    texts = raw_markdown.split('\n\n')
    texts = [check_url_text(text) for text in texts]
    texts = [text for text in texts if text.strip()]
    html_text = '\n\n'.join(texts)
    # handle remaining inline images ![alt](src) in the article text
    img_pattern = r'(!\[.*?\]\(.*?\))'
    matches = re.findall(img_pattern, html_text)
    for match in matches:
        src = re.search(r'!\[.*?\]\((.*?)\)', match).group(1)
        if src not in used_img:
            # not in the page's media list -> decoration, drop it
            html_text = html_text.replace(match, '')
            continue
        alt = used_img[src]
        src = src.strip().lower()
        if any(src.endswith(tld) or src.endswith(tld + '/') for tld in common_tlds):
            html_text = html_text.replace(match, alt)
            continue
        if any(src.endswith(ext) for ext in common_file_exts if ext not in ['jpg', 'jpeg', 'png']):
            html_text = html_text.replace(match, alt)
            continue
        if not src or src.startswith('#'):
            html_text = html_text.replace(match, alt)
            continue
        if src.startswith('//'):
            src = f"https:{src}"
        else:
            if src.startswith('/'):
                src = src[1:]
            src = urljoin(base_url, src)
        src = normalize_url(src)
        if not src:
            html_text = html_text.replace(match, alt)
            continue
        html_text = html_text.replace(match, f" {alt}§to_be_recognized_by_visual_llm_{src[1:]}§") # to avoid conflict with the url pattern
    # now rewrite every remaining [text](url) to "text[Ref_N]"
    link_pattern = r'\[(.*?)\]\((.*?)\)'
    matches = re.findall(link_pattern, html_text)
    text_link_map = {}
    for match in matches:
        link_text, link_url = match
        original_markdown = f'[{link_text}]({link_url})' # rebuild the original markdown link
        # markdown titles: fold "..." into the text, strip it from the url
        quote_pattern = r'\"(.*?)\"'
        link_alt = ''.join(re.findall(quote_pattern, link_url))
        if link_alt not in link_text:
            link_text = f"{link_text} {link_alt}"
        _url = re.sub(quote_pattern, '', link_url).strip()
        if not _url or _url.startswith('#'):
            continue
        if _url.startswith('//'):
            _url = f"https:{_url}"
        else:
            if _url.startswith('/'):
                _url = _url[1:]
            _url = urljoin(base_url, _url)
        _url = normalize_url(_url)
        if not _url:
            continue
        url = _url.lower()
        key = f"Ref_{len(text_link_map)+1}"
        text_link_map[key] = url
        html_text = html_text.replace(original_markdown, f'{link_text}[{key}]')
    # handle bare "wild" urls appearing directly in the text
    url_pattern = r'((?:https?://|www\.)[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[-A-Za-z0-9+&@#/%=~_|])'
    matches = re.findall(url_pattern, html_text)
    for url in matches:
        url = normalize_url(url)
        if not url:
            continue
        key = f"Ref_{len(text_link_map)+1}"
        text_link_map[key] = url
        html_text = html_text.replace(url, f'[{key}]')
    # drop any leftover empty markers
    html_text = html_text.replace('![]', '') # remove ![]
    html_text = html_text.replace('[]', '') # remove []
    return link_dict, (html_text, text_link_map)

40
test/craw4ai_fetching.py Normal file
View File

@ -0,0 +1,40 @@
# -*- coding: utf-8 -*-
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
import os
import hashlib
import json
# Pages to capture as samples.
sites = [
    "https://www.cnaiplus.com/a/news/?btwaf=75608141"
]
save_dir = 'webpage_samples'
async def main(sites: list):
    """Fetch every site with crawl4ai and dump each raw CrawlResult to JSON."""
    failures = []
    run_config = CrawlerRunConfig(
        delay_before_return_html=2.0,
        exclude_social_media_links=True,
        magic=True,
        scan_full_page=True,
        remove_overlay_elements=True
    )
    async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
        for site in sites:
            # todo: dedicated crawlers for social media; skip auto-crawler
            # traps and already-fetched urls
            outcome = await crawler.arun(url=site, crawler_config=run_config, cache_mode=CacheMode.BYPASS)
            if not outcome.success:
                failures.append(site)
                continue
            # file name = last 6 hex chars of the url's sha256
            digest = hashlib.sha256(site.encode()).hexdigest()[-6:]
            out_path = os.path.join(save_dir, f"{digest}.json")
            with open(out_path, 'w', encoding='utf-8') as fp:
                json.dump(outcome.model_dump(), fp, indent=4, ensure_ascii=False)
if __name__ == '__main__':
    asyncio.run(main(sites))

75
test/crawlee_fetching.py Normal file
View File

@ -0,0 +1,75 @@
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import os
import json
import asyncio
from urllib.parse import urlparse, urljoin
import hashlib
from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavigationContext
from datetime import timedelta
# Sample pages to capture.
sites = ["https://www.gzhu.edu.cn/", "https://www.cnaiplus.com/a/news/?btwaf=75608141"]
os.environ['CRAWLEE_STORAGE_DIR'] = 'webpage_samples/crawlee_storage'
save_dir = 'webpage_samples'
async def main(sites: list):
    """Fetch each site with crawlee's Playwright crawler and save url/title/base_url/body html as JSON."""
    crawler = PlaywrightCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        # max_requests_per_crawl=1,
        max_request_retries=1,
        request_handler_timeout=timedelta(minutes=5)
    )
    @crawler.pre_navigation_hook
    async def on_navigate(context: PlaywrightPreNavigationContext) -> None:
        context.log.info(f'navigating {context.request.url} ...')
    @crawler.router.default_handler
    async def handle_page(context: PlaywrightCrawlingContext) -> None:
        await context.page.wait_for_load_state('networkidle')
        await context.page.wait_for_timeout(2000)
        # Handle dialogs (alerts, confirms, prompts)
        async def on_dialog(dialog):
            context.log.info(f'Closing dialog: {dialog.message}')
            await dialog.accept()
        context.page.on('dialog', on_dialog)
        context.log.info('successfully finish fetching')
        # file name = last 6 hex chars of the url's sha256
        digest = hashlib.sha256(context.request.url.encode()).hexdigest()[-6:]
        file = os.path.join(save_dir, f"{digest}.json")
        head_html = await context.page.inner_html('head')
        head_soup = BeautifulSoup(head_html, 'html.parser')
        title_tag = head_soup.find('title')
        web_title = title_tag.get_text().strip() if title_tag else ''
        base_tag = head_soup.find('base', href=True)
        if base_tag and base_tag.get('href'):
            base_url = base_tag['href']
        else:
            # no <base> tag: derive the base url from the request url
            parts = urlparse(context.request.url)
            base_url = f"{parts.scheme}://{parts.netloc}"
        body_html = await context.page.inner_html('body')
        raw_html = {
            "url": context.request.url,
            "web_title": web_title,
            "base_url": base_url,
            "html": body_html
        }
        with open(file, 'w', encoding='utf-8') as fp:
            json.dump(raw_html, fp, indent=4, ensure_ascii=False)
    await crawler.run(sites)
if __name__ == '__main__':
    asyncio.run(main(sites))

116
test/deep_scraper_test.py Normal file
View File

@ -0,0 +1,116 @@
import os
import sys
import re
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir) # 获取父目录
sys.path.append(project_root)
from core.utils.deep_scraper import deep_scraper, common_chars, common_file_exts, common_tlds
test_string = ''
def check_url_text(text):
    """Debug walk-through of deep_scraper's isolated-link heuristic.

    Prints which rule fires for *text*; returns None (all output goes to stdout).
    """
    print(f"processing: {text}")
    lb = text.find('[')
    rp = text.rfind(')')
    if lb == -1 or rp == -1 or lb > rp:
        print("not [] or () marker")
        print(f"left_bracket: {lb}, right_paren: {rp}")
        return
    # does the text before '[' carry enough meaningful characters?
    head = text[:lb]
    head_valid = [c for c in head if not c.isdigit() and c not in common_chars]
    if len(head_valid) >= 50:
        print("prefix has at least 50 valid chars")
        print(f"prefix: {head}, valid_chars: {head_valid}")
        return
    tail = text[rp+1:]
    tail_valid = [c for c in tail if c not in common_chars]
    if len(head_valid) >= 2 and len(tail_valid) >= 1:
        print("prefix has at least 2 valid chars and suffix has at least 1 valid char")
        print(f"prefix: {head}, valid_chars: {head_valid}, suffix: {tail}, valid_chars: {tail_valid}")
        return
    if len(tail_valid) >= 36:
        print("suffix has at least 36 valid chars")
        print(f"suffix: {tail}, valid_chars: {tail_valid}")
        return
    print('is a isolated url')
    print("处理图片标记 ![alt](src)")
    # rewrite image markers to the private §alt||src§ form
    for alt, src in re.findall(r'!\[(.*?)\]\((.*?)\)', text):
        text = text.replace(f'![{alt}]({src})', f'§{alt}||{src}§')
    print(text)
if __name__ == '__main__':
    import argparse
    import time
    import json
    from urllib.parse import urlparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--test_file', '-F', type=str, default='')
    parser.add_argument('--sample_dir', '-D', type=str, default='')
    args = parser.parse_args()
    test_file = args.test_file
    sample_dir = args.sample_dir
    # collect candidate sample files from both sources
    files = []
    if test_file:
        files.append(test_file)
    if sample_dir:
        files.extend([os.path.join(sample_dir, file) for file in os.listdir(sample_dir)])
    for file in files:
        # samples are the .json dumps produced by test/craw4ai_fetching.py
        if not file.endswith('.json'): continue
        print(f"processing {file} ...")
        try:
            # read as utf-8 for consistency with how the samples are written
            with open(file, 'r', encoding='utf-8') as f:
                html_sample = json.load(f)
            _url = html_sample['url']
            raw_markdown = html_sample['markdown']
            used_img = {d['src']: d['alt'] for d in html_sample['media']['images']}
        except Exception as e:
            # fix: the helper script is craw4ai_fetching.py (was misspelled "fething")
            print('sample format error, try to use craw4ai_fetching.py to get sample')
            print(f"error: {e}")
            continue
        parsed_url = urlparse(_url)
        domain = parsed_url.netloc
        base_url = f"{parsed_url.scheme}://{domain}"
        time_start = time.time()
        from_html_link_dict, (from_html_text, from_html_text_link_map) = deep_scraper(raw_markdown, base_url, used_img)
        time_end = time.time()
        print(f"time cost for html: {time_end - time_start}s")
        result = {
            "link_dict": from_html_link_dict,
            "text": from_html_text,
            "text_link_map": from_html_text_link_map,
        }
        # write the result next to the sample: <sample-name>/sample.json
        # (removed unused local `result_folder`)
        record_folder = file.replace('.json', '')
        os.makedirs(record_folder, exist_ok=True)
        with open(os.path.join(record_folder, 'sample.json'), 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=4, ensure_ascii=False)
        print("done")
        print("*" * 12)
    if test_string:
        check_url_text(test_string)
    exit()

View File

@ -1,366 +0,0 @@
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import os
import json
import asyncio
from urllib.parse import urlparse, urljoin
import hashlib
from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavigationContext
from datetime import timedelta
# Page to capture with this script.
sites = ["https://www.gd121.cn/zx/qxzx/list.shtml",
         ]
# Keep crawlee's request storage alongside the captured samples.
os.environ['CRAWLEE_STORAGE_DIR'] = 'webpage_samples/crawlee_storage'
save_dir = 'webpage_samples'
async def main(sites: list):
    """Crawl each URL in *sites* with Playwright and, for every fetched page,
    write three artifacts into a folder named after the tail of the URL's
    sha256 hash (under ``save_dir``):

    - ``text.txt``        — the body flattened to plain text with inline
                            ``[urlN]`` / ``<ext>[urlN]`` markers,
    - ``link_dict.json``  — marker -> absolute URL map,
    - ``action_dict.json``— interactive elements (forms/inputs/buttons/commands).
    """
    crawler = PlaywrightCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        # max_requests_per_crawl=1,
        max_request_retries=1,
        request_handler_timeout=timedelta(minutes=5)
    )

    @crawler.pre_navigation_hook
    async def log_navigation_url(context: PlaywrightPreNavigationContext) -> None:
        # Log every navigation so crawl progress is visible in the console.
        context.log.info(f'navigating {context.request.url} ...')

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        # Let the page settle (network idle + 2s grace) before reading the DOM.
        await context.page.wait_for_load_state('networkidle')
        await context.page.wait_for_timeout(2000)

        # Handle dialogs (alerts, confirms, prompts) by accepting them so the
        # crawl is never blocked on a modal.
        async def handle_dialog(dialog):
            context.log.info(f'Closing dialog: {dialog.message}')
            await dialog.accept()

        context.page.on('dialog', handle_dialog)
        context.log.info('successfully finish fetching')

        # One output folder per page, keyed by the last 6 hex chars of the
        # URL's sha256 (short, stable, filesystem-safe).
        folder = os.path.join(save_dir, f"{hashlib.sha256(context.request.url.encode()).hexdigest()[-6:]}")
        os.makedirs(folder, exist_ok=True)

        # Parse <head> for the title and an optional <base href>.
        html = await context.page.inner_html('head')
        soup = BeautifulSoup(html, 'html.parser')
        web_title = soup.find('title')
        if web_title:
            web_title = web_title.get_text().strip()
        else:
            web_title = ''
        # NOTE(review): web_title is computed but never written out below —
        # confirm whether it should be saved alongside the other artifacts.
        base_tag = soup.find('base', href=True)
        if base_tag and base_tag.get('href'):
            base_url = base_tag['href']
        else:
            # if no base tag, use the current url as base url
            parsed_url = urlparse(context.request.url)
            domain = parsed_url.netloc
            base_url = f"{parsed_url.scheme}://{domain}"

        html = await context.page.inner_html('body')
        # to use a custom scraper here
        soup = BeautifulSoup(html, 'html.parser')

        # Remove generic boilerplate elements: navigation, header, footer.
        for selector in ['div#nav', 'div.header', 'div#footer', 'nav', 'header', 'footer']:
            elements = soup.select(selector)
            for element in elements:
                element.decompose()

        # Collect interactive elements into action_dict, then strip them from
        # the tree so they do not pollute the extracted text.
        action_dict = {}
        for form in soup.find_all('form', recursive=True):
            form_dict = {}
            for input_elem in form.find_all('input'):
                input_type = input_elem.get('type', 'text')
                input_name = input_elem.get('name', f'input_{len(action_dict)}')
                # Remaining attributes are summarized as a "k=v ..." string.
                input_value = ' '.join([f"{k}={v}" for k, v in input_elem.attrs.items() if k not in ['type', 'name', 'form']])
                input_dict = {
                    "type": input_type,
                    "values": [input_value] if input_value else []
                }
                # handle datalist: surface the candidate options instead of
                # the raw attribute dump
                if input_elem.get('list'):
                    datalist = soup.find('datalist', id=input_elem['list'])
                    if datalist:
                        options = [opt.get('value', opt.text.strip()) for opt in datalist.find_all('option')]
                        input_dict = {
                            "type": "text",
                            "values": [f"one of followings: {options}"]
                        }
                form_dict[input_name] = input_dict
            for select in form.find_all('select'):
                select_name = select.get('name', f'select_{len(form_dict)}')
                options = [opt.get('value', opt.text.strip()) for opt in select.find_all('option')]
                form_dict[select_name] = {
                    "type": "select",
                    "values": options
                }
            for textarea in form.find_all('textarea'):
                textarea_name = textarea.get('name', f'textarea_{len(form_dict)}')
                form_dict[textarea_name] = {
                    "type": "textarea",
                    "values": [textarea.text.strip()]
                }
            if form_dict:
                form_id = form.get('id', f'form_{len(action_dict)}')
                action_dict[form_id] = form_dict
            form.decompose()

        # handle input elements that are not in any form
        for input_elem in soup.find_all('input', recursive=True):
            if input_elem.find_parent('form') is None:
                # check if the input is associated with a form by form attribute
                form_ids = input_elem.get('form', '').split()
                # handle input element
                input_type = input_elem.get('type', 'text')
                input_name = input_elem.get('name', f'input_{len(action_dict)}')
                input_value = ' '.join([f"{k}={v}" for k, v in input_elem.attrs.items() if k not in ['type', 'name', 'form']])
                input_dict = {
                    "type": input_type,
                    "values": [input_value] if input_value else []
                }
                # handle datalist
                if input_elem.get('list'):
                    datalist = soup.find('datalist', id=input_elem['list'])
                    if datalist:
                        options = [opt.get('value', opt.text.strip()) for opt in datalist.find_all('option')]
                        input_dict = {
                            "type": "text",
                            "values": [f"one of followings: {options}"]
                        }
                # decide the placement of the input element based on form attribute
                if form_ids:
                    for form_id in form_ids:
                        if form_id in action_dict:
                            action_dict[form_id][input_name] = input_dict
                        else:
                            action_dict[form_id] = {input_name: input_dict}
                else:
                    action_dict[input_name] = {"input": input_dict}
                input_elem.decompose()

        # NOTE(review): find_all with a list matches tag *names*;
        # 'input[type="button"]' / 'input[type="submit"]' are CSS selectors and
        # will never match here, so only <button> tags are collected — confirm
        # whether soup.select was intended.
        for button in soup.find_all(['button', 'input[type="button"]', 'input[type="submit"]'], recursive=True):
            button_name = button.get('name', '') or button.get('id', '') or button.text.strip()
            if not button_name:
                button_name = f'button_{len(action_dict)}'
            button_type = button.get('type', 'button')
            button_value = button.get('value', button.text.strip())
            action_dict[button_name] = {
                "button": {
                    "type": button_type,
                    "values": [button_value] if button_value else []
                }
            }
            button.decompose()

        # <command> is an obsolete HTML element; harvested the same way as buttons.
        for command in soup.find_all('command', recursive=True):
            command_name = command.get('name', '') or command.get('id', '') or command.text.strip()
            if not command_name:
                command_name = f'command_{len(action_dict)}'
            command_type = command.get('type', 'command')
            command_value = command.get('value', command.text.strip())
            action_dict[command_name] = {
                "command": {
                    "type": command_type,
                    "values": [command_value] if command_value else []
                }
            }
            command.decompose()

        # Replace media/link elements with inline "[urlN]" text markers and
        # record the resolved absolute URLs in link_dict.
        link_dict = {}
        for img in soup.find_all('img', src=True, recursive=True):
            src = img.get('src')
            if src.startswith('#') or src.startswith('about:blank'):
                src = None
            text = img.get('alt', '').strip()
            if src:
                if not src.startswith(('http://', 'https://')):
                    src = urljoin(base_url, src)
                key = f"url{len(link_dict)}"
                link_dict[key] = src
                text = f"{text}<img>[{key}]"
            # find all area urls related to this img
            area_urls = set()
            if img.get('usemap'):
                # remove the # at the beginning of the map name
                map_name = img.get('usemap').lstrip('#')
                # find the map tag
                map_tag = soup.find('map', {'name': map_name})
                if map_tag:
                    # get all area tags under the map
                    for area in map_tag.find_all('area', href=True):
                        area_href = area.get('href')
                        # drop non-navigable pseudo-links
                        if area_href.startswith('javascript:') or area_href.startswith('#') or area_href.startswith('mailto:') or area_href.startswith('data:') or area_href.startswith('about:blank'):
                            area_href = None
                        if area_href:
                            if not area_href.startswith(('http://', 'https://')):
                                area_href = urljoin(base_url, area_href)
                            area_urls.add(area_href)
                        area.decompose()
                    # delete the whole map tag
                    map_tag.decompose()
            for area_url in area_urls:
                # skip self-links back to the current page / site root
                if area_url in [context.request.url, base_url]:
                    continue
                key = f"url{len(link_dict)}"
                link_dict[key] = area_url
                text = f"{text}[{key}]"
            img.replace_with(f"-{text}")

        for media in soup.find_all(['video', 'audio', 'source', 'embed', 'iframe', 'figure'], src=True, recursive=True):
            src = media.get('src')
            if src.startswith('javascript:') or src.startswith('#') or src.startswith('mailto:') or src.startswith('data:') or src.startswith('about:blank'):
                src = None
            text = media.get('alt', '').strip() or media.get_text().strip()
            if src:
                # convert relative path to full url
                if not src.startswith(('http://', 'https://')):
                    src = urljoin(context.request.url, src)
                key = f"url{len(link_dict)}"
                link_dict[key] = src
                # tag the marker with the file extension (or tag name as fallback)
                ext = os.path.splitext(src)[1].lstrip('.') or media.name
                text = f"{text}<{ext}>[{key}]"
            media.replace_with(f"-{text}")

        for obj in soup.find_all('object', data=True, recursive=True):
            data = obj.get('data')
            if data.startswith('javascript:') or data.startswith('#') or data.startswith('mailto:') or data.startswith('data:') or data.startswith('about:blank'):
                data = None
            text = obj.get('title', '').strip() or obj.get_text().strip()
            if data:
                # convert relative path to full url
                if not data.startswith(('http://', 'https://')):
                    data = urljoin(context.request.url, data)
                key = f"url{len(link_dict)}"
                link_dict[key] = data
                ext = os.path.splitext(data)[1].lstrip('.') or 'object'
                text = f"{text}<{ext}>[{key}]"
            obj.replace_with(f"-{text}")

        # process links at last, so that we can keep the image and media info in the link
        for a in soup.find_all('a', href=True, recursive=True):
            href = a.get('href')
            if href.startswith('javascript:') or href.startswith('#') or href.startswith('mailto:') or href.startswith('data:') or href.startswith('about:blank'):
                href = None
            if href:
                text = a.get_text().strip() or '-'
                if not href.startswith(('http://', 'https://')):
                    href = urljoin(context.request.url, href)
                if href in [context.request.url, base_url]:
                    continue
                key = f"url{len(link_dict)}"
                link_dict[key] = href
                a.replace_with(f"{text}[{key}]")

        # handle headings: h1 to h6 become markdown-style "# ..." lines.
        # NOTE(review): recursive=False only matches headings that are direct
        # children of <body>; nested headings are left untouched — confirm intent.
        for i in range(1, 7):  # h1 to h6
            for heading in soup.find_all(f'h{i}', recursive=False):
                text = heading.get_text().strip()
                heading.replace_with(f"{'#' * i} {text}\n")

        # replace all <br> and <br/> tags with newlines
        # NOTE(review): 'br/', 'br /', 'hr/', 'hr /' are not real tag names for
        # find_all — only 'br', 'hr' and 'wbr' can actually match here.
        for br in soup.find_all(['br', 'br/', 'br /', 'hr', 'hr/', 'hr /', 'wbr'], recursive=True):
            br.replace_with('\n')

        # handle lists: flatten <ul>/<ol> into tab-joined "1. item" entries
        for list_tag in soup.find_all(['ul', 'ol'], recursive=True):
            list_text = []
            for idx, item in enumerate(list_tag.find_all('li')):
                list_text.append(f"{idx + 1}. {item.get_text().strip()}")
            list_text = '\t'.join(list_text)
            list_tag.replace_with(f"{list_text}\n")

        # handle spans - merge span text with surrounding text
        for span in soup.find_all('span', recursive=True):
            span.replace_with(span.get_text().strip())

        # handle strikethrough text: keep it but flag it as possibly outdated
        for del_tag in soup.find_all(['del', 's'], recursive=True):
            del_text = del_tag.get_text().strip()
            if del_text:
                del_tag.replace_with(f"{del_text}(maybe_outdated)")
            else:
                del_tag.decompose()

        # handle tables: flatten each data cell into "rowlabel-colheader-value"
        for table in soup.find_all('table', recursive=True):
            table_text = []
            # handle caption
            caption = table.find('caption')
            if caption:
                table_text.append(caption.get_text().strip())
            # get headers
            headers = []
            for th in table.find_all('th'):
                headers.append(th.get_text().strip())
            # handle all rows (including tbody and tfoot)
            for row in table.find_all('tr'):
                # get the first cell value
                # try to find th as first_val
                first_cell = row.find(['th', 'td'])
                if not first_cell:
                    continue
                first_val = first_cell.get_text().strip()
                cells = row.find_all('td')
                if not cells:
                    continue
                # handle remaining cells
                for idx, cell in enumerate(cells):
                    cell_text = cell.get_text().strip()
                    if not cell_text or cell_text == first_val:
                        continue
                    header_text = headers[idx] if idx < len(headers) else ''
                    cell_str = f"{first_val}-{header_text}-{cell_text}"
                    table_text.append(cell_str)
            # replace the table with the processed text
            table_text = '\n'.join(table_text)
            table.replace_with(f"\n{table_text}\n")

        # Serialize the now-flattened tree to plain text and dump the artifacts.
        html_text = soup.get_text(strip=False, separator='\n')
        # NOTE(review): opened without an explicit encoding — defaults to the
        # locale encoding on some platforms; consider encoding='utf-8'.
        with open(os.path.join(folder, 'text.txt'), 'w') as f:
            f.write(html_text)
        with open(os.path.join(folder, 'link_dict.json'), 'w', encoding='utf-8') as f:
            json.dump(link_dict, f, indent=4, ensure_ascii=False)
        with open(os.path.join(folder, 'action_dict.json'), 'w', encoding='utf-8') as f:
            json.dump(action_dict, f, indent=4, ensure_ascii=False)
        # screenshot_file = os.path.join(folder, 'screenshot.jpg')
        # await context.page.screenshot(path=screenshot_file, full_page=True)

    await crawler.run(sites)
if __name__ == '__main__':
    # Script entry point: crawl the configured start URLs.
    asyncio.run(main(sites))

View File

@ -1,76 +0,0 @@
# -*- coding: utf-8 -*-
import os, re
import json
import time
sample_dir = 'webpage_samples'
# Heuristic thresholds for classifying a page as a link-list vs. an article.
list_judge_threshold = 0.007   # min ratio of candidate links to cleaned-text length
valid_list_min_length = 10     # min number of candidate links for a list page
min_content_length = 420       # cleaned text shorter than this needs more info

# URLs ending in these extensions are downloadable assets, not article pages.
common_file_exts = [
    'jpg', 'jpeg', 'png', 'gif', 'pdf', 'doc', 'docx', 'svg', 'm3u8',
    'mp4', 'mp3', 'wav', 'avi', 'mov', 'wmv', 'flv', 'webp', 'webm',
    'zip', 'rar', '7z', 'tar', 'gz', 'bz2',
    'txt', 'csv', 'xls', 'xlsx', 'ppt', 'pptx',
    'json', 'xml', 'yaml', 'yml', 'css', 'js', 'php', 'asp', 'jsp'
]
# URLs ending in a bare TLD (site roots) are navigation, not content links.
common_tlds = [
    '.com', '.cn', '.net', '.org', '.edu', '.gov', '.io', '.co',
    '.info', '.biz', '.me', '.tv', '.cc', '.xyz', '.app', '.dev',
    '.cloud', '.ai', '.tech', '.online', '.store', '.shop', '.site',
    '.top', '.vip', '.pro', '.ltd', '.group', '.team', '.work'
]

def find_article_or_list(link_dict, text) -> tuple[bool, bool, str]:
    """Classify a scraped page as a link-list page and/or too-short content.

    Args:
        link_dict: mapping of link markers to URLs extracted from the page.
        text: the page's flattened text (may contain <tag> and [urlN] markers).

    Returns:
        (is_list, need_more_info, cleaned_text) where *is_list* is True when
        the page looks like an article-list page, *need_more_info* is True
        when the cleaned text is too short to judge, and *cleaned_text* is
        the text with blank lines collapsed.
    """
    # Collapse blank lines and surrounding whitespace.
    lines = [l.strip() for l in text.split('\n') if l.strip()]
    text = '\n'.join(lines)
    # Measure content length with short <tag> markers and [urlN] refs removed,
    # so markup noise does not inflate the text size.
    text_no_tags = re.sub(r'<\w{1,5}>', '', text)
    text_no_urls = re.sub(r'\[url\d+]', '', text_no_tags)
    content_length = len(text_no_urls)

    # Candidate links: everything that is not a file download or a site root.
    valid_url = set()
    for url in link_dict.values():
        url_lower = url.lower()
        has_common_ext = any(url_lower.endswith(ext) for ext in common_file_exts)
        has_common_tld = any(url_lower.endswith(tld) or url_lower.endswith(tld + '/') for tld in common_tlds)
        if not has_common_ext and not has_common_tld:
            valid_url.add(url)

    # Guard: an empty page would otherwise raise ZeroDivisionError below.
    # It is not a list page, and it definitely needs more info.
    if content_length == 0:
        return False, True, text

    valid_url_rate = len(valid_url) / content_length
    # Use the module-level threshold (previously hardcoded as 0.007, which
    # silently ignored list_judge_threshold).
    is_list = valid_url_rate > list_judge_threshold and len(valid_url) > valid_list_min_length
    need_more_info = content_length < min_content_length
    return is_list, need_more_info, text
if __name__ == '__main__':
    # Batch mode: walk every task* directory under sample_dir, classify each
    # sample (article vs. list page) and append the verdicts to a report file.
    dirs = os.listdir(sample_dir)
    for _dir in dirs:
        if not _dir.startswith('task'):
            continue
        _path = os.path.join(sample_dir, _dir)
        if not os.path.isdir(_path):
            continue
        samples = os.listdir(_path)
        # NOTE(review): time_stamp is computed but never used — was it meant
        # to be part of the record file name?
        time_stamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())
        # NOTE(review): f-string has no placeholder; a plain string would do.
        record_file = os.path.join(_path, f'article_or_list_judge.txt')
        for sample in samples:
            if not os.path.isdir(os.path.join(_path, sample)):
                continue
            files = os.listdir(os.path.join(_path, sample))
            # A sample is judged only if both artifacts are present.
            if 'link_dict.json' not in files or 'text.txt' not in files:
                print(f'{sample} files not complete, skip')
                continue
            # NOTE(review): these two open() calls have no matching close()
            # (no `with`); the handles are only reclaimed by GC.
            link_dict = json.load(open(os.path.join(_path, sample, 'link_dict.json'), 'r'))
            text = open(os.path.join(_path, sample, 'text.txt'), 'r').read()
            is_list, need_more_info, text = find_article_or_list(link_dict, text)
            # Append a human-readable verdict block for this sample.
            with open(record_file, 'a') as f:
                f.write(f"raw materials: {sample}\n\n")
                f.write(f"cleaned text: \n{text}\n\n")
                f.write("list\n" if is_list else "article\n")
                f.write("need more info\n" if need_more_info else "no need more info\n")
                f.write("*" * 12)
                f.write('\n\n')

View File

@ -71,9 +71,9 @@ async def extract_info_from_img(text, link_dict) -> str:
if url in cache:
replace_text = cache[url]
else:
if any(url.lower().endswith(tld) or url.lower().endswith(tld + '/') for tld in common_tlds):
if any(url.endswith(tld) or url.endswith(tld + '/') for tld in common_tlds):
continue
if any(url.lower().endswith(ext) for ext in common_file_exts if ext not in ['jpg', 'jpeg', 'png']):
if any(url.endswith(ext) for ext in common_file_exts if ext not in ['jpg', 'jpeg', 'png']):
continue
llm_output = await llm([{"role": "user",
"content": [{"type": "image_url", "image_url": {"url": url, "detail": "high"}},
@ -86,7 +86,7 @@ async def extract_info_from_img(text, link_dict) -> str:
async def main(link_dict, text, record_file, prompts):
is_list, need_more_info, text = find_article_or_list(link_dict, text)
is_list, need_more_info, link_dict, text = find_article_or_list(link_dict, text)
if is_list:
print("may be a article list page, get more urls ...")
@ -124,9 +124,9 @@ async def main(link_dict, text, record_file, prompts):
hallucination_times += 1
continue
result_url = link_dict[url_tag]
if any(result_url.lower().endswith(tld) or result_url.lower().endswith(tld + '/') for tld in common_tlds):
if any(result_url.endswith(tld) or result_url.endswith(tld + '/') for tld in common_tlds):
continue
if any(result_url.lower().endswith(ext) for ext in common_file_exts if ext not in ['jpg', 'jpeg', 'png']):
if any(result_url.endswith(ext) for ext in common_file_exts if ext not in ['jpg', 'jpeg', 'png']):
continue
final_result.add(f'{item} {result_url}')
else:

View File

@ -8,7 +8,8 @@ text_info_system = '''作为信息提取助手,你的任务是从给定的网
- 如果关注点有进一步的解释确保提取的内容符合这些解释的范围
- 忠于原文你的任务是从网页文本中抽取相关信息而不是提炼总结和改写
- 对于最终输出的信息请保证主体时间地点等关键要素的清晰明确为此可能需要综合上下文进行提取
- 如果提取的内容中包括类似<mp4>[url1]这样的片段务必原样保留'''
- 由于文本是通过爬虫程序获取并转化的所以请忽略所有残存的html标签以及不必要的空格换行等
- 但如果提取的内容中包括类似[Ref_1]这样的片段务必原样保留'''
text_info_suffix = '''请先复述一遍关注点及其解释再对原文进行分析。如果网页文本中包含关注点相关的内容请按照以下json格式输出提取的信息
{"focus": 关注点名称, "content": 提取的内容}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,7 @@
{
"link_dict": {
"微信公众平台广告规范指引": "javacript:;"
},
"text": "# 同心之声 乐启新章 丨赵巷镇统一战线迎新音乐会邀您:重温经典旋律,唤醒美好回忆 \n\n in赵巷 ;) _2024年12月29日 10:44_ _上海_\n\n预览时标签不可点\n\n关闭\n\n更多\n\n名称已清空\n\n**微信扫一扫赞赏作者**\n\n喜欢作者其它金额\n\n文章\n\n暂无文章\n\n喜欢作者\n\n其它金额\n\n¥\n\n最低赞赏 ¥0\n\n确定\n\n返回\n\n**其它金额**\n\n更多\n\n赞赏金额\n\n¥\n\n最低赞赏 ¥0\n\n1\n\n2\n\n3\n\n4\n\n5\n\n6\n\n7\n\n8\n\n9\n\n0\n\n.\n\n关闭\n\n更多\n\n小程序\n\n广告\n\n搜索「undefined」网络结果\n\n\n\n暂无留言\n\n已无更多数据\n\n写留言:\n\n关闭\n\n**写留言**\n\n提交更多\n\n微信扫一扫关注该公众号\n\n继续滑动看下一个\n\n轻触阅读原文\n\nin赵巷 \n\n向上滑动看下一个\n\n当前内容可能存在未经审核的第三方商业营销信息请确认是否继续访问。\n\n微信扫一扫使用小程序 \n\n× 分析\n\nin赵巷\n\n关注 \n\n赞分享在看 写留言 \n\n**留言**\n\n暂无留言\n\n已无更多数据\n\n写留言:\n\n关闭\n\n**写留言**\n\n提交更多\n\n关闭\n\n**0个朋友**\n\n更多\n\n前往「发现 > 看一看」看更多朋友\n\n 。 视频 小程序 赞 ,轻点两下取消赞 在看 ,轻点两下取消在看 分享 留言 收藏\n\n**in赵巷**\n\n同心之声 乐启新章 丨赵巷镇统一战线迎新音乐会邀您:重温经典旋律,唤醒美好回忆 \n\n,\n\n关闭\n\n**选择留言身份**\n\n更多\n",
"text_link_map": {}
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,89 @@
{
"link_dict": {
"https://www.gd121.cn/index.shtml": "首页 * \n * \n * ;)",
"https://www.gd121.cn/yj/index.shtml": "预警 * \n * \n * ;) 预警信息",
"https://www.gd121.cn/tq/jdtq_list.shtml": "旅游天气",
"https://www.gd121.cn/tq/jccz/list.shtml": "机场&车站",
"https://www.gd121.cn/hd/index.shtml": "预约天气",
"https://www.gd121.cn/hd/yhfk/list.shtml": "用户反馈",
"https://www.gd121.cn/hd/sjsc/list.shtml": "实景上传",
"https://passport.gd121.cn/ucy_center/index": "用户中心",
"https://www.gd121.cn/fw/yjkhd/8a14d310645ec72401645ecbea730edb.shtml": "[详情] 停课铃客户端 停课铃客户端 台风停不停课,还要等老师给你打电话?不用了,“停课铃”就有官方消息!!“停 停课铃APP",
"https://www.gd121.cn/fw/dx/8a14d310645ec72401645ed02f5a1a1f.shtml": "气象短信服务介绍",
"https://www.gd121.cn/fw/dx/list.shtml": "天气短信",
"https://www.gd121.cn/fw/dx/8a14d310645ec72401645ed0305c1a21.shtml": "家乡天气短信",
"https://www.gd121.cn/fw/dx/8a14d310645ec72401645ed033821a27.shtml": "周天气短信",
"https://www.gd121.cn/fw/dx/8a14d310645ec72401645ed034891a29.shtml": "田园气象站天气短信",
"https://www.gd121.cn/fw/dx/8a14d310645ec72401645ed02e7b1a1d.shtml": "海洋天气短信",
"https://www.gd121.cn/fw/dx/8a14d310645ec72401645ed0327f1a25.shtml": "上下班天气短信",
"https://www.gd121.cn/fw/wx/ff8080816f16da59016f176859fa0019.shtml": "缤纷微天气",
"https://www.gd121.cn/fw/wttq/list.shtml": "微谈天气",
"https://www.gd121.cn/kp/fangyu.html": "广东省气象灾害防御法规",
"https://www.gd121.cn/fw/wttq/ff8080818a06a656019139b5d7d12c21.shtml": "高温要熄火?南方大城市高温打卡日历来了 看你那儿酷热何时迎转机",
"https://www.gd121.cn/fw/wttq/ff8080818a06a656019139b17e9c2c1d.shtml": "邂逅中式浪漫!全国七夕天气地图出炉 看哪里天公作美宜赏星河",
"https://www.gd121.cn/fw/wttq/ff8080818a06a656019134855c4d2c03.shtml": "热浪来袭,如何安然度夏",
"https://www.gd121.cn/fw/wttq/ff8080818a06a65601913482cf1b2bfe.shtml": "科普看台|安全不“放假” 暑期出游避险指南",
"https://www.gd121.cn/fw/qxys/qxyslist.shtml": "气象影视",
"https://www.gd121.cn/video/index.shtmln=ws20241228": "广东卫视天气预报",
"https://www.gd121.cn/video/index.shtmln=ws20241227": "广东卫视天气预报",
"https://www.gd121.cn/video/index.shtmln=ws20241226": "广东卫视天气预报",
"https://www.gd121.cn/video/index.shtmln=ws20241225": "广东卫视天气预报",
"https://www.gd121.cn/video/index.shtmln=ws20241224": "广东卫视天气预报",
"https://www.gd121.cn/video/index.shtmln=ws20241223": "广东卫视天气预报",
"https://www.gd121.cn/video/index.shtmln=ws20241222": "广东卫视天气预报",
"https://www.gd121.cn/video/index.shtmln=ws20241221": "广东卫视天气预报",
"https://www.gd121.cn/fw/xsp/list.shtml": "显示屏",
"https://weibo.com/910620121topnav=1&wvr=6&topsug=1": "省微博",
"https://www.gd121.cn/fw/yjqxdh/list.shtml": "应急气象电话 应急气象电话",
"https://www.gd121.cn/fw/yjqxdh/8a14d310645ec72401645ecbde240eb7.shtml": "“报平安”功能使用步骤 [详情] “报平安”功能使用步骤 “报平安”功能使用步骤 [详情] “报平安”功能使用步骤",
"https://www.gd121.cn/fw/yjqxdh/8a14d310645ec72401645ecbdff00ebd.shtml": "“报平安”留言功能操作说明 “报平安”留言功能操作说明",
"https://www.gd121.cn/fw/yjqxdh/8a14d310645ec72401645ecbdf520ebb.shtml": "“报平安”留言功能介绍 “报平安”留言功能介绍",
"https://www.gd121.cn/fw/yjqxdh/8a14d310645ec72401645ecbdeba0eb9.shtml": "应急气象电话简介 应急气象电话简介",
"https://www.gd121.cn/fw/yjkhd/list.shtml": "应急客户端",
"https://www.gd121.cn/kp/rdzt/list.shtml": "热点专题",
"https://tqkp.gd121.cn/pc/index.php/index/index": "2022年5.12全国防灾减灾日§to_be_recognized_by_visual_llm_https://www.gd121.cn/upload/image/2022/05/12/1652320917678096536.jpg§",
"http://gd.weather.com.cn/zt/cbzt/3164789.shtml": "中国天然氧吧龙门、揭西、连山、新兴§to_be_recognized_by_visual_llm_https://www.gd121.cn/upload/image/2021/03/04/1614835718147071414.jpg§",
"https://www.gd121.cn/fw/zyfw/list.shtml": "专业服务",
"https://www.gd121.cn/fw/zyfw/8a14d310645ec72401645ecbe8050ed7.shtml": "气候可行性论证业务 [详情] 气候可行性论证业务 根据《中华人民共和国气象法》等有关法律、法规的规定",
"https://www.gd121.cn/fw/zyfw/ff8080816e163941016e44adce5c018e.shtml": "气象短信服务介绍",
"https://www.gd121.cn/fw/zyfw/8a14d310645ec72401645ecbe4820ecb.shtml": "气候预测与应用",
"https://www.gd121.cn/fw/zyfw/8a14d310645ec72401645ecbe6340ed1.shtml": "防雷设计施工图纸技术审查、防雷装置检测",
"https://www.gd121.cn/fw/zyfw/8a14d310645ec72401645ecbe5ac0ecf.shtml": "雷电灾害调查鉴定",
"http://weibo.com/gztqtopnav=1&wvr=6&topsug=1": "§to_be_recognized_by_visual_llm_https://www.gd121.cn/img/weibo/gztq.jpg§",
"http://weibo.com/u/2294193132topnav=1&wvr=6&topsug=1": "广州天气",
"http://weibo.com/dgweathertopnav=1&wvr=6&topsug=1": "东莞天气",
"http://weibo.com/u/2294123212topnav=1&wvr=6&topsug=1": "江门天气",
"http://weibo.com/u/1871802012topnav=1&wvr=6&topsug=1": "深圳天气",
"http://weibo.com/u/2132837731topnav=1&wvr=6&topsug=1": "云浮天气",
"http://weibo.com/u/2138854722topnav=1&wvr=6&topsug=1": "揭阳天气",
"http://weibo.com/u/2203884274topnav=1&wvr=6&topsug=1": "潮州天气",
"http://weibo.com/u/1950449202topnav=1&wvr=6&topsug=1": "清远天气",
"http://weibo.com/u/2204732440topnav=1&wvr=6&topsug=1": "梅州天气",
"http://weibo.com/u/2189531372topnav=1&wvr=6&topsug=1": "肇庆天气",
"http://weibo.com/u/2118035822topnav=1&wvr=6&topsug=1": "湛江天气",
"http://weibo.com/u/2109335980topnav=1&wvr=6&topsug=1": "茂名天气",
"http://weibo.com/u/2127837823topnav=1&wvr=6&topsug=1": "阳江天气",
"http://weibo.com/u/2097116807topnav=1&wvr=6&topsug=1": "佛山天气",
"http://weibo.com/u/1980708047topnav=1&wvr=6&topsug=1": "中山天气",
"http://weibo.com/u/2163977800topnav=1&wvr=6&topsug=1": "汕尾天气",
"http://weibo.com/u/2288166234topnav=1&wvr=6&topsug=1": "惠州天气",
"http://weibo.com/u/2296404962topnav=1&wvr=6&topsug=1": "河源天气",
"http://weibo.com/u/2196955610topnav=1&wvr=6&topsug=1": "珠海天气",
"http://weibo.com/u/1993619844topnav=1&wvr=6&topsug=1": "韶关天气",
"http://weibo.com/u/2296975054topnav=1&wvr=6&topsug=1": "汕头天气",
"https://www.gd121.cn/share/pluginwea.do": "天气插件",
"https://www.gd121.cn/share/interface.doid=1": "数据接口",
"https://www.gd121.cn/fw/index.shtml": "公众服务 服务渠道",
"https://www.gd121.cn/kp/yjkp/list.shtml": "应急科普",
"https://www.gd121.cn/kp/zcfg/list.shtml": "政策法规",
"https://www.gd121.cn/share/": "数据共享",
"http://bszs.conac.cn/sitenamemethod=show&id=5a4b4aa9c22f62bee053022e1aacade4": "§to_be_recognized_by_visual_llm_https://www.gd121.cn/img/blue.png§",
"https://www.gbaweather.net/tc/": "粤港澳大湾区天气网站 友情链接",
"http://www.beian.gov.cn/portal/registersysteminforecordcode=44010402001842": "粤公网安备 44010402001842号 ©广东省气象公共服务中心版权所有 | 技术支持数鹏通LinkCM科技"
},
"text": "今天是2024年12月28日 登录/注册\n\n §to_be_recognized_by_visual_llm_ttps://www.gd121.cn/site3/images/fw/xsp/2015/05/28/7df410ca49aefb9b561ecfbbff21ce87.jpg§ 电子显示屏[Ref_1] 气象信息显示终端,以视频、声音、图片、文字等多媒体技术向公众发布气象预警、预报、实况等相关信息,为定...\n\n气象微博\n\n1. 1\n 2. 2\n\n广东气象微博群\n\n形式多样强大的自定义功能为您的网站量身定做个性化预警插件。\n\n我们会为您们提供最权威、最真实、最及时、最全面的数据服务与共享。\n\n广东省气象局官方客户端提供全省停课信号、天气预警、天气查询等服务。\n\n微信小程序提供基于位置的天气预报、预警服务。\n\n联系电话020-87664716地址广东省广州市天河区东莞庄路312号",
"text_link_map": {
"Ref_1": "https://www.gd121.cn/fw/xsp/8a14d310645ec72401645ed02adc1a0f.shtml",
"Ref_2": "https://www.gd121.cn/site3/images/fw/xsp/2015/05/28/7df410ca49aefb9b561ecfbbff21ce87.jpg"
}
}

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,50 @@
{
"link_dict": {
"https://www.gd121.cn/index.shtml": "首页 * \n * \n * ;)",
"https://www.gd121.cn/yj/index.shtml": "预警 * \n * \n * ;) 预警信息",
"https://www.gd121.cn/tq/jdtq_list.shtml": "旅游天气",
"https://www.gd121.cn/tq/jccz/list.shtml": "机场&车站",
"https://www.gd121.cn/tq/gqtj/index.shtml": "高清图集",
"https://www.gd121.cn/hd/yhfk/list.shtml": "用户反馈",
"https://www.gd121.cn/hd/sjsc/index.shtml": "实景上传",
"http://passport.gd121.cn/ucy_center/index": "用户中心",
"https://www.gd121.cn/zx/zhxx/list.shtml": "综合消息 资讯",
"https://www.gd121.cn/zx/zhxx/ff8080817a75a9a2017abcdfd97f01bd.shtml": "号台风“查帕卡”已生成 广东沿海有风雨 <<上一篇",
"https://www.gd121.cn/zx/zhxx/ff808081706d374701716c94668c062a.shtml": "日广东大部天晴 早晚寒凉 下一篇",
"https://www.gd121.cn/zx/yjxy/list.shtml": "更多>> 应急响应",
"https://www.gd121.cn/zx/yjxy/8a14d310645ec72401645edada382a17.shtml": "广东省气象局结束气象灾害暴雨Ⅳ级应急响应§to_be_recognized_by_visual_llm_https://www.gd121.cn/site3/images/zx/yjxy/2018/06/15/d0516ee2248b72cc40a4d9a067acd63e.jpg§ [详情] 广东省气象局结束气象灾害(暴雨)Ⅳ级应急响应 6月12日到14日珠江三角洲市县、粤西大部分市县出现了暴雨到大暴雨局部特大暴雨。预计15日",
"https://www.gd121.cn/zx/yjxy/8a14d310645ec72401645edadbb42a19.shtml": "广东省气象局启动气象灾害(暴雨)Ⅳ级... 广东省气象局启动气象灾害(暴雨)Ⅳ级应急响应",
"https://www.gd121.cn/zx/yjxy/8a14d310645ec72401645edadd2e2a1b.shtml": "广东省气象局结束气象灾害(暴雨)Ⅱ级... 广东省气象局结束气象灾害(暴雨)Ⅱ级应急响应",
"https://www.gd121.cn/zx/yjxy/8a14d310645ec72401645edade922a1d.shtml": "广东省气象局结束气象灾害(台风)Ⅲ级... 广东省气象局结束气象灾害(台风)Ⅲ级应急响应",
"https://www.gd121.cn/zx/yjxy/8a14d310645ec72401645edadfff2a1f.shtml": "广东省气象局启动气象灾害(暴雨)Ⅱ级... 广东省气象局启动气象灾害(暴雨)Ⅱ级应急响应",
"https://www.gd121.cn/zx/yjxy/8a14d310645ec72401645edae2d42a23.shtml": "广东省气象局升级气象灾害(台风)应急... 广东省气象局升级气象灾害(台风)应急响应为Ⅲ级",
"https://www.gd121.cn/zx/yjxy/8a14d310645ec72401645edae16f2a21.shtml": "广东省气象局启动气象灾害(台风)Ⅳ级... 广东省气象局启动气象灾害(台风)Ⅳ级应急响应",
"https://www.gd121.cn/zx/yjxy/8a14d310645ec72401645edae4362a25.shtml": "广东省气象局结束气象灾害(寒冷)Ⅲ级... 广东省气象局结束气象灾害(寒冷)Ⅲ级应急响应",
"https://www.gd121.cn/kp/rdzt/list.shtml": "更多>> 热点专题",
"https://tqkp.gd121.cn/pc/index.php/index/index": "2022年5.12全国防灾减灾日§to_be_recognized_by_visual_llm_https://www.gd121.cn/upload/image/2022/05/12/1652320917678096536.jpg§",
"http://gd.weather.com.cn/zt/cbzt/3164789.shtml": "中国天然氧吧龙门、揭西、连山、新兴§to_be_recognized_by_visual_llm_https://www.gd121.cn/upload/image/2021/03/04/1614835718147071414.jpg§",
"https://www.gd121.cn/kp/yjkp/list.shtml": "更多>> 应急科普 应急科普",
"https://www.gd121.cn/kp/yjkp/yjcs/zrzhsjcs/8a14d310645ec72401645ec9bdb20626.shtml": "都市中遇到洪水怎么办§to_be_recognized_by_visual_llm_https://www.gd121.cn/site3/images/kp/yjkp/yjcs/zrzhsjcs/2015/05/27/0c9363b3a37c43ac789b4b56664b2832.png§ 都市中遇到洪水怎么办",
"https://www.gd121.cn/kp/yjkp/yjcs/zrzhsjcs/list.shtml": "自然灾害",
"https://www.gd121.cn/kp/yjkp/yjcs/ggwssjcs/8a14d310645ec72401645ec9c64f065c.shtml": "中国疾控中心提示春节期间应做好传染病防御§to_be_recognized_by_visual_llm_https://www.gd121.cn/img/placeholder.png§ 中国疾控中心提示:春节期间应做好传染病防御",
"https://www.gd121.cn/kp/yjkp/yjcs/ggwssjcs/list.shtml": "公共卫生",
"https://www.gd121.cn/kp/yjkp/yjcs/shaqsjcs/8a14d310645ec72401645ec9c994066e.shtml": "购买、使用消防产品不容马虎§to_be_recognized_by_visual_llm_https://www.gd121.cn/img/placeholder.png§ 购买、使用消防产品不容马虎",
"https://www.gd121.cn/kp/yjkp/yjcs/shaqsjcs/list.shtml": "社会安全",
"https://www.gd121.cn/kp/yjkp/yjxh/list.shtml": "预警信号",
"https://www.gd121.cn/kp/yjkp/yjxh/8a14d310645ec72401645ec9c9f70670.shtml": "森林火险预警信号",
"https://www.gd121.cn/share/pluginwea.do": "天气插件",
"https://www.gd121.cn/share/interface.doid=1": "数据接口",
"https://www.gd121.cn/fw/yjkhd/8a14d310645ec72401645ecbea730edb.shtml": "停课铃APP",
"https://www.gd121.cn/fw/wx/ff8080816f16da59016f176859fa0019.shtml": "缤纷微天气",
"https://www.gd121.cn/fw/index.shtml": "公众服务 服务渠道",
"https://www.gd121.cn/kp/zcfg/list.shtml": "政策法规",
"https://www.gd121.cn/share/": "数据共享",
"http://bszs.conac.cn/sitenamemethod=show&id=5a4b4aa9c22f62bee053022e1aacade4": "§to_be_recognized_by_visual_llm_https://www.gd121.cn/img/blue.png§",
"https://www.gbaweather.net/tc/": "粤港澳大湾区天气网站 友情链接"
},
"text": "今天是2024年12月28日 登录/注册\n\n18-20日广东多云有雾 21-24日阴雨寒凉\n\n广东省GD121 | 2020-04-18\n\n17日白天到18日早晨广东西部市县多云间阴天局部有小雨东部市县晴天到多云部分市县早晚出现了轻雾。\n\n预计18-20日广东多云为主其中清远、韶关、肇庆大部市县有阵雨局部大雨全省温湿维持在较高水平早晚有能见度偏低。21-24日我省持续受冷空气影响日平均气温将逐步下降46℃全省转阴雨寒凉天气其中22-24日粤西、粤北和珠江三角洲大部市县有中到大雨部分市县暴雨降水雷雨时局地伴有8级左右短时大风。\n\n具体预报如下\n\n18-19日清远、韶关、肇庆有阵雨局部大雨和7级左右雷雨大风其余市县多云间阴天局部有阵雨。早晨最低气温粤西大部市县2124℃其余市县17℃21℃。\n\n20-21日粤北大部有中雨局部大雨到暴雨雷雨时局地伴有8级左右短时大风其余市县多云到阴天局部有阵雨。21日粤北市县气温小幅下降。\n\n22-24日粤西、粤北和珠江三角洲大部市县有中到大雨部分市县暴雨降水雷雨时局地伴有8级左右短时大风其余市县有阵雨局部大雨。全省气温明显下降。\n\n气象专家提醒早晚有雾时能见度较低请注意交通安全。\n\n文章关键字:\n\n用户评论\n\n匿名发表评论\n\n最新评论\n\n暂无评论快来评论吧\n\n查看更多评论>>\n\n广东省气象局结束气象灾害暴雨...\n\n1. 1\n 2. 2\n\n形式多样强大的自定义功能为您的网站量身定做个性化预警插件。\n\n我们会为您们提供最权威、最真实、最及时、最全面的数据服务与共享。\n\n广东省气象局官方客户端提供全省停课信号、天气预警、天气查询等服务。\n\n微信小程序提供基于位置的天气预报、预警服务。\n\n联系电话020-87664716地址广东省广州市天河区东莞庄路312号\n\n 粤ICP备05011356号[Ref_1] 粤公网安备 44010402001842号[Ref_2] ©广东省气象公共服务中心版权所有 | 技术支持数鹏通LinkCM科技",
"text_link_map": {
"Ref_1": "http://beian.miit.gov.cn",
"Ref_2": "http://www.beian.gov.cn/portal/registersysteminforecordcode=44010402001842"
}
}

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,59 @@
{
"link_dict": {
"https://news.bjx.com.cn/spdfl/": "输配电",
"https://news.bjx.com.cn/dlrjst/": "电力软件",
"https://news.bjx.com.cn/tgj/": "碳管家",
"https://news.bjx.com.cn/zhnyfw/": "综合能源服务",
"https://m.bjx.com.cn/xnjt/": "氢能交通",
"https://m.bjx.com.cn/jqz/": "加氢站",
"https://pubapinews.bjx.com.cn/kehuclickid=18438": "讲师招募!寻找光伏、风电、储能、环保行业讲师",
"https://news.bjx.com.cn/topics/jiaqingzhan/": "加氢站",
"https://m.bjx.com.cn/mnews/20241226/1419426.shtml": "内蒙古伊金霍洛旗这些加氢站项目建设快速推进蒙西正和集团前天",
"https://m.bjx.com.cn/mnews/20241225/1419055.shtml": "美锦能源子公司飞驰科技与佛山市在氢能产业方面保持紧密合作北极星氢能网12月25日",
"https://m.bjx.com.cn/mnews/20241225/1419013.shtml": "中国华电采购一座撬装加氢站中国华电集团12月25日",
"https://m.bjx.com.cn/mnews/20241224/1418915.shtml": "突发韩国忠州氢能公交车爆炸北极星氢能网12月24日",
"https://m.bjx.com.cn/mnews/20241224/1418735.shtml": "内蒙古包头白云鄂博矿铁矿区加氢站项目正式启动申能能创12月24日",
"https://m.bjx.com.cn/mnews/20241223/1418350.shtml": "天津到2027年规划加氢站35座天津市城市管理委员会12月23日",
"https://m.bjx.com.cn/mnews/20241218/1417657.shtml": "内蒙古鄂尔多斯电解水制氢项目可不进化工园区鄂尔多斯人大12月18日",
"https://m.bjx.com.cn/mnews/20241218/1417439.shtml": "北京支持氢能产业发展加氢站给予10元/公斤氢气运营补贴北京市经济和信息化局12月18日 北京支持氢能产业发展加氢站给予10元/公斤氢气运营补贴北京市经济和信息化局12月18日",
"https://m.bjx.com.cn/mnews/20241216/1416942.shtml": "加氢站运营补贴150万元/年浙江临海市氢能产业发展扶持政策征意见临海市人民政府12月16日",
"https://m.bjx.com.cn/mnews/20241212/1416532.shtml": "共98.23万元大连公示2023年加氢站补贴名单及资金分配方案大连市发展和改革委员会12月12日",
"https://m.bjx.com.cn/mnews/20241225/1419089.shtml": "大冶特钢已建成氢能加热炉北极星氢能网12月25日",
"https://m.bjx.com.cn/mnews/20241225/1419020.shtml": "美国能源部公布最新氢能计划北极星氢能网12月25日",
"https://m.bjx.com.cn/mnews/20241224/1418861.shtml": "套氢能装备2024年苏锡常首台重大装备名单公示苏州市工业和信息化局12月24日",
"https://m.bjx.com.cn/mnews/20241223/1418491.shtml": "万标方/小时PSA氢气提纯装置工程EPC总承包一标段招标中国招标投标公共服务平台12月23日",
"https://m.bjx.com.cn/mnews/20241220/1418153.shtml": "燕山石化成为华北最大燃料电池氢供应基地中国石化12月20日",
"https://m.bjx.com.cn/mnews/20241218/1417689.shtml": "韩国最大蓝氢项目遭抵制铂道12月18日",
"https://m.bjx.com.cn/mnews/20241218/1417568.shtml": "世界最大输量绿氢管道年底前完成勘察设计计划2026年底建成投运曹妃甸发布12月18日",
"https://m.bjx.com.cn/mnews/20241216/1417085.shtml": "中集安瑞科与香港中华煤气签约 共同推动香港氢能发展中集安瑞科12月16日",
"https://m.bjx.com.cn/mnews/20241213/1416717.shtml": "超180吨中原油田完成年度绿氢生产目标中石化中原油田12月13日",
"https://m.bjx.com.cn/mnews/20241227/1419674.shtml": "羚牛氢能与泰达控股签订战略合作协议 共建“天津氢港”!羚牛氢能昨天",
"https://m.bjx.com.cn/mnews/20241227/1419671.shtml": "中国石化资本投资入股徐工汽车!中国石化昨天 中国石化资本投资入股徐工汽车!",
"https://m.bjx.com.cn/mnews/20241227/1419616.shtml": "辆氢能重卡三期采购项目中标候选人公示!中国招标投标公共服务平台昨天",
"https://m.bjx.com.cn/mnews/20241227/1419610.shtml": "中石化海水电解制氢中试装置招标!中国招标投标公共服务平台昨天",
"https://m.bjx.com.cn/mnews/20241227/1419541.shtml": "中能建氢能公司改革新步伐收录国资委《千帆竞渡》!能建氢能昨天",
"https://m.bjx.com.cn/mnews/20241227/1419527.shtml": "中国能建华北院中标鄂尔多斯市绿氢与煤化工耦合碳减排创新示范项目!中国能建昨天",
"https://m.bjx.com.cn/mnews/20241227/1419508.shtml": "又一力作中国船舶712所200千瓦级船用氢燃料电池系统交付!中国船舶712所昨天 又一力作中国船舶712所200千瓦级船用氢燃料电池系统交付",
"https://m.bjx.com.cn/mnews/20241226/1419475.shtml": "氢燃料电池商用车整车项目签约落户西安!北极星氢能网前天",
"https://m.bjx.com.cn/mnews/20241226/1419473.shtml": "内蒙古达茂旗政府与国富氢能签约!北极星氢能网前天",
"https://m.bjx.com.cn/mnews/20241226/1419471.shtml": "辆氢能源环卫作业车辆租赁项目中标结果公示!中国政府采购网前天",
"https://m.bjx.com.cn/mnews/20241226/1419453.shtml": "阳光氢能开启电解槽“数智”蝶变阳光氢能科技前天",
"https://m.bjx.com.cn/mnews/20241227/1419591.shtml": "华电取得首个绿氨掺烧项目!",
"https://m.bjx.com.cn/mnews/20241227/1419599.shtml": "全球最大体量绿色氢氨醇一体化建设项目2.87亿贷款落地!",
"https://m.bjx.com.cn/mnews/20241227/1419727.shtml": "绿色燃料,未来已来!",
"https://dljob.bjx.com.cn/specials/101817.html": "新一线城市25届校招专场",
"https://dljob.bjx.com.cn/specials/101814.html": "研后大型专场招聘会",
"https://hr.bjx.com.cn/companys/9376.html": "中广核新能源投资(深圳)有限公司北京分公司",
"https://hr.bjx.com.cn/companys/61186.html": "中国能源建设集团科技发展有限公司",
"https://hr.bjx.com.cn/companys/60585.html": "广州发展新能源集团股份有限公司",
"https://hr.bjx.com.cn/companys/1504.html": "北极星招聘猎头",
"https://hr.bjx.com.cn/companys/39374.html": "广东省电力开发有限公司",
"https://edu.bjx.com.cn/s/67109308.html": "集中式光伏项目全流程支持性文件概述",
"https://edu.bjx.com.cn/s/67109294.html": "储能电站安全运维管理方案",
"https://edu.bjx.com.cn/s/67109353.html": "锂离子电池储能中的安全问题及应对技术(限时免费)",
"https://edu.bjx.com.cn/s/67109396.html": "光伏电站股权收购的流程和风险分析",
"https://edu.bjx.com.cn/s/67109299.html": "《零碳生态&绿色未来》系列公益公开课"
},
"text": "## 北极星氢能网获悉,近日,融通汽车集团能源事业部在其首个制氢加氢站项目建设工作中取得重大突破。该项目采用领先技术,极大降低了操作风险与设备维护难度,具有高效、环保、氢气纯度高的优势,可有效提升制氢效率与质量。同时,加氢环节运用智能精确计量与控制系统,能根据不同车辆需求精准加氢,保障\n\n##### 行业选择\n\n# 25元/公斤!湖北武汉这一加氢站即将正式投运\n\n2024-12-26 16:07 来源:融通汽车\n\n北极星氢能网获悉近日融通汽车集团能源事业部在其首个制氢加氢站项目建设工作中取得重大突破。该项目采用领先技术极大降低了操作风险与设备维护难度具有高效、环保、氢气纯度高的优势可有效提升制氢效率与质量。同时加氢环节运用智能精确计量与控制系统能根据不同车辆需求精准加氢保障了加氢过程的安全与高效运营大力推动氢能源于交通运输领域的广泛应用助力节能减排。\n\n在加氢站投入运营后将提供极具竞争力的加氢价格仅为25元/公斤这意味着武汉市正式进入25元氢气时代从而有力地促进氢能源车辆的销售与普及推动氢能源在交通领域的广泛应用为节能环保事业作出突出贡献。\n\n目前制氢装置和加氢设施的基础施工已经基本完成项目区域布置有先进的钻探设备和临时指挥设施施工现场秩序井然勘探工作高效展开。在过去几周内施工团队克服了复杂的地质条件及其他不确定因素完成了多轮地质勘探与评估为加氢站的地基建设和设备安装提供了科学依据。\n\n据悉该项目即将正式投入运营届时将助力加速湖北省氢能源网络布局为氢能源行业的规模化应用打下坚实基础。未来融通湖北能源科技有限责任公司将继续深化绿色能源技术研发与应用打造更多优质项目为双碳目标实现贡献。\n\n特别声明北极星转载其他网站内容出于传递更多信息而非盈利之目的同时并不代表赞成其观点或证实其描述内容仅供参考。版权归原作者所有若有侵权请联系我们删除。凡来源注明北极星*网的内容为北极星原创,转载需获授权。\n\n展开全文\n\n打开北极星学社APP阅读体验更佳\n\n0\n\n收藏\n\n#### 打开北极星学社APP查看更多相关报道\n\n* 加氢站\n * 氢气\n * 氢能\n\n####\n\n今日\n 本周\n 本月\n新闻排行榜\n\n打开北极星学社APP阅读体验更佳\n\n#### 打开北极星学社APP查看更多招聘\n\n绩效奖金带薪年假定期体检\n\n#### 打开北极星学社APP查看更多学社\n\n_* 点击空白区域关闭图片,双指拖动可放大图片,单指拖动可移动图片哦_",
"text_link_map": {}
}

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,92 @@
{
"link_dict": {
"https://cryptopanic.comfilter=rising": "Rising",
"https://cryptopanic.com/news/media": "Media Media Media Feed Coin | % 24h | % 1h | Price | Volume 24h | VIRTUAL | 5.35% | -1.33% | $3.59 | $472,728,350 \n---|---|---|---|--- \n HBAR | 5.16% | -0.39% | $0.288 | $764,501,368 \n SOL | 4.67% | -0.18% | $195.74 | $2,817,754,278 \n ADA | 3.46% | 0.33% | $0.900 | $600,788,037 \n SUI | 2.84% | -0.73% | $4.17 | $736,871,770 \n DOGE | 2.45% | -0.29% | $0.325 | $1,576,987,724 \n OM | 2.45% | -0.18% | $3.79 | $32,661,724 \n PEPE | 1.98% | -2.08% | $0.000 | $1,272,026,956 \n NEAR | 1.81% | -0.41% | $5.26 | $242,717,031 \n DOT | 1.78% | -0.59% | $7.00 | $219,423,524 \n BGB | -11.21% | -6.31% | $6.72 | $7,963,755 \n LEO | -2.57% | 0.15% | $9.05 | $830,479 \n XMR | -1.77% | -0.73% | $195.93 | $35,019,088 \n TAO | -0.70% | -1.35% | $461.09 | $80,023,037 \n TON | -0.64% | -0.77% | $5.74 | $164,208,788 \n UNI | -0.26% | -0.76% | $13.23 | $271,392,720 \n DAI | -0.04% | -0.08% | $1.000 | $120,874,342 \n CRO | -0.03% | -0.56% | $0.147 | $19,667,290 \n USDC | -0.03% | -0.03% | $1.000 | $3,814,410,360 \n USDT | -0.02% | -0.04% | $0.998 | $24,369,646,432 \n \n**CryptoPanic** is a news aggregator platform indicating impact on price and market for traders and cryptocurrency enthusiasts. Now includes , and",
"https://cryptopanic.com/polls/new": "Polls",
"https://cryptopanic.com/portfolio": "Portfolio Portfolio Tracker Coin | % 24h | % 1h | Price | Volume 24h | VIRTUAL | 5.35% | -1.33% | $3.59 | $472,728,350 \n---|---|---|---|--- \n HBAR | 5.16% | -0.39% | $0.288 | $764,501,368 \n SOL | 4.67% | -0.18% | $195.74 | $2,817,754,278 \n ADA | 3.46% | 0.33% | $0.900 | $600,788,037 \n SUI | 2.84% | -0.73% | $4.17 | $736,871,770 \n DOGE | 2.45% | -0.29% | $0.325 | $1,576,987,724 \n OM | 2.45% | -0.18% | $3.79 | $32,661,724 \n PEPE | 1.98% | -2.08% | $0.000 | $1,272,026,956 \n NEAR | 1.81% | -0.41% | $5.26 | $242,717,031 \n DOT | 1.78% | -0.59% | $7.00 | $219,423,524 \n BGB | -11.21% | -6.31% | $6.72 | $7,963,755 \n LEO | -2.57% | 0.15% | $9.05 | $830,479 \n XMR | -1.77% | -0.73% | $195.93 | $35,019,088 \n TAO | -0.70% | -1.35% | $461.09 | $80,023,037 \n TON | -0.64% | -0.77% | $5.74 | $164,208,788 \n UNI | -0.26% | -0.76% | $13.23 | $271,392,720 \n DAI | -0.04% | -0.08% | $1.000 | $120,874,342 \n CRO | -0.03% | -0.56% | $0.147 | $19,667,290 \n USDC | -0.03% | -0.03% | $1.000 | $3,814,410,360 \n USDT | -0.02% | -0.04% | $0.998 | $24,369,646,432 \n \n**CryptoPanic** is a news aggregator platform indicating impact on price and market for traders and cryptocurrency enthusiasts. Now includes , and",
"https://cryptopanic.com/pro": "Get PRO Become PRO",
"https://cryptopanic.com/accounts/login/": "Sign In Login Log in",
"https://cryptopanic.com/accounts/signup/": "Sign Up Sign Up",
"https://cryptopanic.com/pro/": "Go PRO Go PRO",
"https://cryptopanic.com/developers/": "Develop Developers Developers",
"https://cryptopanic.com/developers/api/about": "API",
"https://cryptopanic.com/developers/bots/": "Bots",
"https://cryptopanic.com/developers/widgets/": "Widgets",
"https://cryptopanic.com/careers/": "Careers Careers",
"https://cryptopanic.com/guides": "Guides",
"https://cryptopanic.com/partnership-hub": "Bizz Hub",
"https://cryptopanic.com/submit-source": "Submit",
"https://cryptopanic.com/news": "Top News Show all",
"https://cryptopanic.com/news/all": "All News",
"https://cryptopanic.com/news/blogs": "Blogs Blogs Coin | % 24h | % 1h | Price | Volume 24h | VIRTUAL | 5.35% | -1.33% | $3.59 | $472,728,350 \n---|---|---|---|--- \n HBAR | 5.16% | -0.39% | $0.288 | $764,501,368 \n SOL | 4.67% | -0.18% | $195.74 | $2,817,754,278 \n ADA | 3.46% | 0.33% | $0.900 | $600,788,037 \n SUI | 2.84% | -0.73% | $4.17 | $736,871,770 \n DOGE | 2.45% | -0.29% | $0.325 | $1,576,987,724 \n OM | 2.45% | -0.18% | $3.79 | $32,661,724 \n PEPE | 1.98% | -2.08% | $0.000 | $1,272,026,956 \n NEAR | 1.81% | -0.41% | $5.26 | $242,717,031 \n DOT | 1.78% | -0.59% | $7.00 | $219,423,524 \n BGB | -11.21% | -6.31% | $6.72 | $7,963,755 \n LEO | -2.57% | 0.15% | $9.05 | $830,479 \n XMR | -1.77% | -0.73% | $195.93 | $35,019,088 \n TAO | -0.70% | -1.35% | $461.09 | $80,023,037 \n TON | -0.64% | -0.77% | $5.74 | $164,208,788 \n UNI | -0.26% | -0.76% | $13.23 | $271,392,720 \n DAI | -0.04% | -0.08% | $1.000 | $120,874,342 \n CRO | -0.03% | -0.56% | $0.147 | $19,667,290 \n USDC | -0.03% | -0.03% | $1.000 | $3,814,410,360 \n USDT | -0.02% | -0.04% | $0.998 | $24,369,646,432 \n \n**CryptoPanic** is a news aggregator platform indicating impact on price and market for traders and cryptocurrency enthusiasts. Now includes , and",
"https://cryptopanic.com/news/news-sites": "News Sites",
"https://cryptopanic.com/news/following": "Following",
"https://cryptopanic.com/news/price-analysis": "Price Analysis",
"https://cryptopanic.com/news/regulation": "Regulation",
"https://cryptopanic.com/news/ico-news": "ICO News",
"https://cryptopanic.com/news/events": "Events",
"https://cryptopanic.com/newsfilter=commented": "Commented Recent Comments Show more",
"https://cryptopanic.com/newsfilter=hot": "Hot",
"https://cryptopanic.com/newsfilter=rising": "Rising Trending Show more",
"https://cryptopanic.com/newsfilter=bullish": "Bullish",
"https://cryptopanic.com/newsfilter=bearish": "Bearish",
"https://cryptopanic.com/newsfilter=important": "Important",
"https://cryptopanic.com/newsfilter=lol": "LOL",
"https://cryptopanic.com/newsfilter=saved": "Top Saved",
"https://cryptopanic.com/news/about": "About",
"https://cryptopanic.com/news/20482034/ripples-rlusd-stablecoin-explodes-with-106-growth-in-volumes-details": "Ripple's RLUSD Stablecoin Explodes With 106% Growth in Volumes: Details u.today",
"https://cryptopanic.com/news/ripple/": "XRP",
"https://cryptopanic.com/news/ripple-usd/": "RLUSD",
"https://cryptopanic.com/news/20477745/the-united-states-has-regained-leadership-in-the-cryptocurrency-market-what-does-this-indicate": "The United States has Regained Leadership in the Cryptocurrency Market. What Does This Indicate? spaziocrypto.com",
"https://cryptopanic.com/news/20477696/will-sam-bankman-fried-get-a-presidential-pardon-from-biden": "Will Sam Bankman-Fried Get a Presidential Pardon from Biden? coinpaprika.com",
"https://cryptopanic.com/news/20477579/bitget-merges-tokens-as-bgb-hits-record-high-amid-market-downturn": "Bitget Merges Tokens as BGB Hits Record High Amid Market Downturn coinpaprika.com",
"https://cryptopanic.com/news/bitget-token/": "BGB Bitget Token Coin | % 24h | % 1h | Price | Volume 24h | VIRTUAL | 5.35% | -1.33% | $3.59 | $472,728,350 \n---|---|---|---|--- \n HBAR | 5.16% | -0.39% | $0.288 | $764,501,368 \n SOL | 4.67% | -0.18% | $195.74 | $2,817,754,278 \n ADA | 3.46% | 0.33% | $0.900 | $600,788,037 \n SUI | 2.84% | -0.73% | $4.17 | $736,871,770 \n DOGE | 2.45% | -0.29% | $0.325 | $1,576,987,724 \n OM | 2.45% | -0.18% | $3.79 | $32,661,724 \n PEPE | 1.98% | -2.08% | $0.000 | $1,272,026,956 \n NEAR | 1.81% | -0.41% | $5.26 | $242,717,031 \n DOT | 1.78% | -0.59% | $7.00 | $219,423,524 \n BGB | -11.21% | -6.31% | $6.72 | $7,963,755 \n LEO | -2.57% | 0.15% | $9.05 | $830,479 \n XMR | -1.77% | -0.73% | $195.93 | $35,019,088 \n TAO | -0.70% | -1.35% | $461.09 | $80,023,037 \n TON | -0.64% | -0.77% | $5.74 | $164,208,788 \n UNI | -0.26% | -0.76% | $13.23 | $271,392,720 \n DAI | -0.04% | -0.08% | $1.000 | $120,874,342 \n CRO | -0.03% | -0.56% | $0.147 | $19,667,290 \n USDC | -0.03% | -0.03% | $1.000 | $3,814,410,360 \n USDT | -0.02% | -0.04% | $0.998 | $24,369,646,432 \n \n**CryptoPanic** is a news aggregator platform indicating impact on price and market for traders and cryptocurrency enthusiasts. Now includes , and",
"https://cryptopanic.com/news/bitget-wallet-token/": "BWB",
"https://cryptopanic.com/news/20477541/vitalik-buterin-donates-88-eth-to-adopt-moo-deng": "Vitalik Buterin Donates 88 ETH to Adopt Moo Deng coinpaprika.com",
"https://cryptopanic.com/news/ethereum/": "ETH ETH",
"https://cryptopanic.com/news/bridged-wrapped-ether-starkgate/": "ETH",
"https://cryptopanic.com/news/moo-deng/": "MOODENG",
"https://cryptopanic.com/news/20476318/sec-may-shift-approach-to-crypto-under-new-leadership": "SEC May Shift Approach to Crypto Under New Leadership coinpaprika.com",
"https://cryptopanic.com/news/20471491/bitcoin-etfs-face-12b-outflows-ethereum-sees-130m-inflows": "Bitcoin ETFs Face $1.2B Outflows, Ethereum Sees $130M Inflows coinpaprika.com",
"https://cryptopanic.com/news/bitcoin/": "BTC",
"https://cryptopanic.com/polls": "Trending Poll Show more",
"https://cryptopanic.com/polls/20366587/will-bitcoin-pump-because-donald-trumps-presidential-inauguration": "Will bitcoin pump because Donald trump's presidential inauguration? ### 3w ago by Vote 268 votes",
"https://cryptopanic.com/47_m.naufal/": "m.naufal ### 3w ago by",
"https://cryptopanic.com/news/virtual-protocol/": "Virtuals Protocol Coin | % 24h | % 1h | Price | Volume 24h | VIRTUAL | 5.35% | -1.33% | $3.59 | $472,728,350 \n---|---|---|---|--- \n HBAR | 5.16% | -0.39% | $0.288 | $764,501,368 \n SOL | 4.67% | -0.18% | $195.74 | $2,817,754,278 \n ADA | 3.46% | 0.33% | $0.900 | $600,788,037 \n SUI | 2.84% | -0.73% | $4.17 | $736,871,770 \n DOGE | 2.45% | -0.29% | $0.325 | $1,576,987,724 \n OM | 2.45% | -0.18% | $3.79 | $32,661,724 \n PEPE | 1.98% | -2.08% | $0.000 | $1,272,026,956 \n NEAR | 1.81% | -0.41% | $5.26 | $242,717,031 \n DOT | 1.78% | -0.59% | $7.00 | $219,423,524 \n BGB | -11.21% | -6.31% | $6.72 | $7,963,755 \n LEO | -2.57% | 0.15% | $9.05 | $830,479 \n XMR | -1.77% | -0.73% | $195.93 | $35,019,088 \n TAO | -0.70% | -1.35% | $461.09 | $80,023,037 \n TON | -0.64% | -0.77% | $5.74 | $164,208,788 \n UNI | -0.26% | -0.76% | $13.23 | $271,392,720 \n DAI | -0.04% | -0.08% | $1.000 | $120,874,342 \n CRO | -0.03% | -0.56% | $0.147 | $19,667,290 \n USDC | -0.03% | -0.03% | $1.000 | $3,814,410,360 \n USDT | -0.02% | -0.04% | $0.998 | $24,369,646,432 \n \n**CryptoPanic** is a news aggregator platform indicating impact on price and market for traders and cryptocurrency enthusiasts. Now includes , and",
"https://cryptopanic.com/news/hedera-hashgraph/": "Hedera Coin | % 24h | % 1h | Price | Volume 24h | VIRTUAL | 5.35% | -1.33% | $3.59 | $472,728,350 \n---|---|---|---|--- \n HBAR | 5.16% | -0.39% | $0.288 | $764,501,368 \n SOL | 4.67% | -0.18% | $195.74 | $2,817,754,278 \n ADA | 3.46% | 0.33% | $0.900 | $600,788,037 \n SUI | 2.84% | -0.73% | $4.17 | $736,871,770 \n DOGE | 2.45% | -0.29% | $0.325 | $1,576,987,724 \n OM | 2.45% | -0.18% | $3.79 | $32,661,724 \n PEPE | 1.98% | -2.08% | $0.000 | $1,272,026,956 \n NEAR | 1.81% | -0.41% | $5.26 | $242,717,031 \n DOT | 1.78% | -0.59% | $7.00 | $219,423,524 \n BGB | -11.21% | -6.31% | $6.72 | $7,963,755 \n LEO | -2.57% | 0.15% | $9.05 | $830,479 \n XMR | -1.77% | -0.73% | $195.93 | $35,019,088 \n TAO | -0.70% | -1.35% | $461.09 | $80,023,037 \n TON | -0.64% | -0.77% | $5.74 | $164,208,788 \n UNI | -0.26% | -0.76% | $13.23 | $271,392,720 \n DAI | -0.04% | -0.08% | $1.000 | $120,874,342 \n CRO | -0.03% | -0.56% | $0.147 | $19,667,290 \n USDC | -0.03% | -0.03% | $1.000 | $3,814,410,360 \n USDT | -0.02% | -0.04% | $0.998 | $24,369,646,432 \n \n**CryptoPanic** is a news aggregator platform indicating impact on price and market for traders and cryptocurrency enthusiasts. Now includes , and",
"https://cryptopanic.com/news/solana/": "Solana Coin | % 24h | % 1h | Price | Volume 24h | VIRTUAL | 5.35% | -1.33% | $3.59 | $472,728,350 \n---|---|---|---|--- \n HBAR | 5.16% | -0.39% | $0.288 | $764,501,368 \n SOL | 4.67% | -0.18% | $195.74 | $2,817,754,278 \n ADA | 3.46% | 0.33% | $0.900 | $600,788,037 \n SUI | 2.84% | -0.73% | $4.17 | $736,871,770 \n DOGE | 2.45% | -0.29% | $0.325 | $1,576,987,724 \n OM | 2.45% | -0.18% | $3.79 | $32,661,724 \n PEPE | 1.98% | -2.08% | $0.000 | $1,272,026,956 \n NEAR | 1.81% | -0.41% | $5.26 | $242,717,031 \n DOT | 1.78% | -0.59% | $7.00 | $219,423,524 \n BGB | -11.21% | -6.31% | $6.72 | $7,963,755 \n LEO | -2.57% | 0.15% | $9.05 | $830,479 \n XMR | -1.77% | -0.73% | $195.93 | $35,019,088 \n TAO | -0.70% | -1.35% | $461.09 | $80,023,037 \n TON | -0.64% | -0.77% | $5.74 | $164,208,788 \n UNI | -0.26% | -0.76% | $13.23 | $271,392,720 \n DAI | -0.04% | -0.08% | $1.000 | $120,874,342 \n CRO | -0.03% | -0.56% | $0.147 | $19,667,290 \n USDC | -0.03% | -0.03% | $1.000 | $3,814,410,360 \n USDT | -0.02% | -0.04% | $0.998 | $24,369,646,432 \n \n**CryptoPanic** is a news aggregator platform indicating impact on price and market for traders and cryptocurrency enthusiasts. Now includes , and",
"https://cryptopanic.com/news/cardano/": "Cardano Coin | % 24h | % 1h | Price | Volume 24h | VIRTUAL | 5.35% | -1.33% | $3.59 | $472,728,350 \n---|---|---|---|--- \n HBAR | 5.16% | -0.39% | $0.288 | $764,501,368 \n SOL | 4.67% | -0.18% | $195.74 | $2,817,754,278 \n ADA | 3.46% | 0.33% | $0.900 | $600,788,037 \n SUI | 2.84% | -0.73% | $4.17 | $736,871,770 \n DOGE | 2.45% | -0.29% | $0.325 | $1,576,987,724 \n OM | 2.45% | -0.18% | $3.79 | $32,661,724 \n PEPE | 1.98% | -2.08% | $0.000 | $1,272,026,956 \n NEAR | 1.81% | -0.41% | $5.26 | $242,717,031 \n DOT | 1.78% | -0.59% | $7.00 | $219,423,524 \n BGB | -11.21% | -6.31% | $6.72 | $7,963,755 \n LEO | -2.57% | 0.15% | $9.05 | $830,479 \n XMR | -1.77% | -0.73% | $195.93 | $35,019,088 \n TAO | -0.70% | -1.35% | $461.09 | $80,023,037 \n TON | -0.64% | -0.77% | $5.74 | $164,208,788 \n UNI | -0.26% | -0.76% | $13.23 | $271,392,720 \n DAI | -0.04% | -0.08% | $1.000 | $120,874,342 \n CRO | -0.03% | -0.56% | $0.147 | $19,667,290 \n USDC | -0.03% | -0.03% | $1.000 | $3,814,410,360 \n USDT | -0.02% | -0.04% | $0.998 | $24,369,646,432 \n \n**CryptoPanic** is a news aggregator platform indicating impact on price and market for traders and cryptocurrency enthusiasts. Now includes , and",
"https://cryptopanic.com/news/sui/": "Sui Coin | % 24h | % 1h | Price | Volume 24h | VIRTUAL | 5.35% | -1.33% | $3.59 | $472,728,350 \n---|---|---|---|--- \n HBAR | 5.16% | -0.39% | $0.288 | $764,501,368 \n SOL | 4.67% | -0.18% | $195.74 | $2,817,754,278 \n ADA | 3.46% | 0.33% | $0.900 | $600,788,037 \n SUI | 2.84% | -0.73% | $4.17 | $736,871,770 \n DOGE | 2.45% | -0.29% | $0.325 | $1,576,987,724 \n OM | 2.45% | -0.18% | $3.79 | $32,661,724 \n PEPE | 1.98% | -2.08% | $0.000 | $1,272,026,956 \n NEAR | 1.81% | -0.41% | $5.26 | $242,717,031 \n DOT | 1.78% | -0.59% | $7.00 | $219,423,524 \n BGB | -11.21% | -6.31% | $6.72 | $7,963,755 \n LEO | -2.57% | 0.15% | $9.05 | $830,479 \n XMR | -1.77% | -0.73% | $195.93 | $35,019,088 \n TAO | -0.70% | -1.35% | $461.09 | $80,023,037 \n TON | -0.64% | -0.77% | $5.74 | $164,208,788 \n UNI | -0.26% | -0.76% | $13.23 | $271,392,720 \n DAI | -0.04% | -0.08% | $1.000 | $120,874,342 \n CRO | -0.03% | -0.56% | $0.147 | $19,667,290 \n USDC | -0.03% | -0.03% | $1.000 | $3,814,410,360 \n USDT | -0.02% | -0.04% | $0.998 | $24,369,646,432 \n \n**CryptoPanic** is a news aggregator platform indicating impact on price and market for traders and cryptocurrency enthusiasts. Now includes , and",
"https://cryptopanic.com/news/dogecoin/": "Dogecoin Coin | % 24h | % 1h | Price | Volume 24h | VIRTUAL | 5.35% | -1.33% | $3.59 | $472,728,350 \n---|---|---|---|--- \n HBAR | 5.16% | -0.39% | $0.288 | $764,501,368 \n SOL | 4.67% | -0.18% | $195.74 | $2,817,754,278 \n ADA | 3.46% | 0.33% | $0.900 | $600,788,037 \n SUI | 2.84% | -0.73% | $4.17 | $736,871,770 \n DOGE | 2.45% | -0.29% | $0.325 | $1,576,987,724 \n OM | 2.45% | -0.18% | $3.79 | $32,661,724 \n PEPE | 1.98% | -2.08% | $0.000 | $1,272,026,956 \n NEAR | 1.81% | -0.41% | $5.26 | $242,717,031 \n DOT | 1.78% | -0.59% | $7.00 | $219,423,524 \n BGB | -11.21% | -6.31% | $6.72 | $7,963,755 \n LEO | -2.57% | 0.15% | $9.05 | $830,479 \n XMR | -1.77% | -0.73% | $195.93 | $35,019,088 \n TAO | -0.70% | -1.35% | $461.09 | $80,023,037 \n TON | -0.64% | -0.77% | $5.74 | $164,208,788 \n UNI | -0.26% | -0.76% | $13.23 | $271,392,720 \n DAI | -0.04% | -0.08% | $1.000 | $120,874,342 \n CRO | -0.03% | -0.56% | $0.147 | $19,667,290 \n USDC | -0.03% | -0.03% | $1.000 | $3,814,410,360 \n USDT | -0.02% | -0.04% | $0.998 | $24,369,646,432 \n \n**CryptoPanic** is a news aggregator platform indicating impact on price and market for traders and cryptocurrency enthusiasts. Now includes , and",
"https://cryptopanic.com/news/mantra-dao/": "MANTRA Coin | % 24h | % 1h | Price | Volume 24h | VIRTUAL | 5.35% | -1.33% | $3.59 | $472,728,350 \n---|---|---|---|--- \n HBAR | 5.16% | -0.39% | $0.288 | $764,501,368 \n SOL | 4.67% | -0.18% | $195.74 | $2,817,754,278 \n ADA | 3.46% | 0.33% | $0.900 | $600,788,037 \n SUI | 2.84% | -0.73% | $4.17 | $736,871,770 \n DOGE | 2.45% | -0.29% | $0.325 | $1,576,987,724 \n OM | 2.45% | -0.18% | $3.79 | $32,661,724 \n PEPE | 1.98% | -2.08% | $0.000 | $1,272,026,956 \n NEAR | 1.81% | -0.41% | $5.26 | $242,717,031 \n DOT | 1.78% | -0.59% | $7.00 | $219,423,524 \n BGB | -11.21% | -6.31% | $6.72 | $7,963,755 \n LEO | -2.57% | 0.15% | $9.05 | $830,479 \n XMR | -1.77% | -0.73% | $195.93 | $35,019,088 \n TAO | -0.70% | -1.35% | $461.09 | $80,023,037 \n TON | -0.64% | -0.77% | $5.74 | $164,208,788 \n UNI | -0.26% | -0.76% | $13.23 | $271,392,720 \n DAI | -0.04% | -0.08% | $1.000 | $120,874,342 \n CRO | -0.03% | -0.56% | $0.147 | $19,667,290 \n USDC | -0.03% | -0.03% | $1.000 | $3,814,410,360 \n USDT | -0.02% | -0.04% | $0.998 | $24,369,646,432 \n \n**CryptoPanic** is a news aggregator platform indicating impact on price and market for traders and cryptocurrency enthusiasts. Now includes , and",
"https://cryptopanic.com/news/pepe/": "Pepe Coin | % 24h | % 1h | Price | Volume 24h | VIRTUAL | 5.35% | -1.33% | $3.59 | $472,728,350 \n---|---|---|---|--- \n HBAR | 5.16% | -0.39% | $0.288 | $764,501,368 \n SOL | 4.67% | -0.18% | $195.74 | $2,817,754,278 \n ADA | 3.46% | 0.33% | $0.900 | $600,788,037 \n SUI | 2.84% | -0.73% | $4.17 | $736,871,770 \n DOGE | 2.45% | -0.29% | $0.325 | $1,576,987,724 \n OM | 2.45% | -0.18% | $3.79 | $32,661,724 \n PEPE | 1.98% | -2.08% | $0.000 | $1,272,026,956 \n NEAR | 1.81% | -0.41% | $5.26 | $242,717,031 \n DOT | 1.78% | -0.59% | $7.00 | $219,423,524 \n BGB | -11.21% | -6.31% | $6.72 | $7,963,755 \n LEO | -2.57% | 0.15% | $9.05 | $830,479 \n XMR | -1.77% | -0.73% | $195.93 | $35,019,088 \n TAO | -0.70% | -1.35% | $461.09 | $80,023,037 \n TON | -0.64% | -0.77% | $5.74 | $164,208,788 \n UNI | -0.26% | -0.76% | $13.23 | $271,392,720 \n DAI | -0.04% | -0.08% | $1.000 | $120,874,342 \n CRO | -0.03% | -0.56% | $0.147 | $19,667,290 \n USDC | -0.03% | -0.03% | $1.000 | $3,814,410,360 \n USDT | -0.02% | -0.04% | $0.998 | $24,369,646,432 \n \n**CryptoPanic** is a news aggregator platform indicating impact on price and market for traders and cryptocurrency enthusiasts. Now includes , and",
"https://cryptopanic.com/news/near/": "NEAR Protocol Coin | % 24h | % 1h | Price | Volume 24h | VIRTUAL | 5.35% | -1.33% | $3.59 | $472,728,350 \n---|---|---|---|--- \n HBAR | 5.16% | -0.39% | $0.288 | $764,501,368 \n SOL | 4.67% | -0.18% | $195.74 | $2,817,754,278 \n ADA | 3.46% | 0.33% | $0.900 | $600,788,037 \n SUI | 2.84% | -0.73% | $4.17 | $736,871,770 \n DOGE | 2.45% | -0.29% | $0.325 | $1,576,987,724 \n OM | 2.45% | -0.18% | $3.79 | $32,661,724 \n PEPE | 1.98% | -2.08% | $0.000 | $1,272,026,956 \n NEAR | 1.81% | -0.41% | $5.26 | $242,717,031 \n DOT | 1.78% | -0.59% | $7.00 | $219,423,524 \n BGB | -11.21% | -6.31% | $6.72 | $7,963,755 \n LEO | -2.57% | 0.15% | $9.05 | $830,479 \n XMR | -1.77% | -0.73% | $195.93 | $35,019,088 \n TAO | -0.70% | -1.35% | $461.09 | $80,023,037 \n TON | -0.64% | -0.77% | $5.74 | $164,208,788 \n UNI | -0.26% | -0.76% | $13.23 | $271,392,720 \n DAI | -0.04% | -0.08% | $1.000 | $120,874,342 \n CRO | -0.03% | -0.56% | $0.147 | $19,667,290 \n USDC | -0.03% | -0.03% | $1.000 | $3,814,410,360 \n USDT | -0.02% | -0.04% | $0.998 | $24,369,646,432 \n \n**CryptoPanic** is a news aggregator platform indicating impact on price and market for traders and cryptocurrency enthusiasts. Now includes , and",
"https://cryptopanic.com/news/polkadot/": "Polkadot Coin | % 24h | % 1h | Price | Volume 24h | VIRTUAL | 5.35% | -1.33% | $3.59 | $472,728,350 \n---|---|---|---|--- \n HBAR | 5.16% | -0.39% | $0.288 | $764,501,368 \n SOL | 4.67% | -0.18% | $195.74 | $2,817,754,278 \n ADA | 3.46% | 0.33% | $0.900 | $600,788,037 \n SUI | 2.84% | -0.73% | $4.17 | $736,871,770 \n DOGE | 2.45% | -0.29% | $0.325 | $1,576,987,724 \n OM | 2.45% | -0.18% | $3.79 | $32,661,724 \n PEPE | 1.98% | -2.08% | $0.000 | $1,272,026,956 \n NEAR | 1.81% | -0.41% | $5.26 | $242,717,031 \n DOT | 1.78% | -0.59% | $7.00 | $219,423,524 \n BGB | -11.21% | -6.31% | $6.72 | $7,963,755 \n LEO | -2.57% | 0.15% | $9.05 | $830,479 \n XMR | -1.77% | -0.73% | $195.93 | $35,019,088 \n TAO | -0.70% | -1.35% | $461.09 | $80,023,037 \n TON | -0.64% | -0.77% | $5.74 | $164,208,788 \n UNI | -0.26% | -0.76% | $13.23 | $271,392,720 \n DAI | -0.04% | -0.08% | $1.000 | $120,874,342 \n CRO | -0.03% | -0.56% | $0.147 | $19,667,290 \n USDC | -0.03% | -0.03% | $1.000 | $3,814,410,360 \n USDT | -0.02% | -0.04% | $0.998 | $24,369,646,432 \n \n**CryptoPanic** is a news aggregator platform indicating impact on price and market for traders and cryptocurrency enthusiasts. Now includes , and",
"https://cryptopanic.com/news/leo-token/": "LEO Token Coin | % 24h | % 1h | Price | Volume 24h | VIRTUAL | 5.35% | -1.33% | $3.59 | $472,728,350 \n---|---|---|---|--- \n HBAR | 5.16% | -0.39% | $0.288 | $764,501,368 \n SOL | 4.67% | -0.18% | $195.74 | $2,817,754,278 \n ADA | 3.46% | 0.33% | $0.900 | $600,788,037 \n SUI | 2.84% | -0.73% | $4.17 | $736,871,770 \n DOGE | 2.45% | -0.29% | $0.325 | $1,576,987,724 \n OM | 2.45% | -0.18% | $3.79 | $32,661,724 \n PEPE | 1.98% | -2.08% | $0.000 | $1,272,026,956 \n NEAR | 1.81% | -0.41% | $5.26 | $242,717,031 \n DOT | 1.78% | -0.59% | $7.00 | $219,423,524 \n BGB | -11.21% | -6.31% | $6.72 | $7,963,755 \n LEO | -2.57% | 0.15% | $9.05 | $830,479 \n XMR | -1.77% | -0.73% | $195.93 | $35,019,088 \n TAO | -0.70% | -1.35% | $461.09 | $80,023,037 \n TON | -0.64% | -0.77% | $5.74 | $164,208,788 \n UNI | -0.26% | -0.76% | $13.23 | $271,392,720 \n DAI | -0.04% | -0.08% | $1.000 | $120,874,342 \n CRO | -0.03% | -0.56% | $0.147 | $19,667,290 \n USDC | -0.03% | -0.03% | $1.000 | $3,814,410,360 \n USDT | -0.02% | -0.04% | $0.998 | $24,369,646,432 \n \n**CryptoPanic** is a news aggregator platform indicating impact on price and market for traders and cryptocurrency enthusiasts. Now includes , and",
"https://cryptopanic.com/news/monero/": "Monero Coin | % 24h | % 1h | Price | Volume 24h | VIRTUAL | 5.35% | -1.33% | $3.59 | $472,728,350 \n---|---|---|---|--- \n HBAR | 5.16% | -0.39% | $0.288 | $764,501,368 \n SOL | 4.67% | -0.18% | $195.74 | $2,817,754,278 \n ADA | 3.46% | 0.33% | $0.900 | $600,788,037 \n SUI | 2.84% | -0.73% | $4.17 | $736,871,770 \n DOGE | 2.45% | -0.29% | $0.325 | $1,576,987,724 \n OM | 2.45% | -0.18% | $3.79 | $32,661,724 \n PEPE | 1.98% | -2.08% | $0.000 | $1,272,026,956 \n NEAR | 1.81% | -0.41% | $5.26 | $242,717,031 \n DOT | 1.78% | -0.59% | $7.00 | $219,423,524 \n BGB | -11.21% | -6.31% | $6.72 | $7,963,755 \n LEO | -2.57% | 0.15% | $9.05 | $830,479 \n XMR | -1.77% | -0.73% | $195.93 | $35,019,088 \n TAO | -0.70% | -1.35% | $461.09 | $80,023,037 \n TON | -0.64% | -0.77% | $5.74 | $164,208,788 \n UNI | -0.26% | -0.76% | $13.23 | $271,392,720 \n DAI | -0.04% | -0.08% | $1.000 | $120,874,342 \n CRO | -0.03% | -0.56% | $0.147 | $19,667,290 \n USDC | -0.03% | -0.03% | $1.000 | $3,814,410,360 \n USDT | -0.02% | -0.04% | $0.998 | $24,369,646,432 \n \n**CryptoPanic** is a news aggregator platform indicating impact on price and market for traders and cryptocurrency enthusiasts. Now includes , and",
"https://cryptopanic.com/news/bittensor/": "Bittensor Coin | % 24h | % 1h | Price | Volume 24h | VIRTUAL | 5.35% | -1.33% | $3.59 | $472,728,350 \n---|---|---|---|--- \n HBAR | 5.16% | -0.39% | $0.288 | $764,501,368 \n SOL | 4.67% | -0.18% | $195.74 | $2,817,754,278 \n ADA | 3.46% | 0.33% | $0.900 | $600,788,037 \n SUI | 2.84% | -0.73% | $4.17 | $736,871,770 \n DOGE | 2.45% | -0.29% | $0.325 | $1,576,987,724 \n OM | 2.45% | -0.18% | $3.79 | $32,661,724 \n PEPE | 1.98% | -2.08% | $0.000 | $1,272,026,956 \n NEAR | 1.81% | -0.41% | $5.26 | $242,717,031 \n DOT | 1.78% | -0.59% | $7.00 | $219,423,524 \n BGB | -11.21% | -6.31% | $6.72 | $7,963,755 \n LEO | -2.57% | 0.15% | $9.05 | $830,479 \n XMR | -1.77% | -0.73% | $195.93 | $35,019,088 \n TAO | -0.70% | -1.35% | $461.09 | $80,023,037 \n TON | -0.64% | -0.77% | $5.74 | $164,208,788 \n UNI | -0.26% | -0.76% | $13.23 | $271,392,720 \n DAI | -0.04% | -0.08% | $1.000 | $120,874,342 \n CRO | -0.03% | -0.56% | $0.147 | $19,667,290 \n USDC | -0.03% | -0.03% | $1.000 | $3,814,410,360 \n USDT | -0.02% | -0.04% | $0.998 | $24,369,646,432 \n \n**CryptoPanic** is a news aggregator platform indicating impact on price and market for traders and cryptocurrency enthusiasts. Now includes , and",
"https://cryptopanic.com/news/the-open-network/": "Toncoin Coin | % 24h | % 1h | Price | Volume 24h | VIRTUAL | 5.35% | -1.33% | $3.59 | $472,728,350 \n---|---|---|---|--- \n HBAR | 5.16% | -0.39% | $0.288 | $764,501,368 \n SOL | 4.67% | -0.18% | $195.74 | $2,817,754,278 \n ADA | 3.46% | 0.33% | $0.900 | $600,788,037 \n SUI | 2.84% | -0.73% | $4.17 | $736,871,770 \n DOGE | 2.45% | -0.29% | $0.325 | $1,576,987,724 \n OM | 2.45% | -0.18% | $3.79 | $32,661,724 \n PEPE | 1.98% | -2.08% | $0.000 | $1,272,026,956 \n NEAR | 1.81% | -0.41% | $5.26 | $242,717,031 \n DOT | 1.78% | -0.59% | $7.00 | $219,423,524 \n BGB | -11.21% | -6.31% | $6.72 | $7,963,755 \n LEO | -2.57% | 0.15% | $9.05 | $830,479 \n XMR | -1.77% | -0.73% | $195.93 | $35,019,088 \n TAO | -0.70% | -1.35% | $461.09 | $80,023,037 \n TON | -0.64% | -0.77% | $5.74 | $164,208,788 \n UNI | -0.26% | -0.76% | $13.23 | $271,392,720 \n DAI | -0.04% | -0.08% | $1.000 | $120,874,342 \n CRO | -0.03% | -0.56% | $0.147 | $19,667,290 \n USDC | -0.03% | -0.03% | $1.000 | $3,814,410,360 \n USDT | -0.02% | -0.04% | $0.998 | $24,369,646,432 \n \n**CryptoPanic** is a news aggregator platform indicating impact on price and market for traders and cryptocurrency enthusiasts. Now includes , and",
"https://cryptopanic.com/news/uniswap/": "Uniswap Coin | % 24h | % 1h | Price | Volume 24h | VIRTUAL | 5.35% | -1.33% | $3.59 | $472,728,350 \n---|---|---|---|--- \n HBAR | 5.16% | -0.39% | $0.288 | $764,501,368 \n SOL | 4.67% | -0.18% | $195.74 | $2,817,754,278 \n ADA | 3.46% | 0.33% | $0.900 | $600,788,037 \n SUI | 2.84% | -0.73% | $4.17 | $736,871,770 \n DOGE | 2.45% | -0.29% | $0.325 | $1,576,987,724 \n OM | 2.45% | -0.18% | $3.79 | $32,661,724 \n PEPE | 1.98% | -2.08% | $0.000 | $1,272,026,956 \n NEAR | 1.81% | -0.41% | $5.26 | $242,717,031 \n DOT | 1.78% | -0.59% | $7.00 | $219,423,524 \n BGB | -11.21% | -6.31% | $6.72 | $7,963,755 \n LEO | -2.57% | 0.15% | $9.05 | $830,479 \n XMR | -1.77% | -0.73% | $195.93 | $35,019,088 \n TAO | -0.70% | -1.35% | $461.09 | $80,023,037 \n TON | -0.64% | -0.77% | $5.74 | $164,208,788 \n UNI | -0.26% | -0.76% | $13.23 | $271,392,720 \n DAI | -0.04% | -0.08% | $1.000 | $120,874,342 \n CRO | -0.03% | -0.56% | $0.147 | $19,667,290 \n USDC | -0.03% | -0.03% | $1.000 | $3,814,410,360 \n USDT | -0.02% | -0.04% | $0.998 | $24,369,646,432 \n \n**CryptoPanic** is a news aggregator platform indicating impact on price and market for traders and cryptocurrency enthusiasts. Now includes , and",
"https://cryptopanic.com/news/dai/": "Dai Coin | % 24h | % 1h | Price | Volume 24h | VIRTUAL | 5.35% | -1.33% | $3.59 | $472,728,350 \n---|---|---|---|--- \n HBAR | 5.16% | -0.39% | $0.288 | $764,501,368 \n SOL | 4.67% | -0.18% | $195.74 | $2,817,754,278 \n ADA | 3.46% | 0.33% | $0.900 | $600,788,037 \n SUI | 2.84% | -0.73% | $4.17 | $736,871,770 \n DOGE | 2.45% | -0.29% | $0.325 | $1,576,987,724 \n OM | 2.45% | -0.18% | $3.79 | $32,661,724 \n PEPE | 1.98% | -2.08% | $0.000 | $1,272,026,956 \n NEAR | 1.81% | -0.41% | $5.26 | $242,717,031 \n DOT | 1.78% | -0.59% | $7.00 | $219,423,524 \n BGB | -11.21% | -6.31% | $6.72 | $7,963,755 \n LEO | -2.57% | 0.15% | $9.05 | $830,479 \n XMR | -1.77% | -0.73% | $195.93 | $35,019,088 \n TAO | -0.70% | -1.35% | $461.09 | $80,023,037 \n TON | -0.64% | -0.77% | $5.74 | $164,208,788 \n UNI | -0.26% | -0.76% | $13.23 | $271,392,720 \n DAI | -0.04% | -0.08% | $1.000 | $120,874,342 \n CRO | -0.03% | -0.56% | $0.147 | $19,667,290 \n USDC | -0.03% | -0.03% | $1.000 | $3,814,410,360 \n USDT | -0.02% | -0.04% | $0.998 | $24,369,646,432 \n \n**CryptoPanic** is a news aggregator platform indicating impact on price and market for traders and cryptocurrency enthusiasts. Now includes , and",
"https://cryptopanic.com/news/crypto-com-chain/": "Cronos Coin | % 24h | % 1h | Price | Volume 24h | VIRTUAL | 5.35% | -1.33% | $3.59 | $472,728,350 \n---|---|---|---|--- \n HBAR | 5.16% | -0.39% | $0.288 | $764,501,368 \n SOL | 4.67% | -0.18% | $195.74 | $2,817,754,278 \n ADA | 3.46% | 0.33% | $0.900 | $600,788,037 \n SUI | 2.84% | -0.73% | $4.17 | $736,871,770 \n DOGE | 2.45% | -0.29% | $0.325 | $1,576,987,724 \n OM | 2.45% | -0.18% | $3.79 | $32,661,724 \n PEPE | 1.98% | -2.08% | $0.000 | $1,272,026,956 \n NEAR | 1.81% | -0.41% | $5.26 | $242,717,031 \n DOT | 1.78% | -0.59% | $7.00 | $219,423,524 \n BGB | -11.21% | -6.31% | $6.72 | $7,963,755 \n LEO | -2.57% | 0.15% | $9.05 | $830,479 \n XMR | -1.77% | -0.73% | $195.93 | $35,019,088 \n TAO | -0.70% | -1.35% | $461.09 | $80,023,037 \n TON | -0.64% | -0.77% | $5.74 | $164,208,788 \n UNI | -0.26% | -0.76% | $13.23 | $271,392,720 \n DAI | -0.04% | -0.08% | $1.000 | $120,874,342 \n CRO | -0.03% | -0.56% | $0.147 | $19,667,290 \n USDC | -0.03% | -0.03% | $1.000 | $3,814,410,360 \n USDT | -0.02% | -0.04% | $0.998 | $24,369,646,432 \n \n**CryptoPanic** is a news aggregator platform indicating impact on price and market for traders and cryptocurrency enthusiasts. Now includes , and",
"https://cryptopanic.com/news/usd-coin/": "USDC Coin | % 24h | % 1h | Price | Volume 24h | VIRTUAL | 5.35% | -1.33% | $3.59 | $472,728,350 \n---|---|---|---|--- \n HBAR | 5.16% | -0.39% | $0.288 | $764,501,368 \n SOL | 4.67% | -0.18% | $195.74 | $2,817,754,278 \n ADA | 3.46% | 0.33% | $0.900 | $600,788,037 \n SUI | 2.84% | -0.73% | $4.17 | $736,871,770 \n DOGE | 2.45% | -0.29% | $0.325 | $1,576,987,724 \n OM | 2.45% | -0.18% | $3.79 | $32,661,724 \n PEPE | 1.98% | -2.08% | $0.000 | $1,272,026,956 \n NEAR | 1.81% | -0.41% | $5.26 | $242,717,031 \n DOT | 1.78% | -0.59% | $7.00 | $219,423,524 \n BGB | -11.21% | -6.31% | $6.72 | $7,963,755 \n LEO | -2.57% | 0.15% | $9.05 | $830,479 \n XMR | -1.77% | -0.73% | $195.93 | $35,019,088 \n TAO | -0.70% | -1.35% | $461.09 | $80,023,037 \n TON | -0.64% | -0.77% | $5.74 | $164,208,788 \n UNI | -0.26% | -0.76% | $13.23 | $271,392,720 \n DAI | -0.04% | -0.08% | $1.000 | $120,874,342 \n CRO | -0.03% | -0.56% | $0.147 | $19,667,290 \n USDC | -0.03% | -0.03% | $1.000 | $3,814,410,360 \n USDT | -0.02% | -0.04% | $0.998 | $24,369,646,432 \n \n**CryptoPanic** is a news aggregator platform indicating impact on price and market for traders and cryptocurrency enthusiasts. Now includes , and",
"https://cryptopanic.com/news/tether/": "Tether Coin | % 24h | % 1h | Price | Volume 24h | VIRTUAL | 5.35% | -1.33% | $3.59 | $472,728,350 \n---|---|---|---|--- \n HBAR | 5.16% | -0.39% | $0.288 | $764,501,368 \n SOL | 4.67% | -0.18% | $195.74 | $2,817,754,278 \n ADA | 3.46% | 0.33% | $0.900 | $600,788,037 \n SUI | 2.84% | -0.73% | $4.17 | $736,871,770 \n DOGE | 2.45% | -0.29% | $0.325 | $1,576,987,724 \n OM | 2.45% | -0.18% | $3.79 | $32,661,724 \n PEPE | 1.98% | -2.08% | $0.000 | $1,272,026,956 \n NEAR | 1.81% | -0.41% | $5.26 | $242,717,031 \n DOT | 1.78% | -0.59% | $7.00 | $219,423,524 \n BGB | -11.21% | -6.31% | $6.72 | $7,963,755 \n LEO | -2.57% | 0.15% | $9.05 | $830,479 \n XMR | -1.77% | -0.73% | $195.93 | $35,019,088 \n TAO | -0.70% | -1.35% | $461.09 | $80,023,037 \n TON | -0.64% | -0.77% | $5.74 | $164,208,788 \n UNI | -0.26% | -0.76% | $13.23 | $271,392,720 \n DAI | -0.04% | -0.08% | $1.000 | $120,874,342 \n CRO | -0.03% | -0.56% | $0.147 | $19,667,290 \n USDC | -0.03% | -0.03% | $1.000 | $3,814,410,360 \n USDT | -0.02% | -0.04% | $0.998 | $24,369,646,432 \n \n**CryptoPanic** is a news aggregator platform indicating impact on price and market for traders and cryptocurrency enthusiasts. Now includes , and",
"https://cryptopanic.com/advertise/": "Advertise Advertise",
"https://cryptopanic.com/about/": "About About",
"https://cryptopanic.com/contact/": "Contact Contact",
"https://cryptopanic.com/terms/": "Terms Terms",
"https://cryptopanic.com/guides/": "Guides",
"https://cryptopanic.com/partnership-hub/": "Bizz Hub"
},
"text": "Region\n\n* News Feeds\n * English \n * Deutsch \n * Dutch \n * Español \n * Français \n * Italiano \n * Português \n * Русский \n * Türkçe \n * عربي \n * 中國人 \n * 日本 \n * 한국인\n\n* UI Language\n * English\n\nTop News\n\nCategory\n\nShow All\n\nPanic Score\n\nWhat is PanicScore?\n\nPanic Score highlights the most important news everyones talking about.\n\nHow does it work?\n\nIt analyzes millions of data points from various sources on the internet to measure engagement, including reading, sharing, and discussions. This information is compared within a specific time frame to calculate the Panic Score, highlighting market-moving news.\n\nWhat's so special about it?\n\nPanic Score simplifies your news consumption by providing a clear, easy-to-understand number that reflects how much attention a news story gets. The higher the Panic Score, the more people are talking about it. You instantly know whats trending and important. This way, you stay informed without the hassle of sifting through endless articles, ensuring you stay in the loop and ahead of the curve.\n\nGet PanicScore Now\n\nPanic Score\n\n**CryptoPanic** is a news aggregator platform indicating impact on price and market for traders and cryptocurrency enthusiasts. Now includes Portfolio Tracker[Ref_1], Media Feed[Ref_2] and Blogs[Ref_3].\n\nShow Price alerts\n\nShow Following feed in Top News\n\nu.today\n\n4 3 2 2 4 2 2 1\n\nspaziocrypto.com\n\n8 4 1 5 4 3\n\ncoinpaprika.com\n\n9 8 6 4 6 5 4 1\n\ncoinpaprika.com\n\n6 6 1 6 4 1\n\ncoinpaprika.com\n\n4 2 1 4 3 1\n\ncoinpaprika.com\n\n8 6 1 7 5\n\ncoinpaprika.com\n\n11 9 2 11 6 1 1\n\n10min\n\ncrypto2011 Not to own 1 whole btc. “Irresponsible Not to Own Bitcoin,” Says Semler Scientific Chair\n\n10min\n\ncrypto2011 expert... Expert warns Bitcoin party is over, expect a crash to $18,000\n\n41min\n\nBitbite No longer astonishing. Possibly just 2 cycles away. 
Arthur Hayes Predicts Bitcoin Could Reach $1 Million Amid Global Money Printing Surge\n\n9h\n\nkunipferd Me neither. A lot of times when something is being told to sell in vast amounts i can't find anyone who actually bought it. Makes me always... $PENGU Airdrop: Sellers Exit, Holders Remain Bullish\n\n11h\n\ntjsop enjoy your last moments Will Ethereum Hit 10K? Examining ETHs Potential\n\n11h\n\ntcoast89 [url8] Will Ethereum Hit 10K? Examining ETHs Potential\n\n11h\n\ntjsop useless soon zero Will Ethereum Hit 10K? Examining ETHs Potential\n\n12h\n\nnestakasa This guyyyy... Ripple's RLUSD Stablecoin Explodes With 106% Growth in Volumes: Details\n\n15h\n\nm155 BTC back above 95k. Weak hands are out, time to buy everything. Let the idiots stay poor. Crypto Market Remains Greedy Despite Bitcoin Price Crash To $94,000, Is A Recovery Coming?\n\n17h\n\nuser234 \"Shrinked?\" XRP Open Interest Shrinked by $1 Billion in 24 Hours: Heres Why\n\nYes that will happen\n\nNo,that wont happen\n\nGainers & Losers (from top 50)\n\nShow Price alerts\n\nShow Following feed in Top News\n\n### Hints\n\n* Follow currencies to customize your feed.\n * Search or click on currency to view individual feeds.\n * Navigate through news using UP/DOWN or K/J keys. Press ENTER to open link.\n * You can use \"Add to Home Screen\" feature on your mobile app.\n * List is being reloaded automatically.\n * Filters explained: Top news = News Sites + Top Reddit & Twitter. Following = All news for the currencies you are following.\n * Mobile apps:\n\n### Support CryptoPanic\n\nIf you like this project and find it useful, consider supporting further development in one of the following ways:\n\n* Choose PRO Account[Ref_4] and unlock all the features. 
\n * Make a cryptocurrency donation[Ref_5].\n * Share and tell your friends about it:\n\n### Community\n\nJoin our Discord[Ref_6] or Telegram[Ref_7] communities to discuss or suggest new features, get to know about updates and be first who sees newest features or chat about cryptocurrency space or learn more about trading.\n\n© CryptoPanic.com\n\n* Region\n\n* News Feeds\n * English \n * Deutsch \n * Dutch \n * Español \n * Français \n * Italiano \n * Português \n * Русский \n * Türkçe \n * عربي \n * 中國人 \n * 日本 \n * 한국인 \n * UI Language\n * English\n\n0",
"text_link_map": {
"Ref_1": "https://cryptopanic.com/portfolio",
"Ref_2": "https://cryptopanic.com/news/media",
"Ref_3": "https://cryptopanic.com/news/blogs",
"Ref_4": "https://cryptopanic.com/pro/",
"Ref_5": "https://cryptopanic.com/donate/",
"Ref_6": "https://cryptopanic.com/discord/",
"Ref_7": "https://t.me/cryptopaniccom",
"url8": "https://media.giphy.com/media/UmGSoxNyOtE3wudNGF/giphy.gif"
}
}

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,5 @@
{
"link_dict": {},
"text": "16\n\n实况\n\n实况\n\n实况\n\n实况\n\n实况\n\n实况\n\n16\n\n1. 1\n 2. 2\n 3. 3\n 4. 4\n 5. 5\n 6. 6\n\n上海发布\n\n# 【名镇】如果可以,在濮院镇见一面吧!\n\n濮院镇位于浙江省嘉兴市桐乡市东部曾因“日出万匹绸”而成为“嘉禾一巨镇”是明清时期江南五大名镇之一。 沿着庙桥河等小河蜿蜒而上,映入眼帘的便是九座与流水交织的精致古石桥。说起濮院古石桥,最先要说的必定是“栖凤桥”,古时山东人濮凤将先进的养蚕和缫丝技术带到这里,带动了四邻八乡养蚕、缫丝,借助运河将绸缎运往全国各地,“濮绸”自此闻名遐迩。濮凤在此定居后,建造了一座石梁桥——栖凤桥,“栖凤”其实饱含深意,当时的濮院镇有大片梧桐林,凤凰栖息在梧桐树上,这正暗合了濮凤的心意——意思是凤凰在这里栖息,他也在这里定居了。如今,栖凤桥依然坚固耐用,承载着历史的厚重,见证着时代的变迁。而濮院有故事的桥,并不止这一座,就等你亲临这里,慢慢探索吧~ 走过承载历史的古桥,不妨到焕发新颜的濮院时尚古镇走走逛逛吧~ 通过对历史建筑的保护性修复和有机更新这里再现北更楼、岳家大院、福善寺等极具濮院历史特色的建筑保留了江南建筑格局和特色进一步唤醒古镇美景。除了重现街巷风貌古镇还深入挖掘“濮商文化”还原了濮商会馆以及陈记米行、定泉桥茶楼等老业态商铺虽然现在做生意方式和以前大不相同但传承下来的那份匠心和精神依然存在。在传承历史的同时古镇也注入了现代时尚元素以“中国时尚古镇”为总体定位打造濮院时装周、濮院国风大典等“古镇秀场”IP吸引游客一览濮院风貌。 穿过历史和现代,来红旗漾村舒展身心吧!这里的“杉林部落”齐刷刷地换上了红装。都说秋冬季节,是大自然打翻的调色盘,红旗漾村的红杉林势必是最浓烈的那幅油画,酝酿了一整年的盛情化为一排排、一棵棵高耸入云的红杉,交织层叠出一片红橙黄绿的斑斓色彩。这两年,“杉林部落”附近增设了景观喷泉、景观浮岛,将田园打造成农户家门口的“口袋公园”,成了观光休闲的好去处。当你漫步其中,任由层林尽染的杉林映入眼帘,再多的愁绪和浮躁也都能消解一空。既有历史痕迹的精心雕琢,也有现代气息的时尚装扮,还有江南水乡的唯美雅致,这般韵味的濮院,怎能不令人心向往之? 资料:嘉兴市政府门户网站、嘉兴发布、桐乡市政府门户网站、桐乡发布、中国丝绸博物馆编辑:耿浩\n\n关闭\n\n更多\n\n名称已清空\n\n**微信扫一扫赞赏作者**\n\n文章\n\n暂无文章\n\n喜欢作者\n\n其它金额\n\n¥\n\n最低赞赏 ¥0\n\n确定\n\n返回\n\n**其它金额**\n\n更多\n\n赞赏金额\n\n¥\n\n最低赞赏 ¥0\n\n1\n\n2\n\n3\n\n4\n\n5\n\n6\n\n7\n\n8\n\n9\n\n0\n\n.\n\n,\n\n2024年12月28日 14:29 , , 上海\n\n暂无留言\n\n已无更多数据\n\n写留言:\n\n关闭\n\n**写留言**\n\n提交更多\n\n上海发布\n\n关注\n\n赞分享在看 写留言\n\n**留言**\n\n暂无留言\n\n已无更多数据\n\n写留言:\n\n关闭\n\n**写留言**\n\n提交更多\n\n关闭\n\n**0个朋友**\n\n更多\n\n前往「发现 > 看一看」看更多朋友\n\n轻触阅读原文\n\n上海发布\n\n向上滑动看下一个\n\n微信扫一扫使用小程序\n\n× 分析\n\n 。 视频 小程序 赞 ,轻点两下取消赞 在看 ,轻点两下取消在看 分享 留言 收藏\n\n,\n\n关闭\n\n**选择留言身份**\n\n更多\n\n**上海发布**\n\n【名镇】如果可以在濮院镇见一面吧",
"text_link_map": {}
}

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,128 @@
{
"link_dict": {
"https://news.bjx.com.cn/zhuanti/2019zhnyfw/": "综合能源服务 综合能源服务",
"https://fdjob.bjx.com.cn/industry/flfd/": "风电场投资及运营",
"https://fdjob.bjx.com.cn/industry/fdzj/": "风电整机",
"https://fdjob.bjx.com.cn/industry/ypjcl/": "叶片及材料",
"https://fdjob.bjx.com.cn/industry/fedgc/": "风电工程",
"https://fdjob.bjx.com.cn/industry/fdjjyj/": "发电机及元件",
"https://fdjob.bjx.com.cn/industry/tttj/": "塔筒/塔架",
"https://fdjob.bjx.com.cn/industry/kzxt/": "控制系统",
"https://gfjob.bjx.com.cn/industry/gfdcjzj/": "光伏电池组件",
"https://gfjob.bjx.com.cn/industry/gfdz/": "光伏电站及运维",
"https://gfjob.bjx.com.cn/industry/gfnbq/": "光伏逆变器",
"https://gfjob.bjx.com.cn/industry/fbsgf/": "分布式光伏",
"https://gfjob.bjx.com.cn/industry/gsygf/": "工商业光伏",
"https://gfjob.bjx.com.cn/industry/hygf/": "户用光伏",
"https://gfjob.bjx.com.cn/industry/gfgc/": "光伏系统工程",
"https://gfjob.bjx.com.cn/industry/gfxtlbj/": "光伏零部件",
"https://gfjob.bjx.com.cn/industry/gfcljsb/": "光伏原材料及辅料",
"https://cnjob.bjx.com.cn/industry/cndc/": "储能电池",
"https://cnjob.bjx.com.cn/industry/cnjs/": "储能系统",
"https://cnjob.bjx.com.cn/industry/cndz/": "储能电站",
"https://cnjob.bjx.com.cn/industry/gsycn/": "工商业储能",
"https://cnjob.bjx.com.cn/industry/cngc/": "储能工程",
"https://cnjob.bjx.com.cn/industry/dccl/": "储能材料及零部件",
"https://cnjob.bjx.com.cn/industry/xnyqc/": "光储充一体化",
"https://cnjob.bjx.com.cn/industry/cnyy/": "户用储能",
"https://tv.bjx.com.cn/list/type=2": "公开课",
"https://tv.bjx.com.cn/list/type=3": "空中宣讲",
"https://tv.bjx.com.cn/list/type=4": "云招聘",
"https://tv.bjx.com.cn/list/type=5": "星培计划",
"https://tv.bjx.com.cn/list/type=7": "高端访谈",
"https://tv.bjx.com.cn/list/type=8": "校企专访",
"https://edu.bjx.com.cn/activity/": "线下活动",
"https://news.bjx.com.cn/zt/app/index.html": "学社APP",
"https://huanbao.bjx.com.cn/ex": "环保会展网",
"https://guangfu.bjx.com.cn/ex/": "光伏会展网",
"https://www.bjx.com.cn/zt/": "电力专题",
"https://huanbao.bjx.com.cn/zt/": "环保专题",
"https://news.bjx.com.cn/zb/": "招投标",
"https://pubapinews.bjx.com.cn/kehuclickid=18438": "讲师招募!寻找光伏、风电、储能、环保行业讲师",
"https://news.bjx.com.cn/html/20241227/1419508.shtml": "又一力作中国船舶712所200千瓦级船用氢燃料电池系统交付 又一力作中国船舶712所200千瓦级船用氢燃料电池系统交付!** 近日中国船舶712所自主研制的200千瓦级船用氢燃料电池系统正式装车发货该系统从核心材料到组件均为独立研发拥有完整的自主知识产权并获得中国船级社认证。该系统配备两台100千瓦氢燃料电池采用高压储氢罐总储氢量120千克将为氢电拖轮提供高效电能发电效率达55%。与传统的电力拖轮相比 _氢燃料电池_ _氢能船舶_ _储氢 又一力作中国船舶712所200千瓦级船用氢燃料电池系统交付 又一力作中国船舶712所200千瓦级船用氢燃料电池系统交付",
"https://news.bjx.com.cn/topics/luqing/": "查看更多> **绿氢****氢能****绿氢耦合",
"https://news.bjx.com.cn/topics/qingneng/": "查看更多> **绿氢****氢能****绿氢耦合",
"https://news.bjx.com.cn/topics/luqingouhe/": "查看更多> **绿氢****氢能****绿氢耦合",
"https://news.bjx.com.cn/html/20241227/1419599.shtml": "全球最大体量绿色氢氨醇一体化建设项目2.87亿贷款落地!** 近日北极星氢能网获悉进出口银行吉林省分行顺利实现吉林省绿色氢氨醇一体化项目首笔放款2.87亿元。此前该项目成功获得国家超长期特别国债资金支持据了解该项目是目前全球最大体量的绿色氢氨醇一体化项目也是国家发改委“首批绿色低碳先进技术示范项目”总投资高达296亿元于2023年9月26 _绿氢_ _绿氨_ _氢氨醇一体化 全球最大体量绿色氢氨醇一体化建设项目2.87亿贷款落地! 全球最大体量绿色氢氨醇一体化建设项目2.87亿贷款落地!",
"https://news.bjx.com.cn/html/20241227/1419535.shtml": "冲刺一百天 跑出加速度 | 上海电气获国内首个绿色甲醇全流程ISCC EU认证氢基能源发展按下「快进键」** 12月23日上海电气同时获得生物质收储、生物质处理及生物质气化耦合绿氢制生物甲醇ISCCEU认证成为国内首个取得从生物质田间收储到绿色甲醇生产全流程ISCCEU认证的绿色甲醇供应商。此次认证对象为上海电气旗下绿源科技吉林有限公司独资建设的洮南市风电耦合生物质绿色甲醇一体化示范项目。该项目 _绿氢_ _绿色甲醇_ _上海电气 冲刺一百天 跑出加速度 | 上海电气获国内首个绿色甲醇全流程ISCC EU认证氢基能源发展按下「快进键」 冲刺一百天 跑出加速度 | 上海电气获国内首个绿色甲醇全流程ISCC EU认证氢基能源发展按下「快进键」 冲刺一百天 跑出加速度 | 上海电气获国内首个绿色甲醇全流程ISCC EU认证氢基能源发展按下「快进键」",
"https://news.bjx.com.cn/html/20241226/1419474.shtml": "海外+1双良集团拿下印度ACME阿曼日产300吨绿氨项目** 近日双良新能源与印度ACME集团在国际清洁能源合作领域迈出了坚实的一步。近日经过前期多轮深入交流与实地考察双方正式签署了阿曼绿氢绿氨项目的绿电制氢系统订单协议双方在全球清洁能源产业中的合作将开启新篇章共同致力于应对全球气候变化的挑战。ACME集团成立于2003年如今已发展成为印度 _绿氢_ _绿氨_ _电解水制氢 海外+1双良集团拿下印度ACME阿曼日产300吨绿氨项目",
"https://news.bjx.com.cn/html/20241226/1419473.shtml": "内蒙古达茂旗政府与国富氢能签约!** 12月24日达茂旗人民政府与内蒙古蒙发国富氢能科技有限公司举行座谈会并签署合作框架协议。达茂旗旗委副书记、旗长黄龙副旗长刘熙臣零碳园区管委会主任孙日清蒙发能源控股集团总裁、内蒙古蒙发国富氢能科技有限公司董事长高三雄江苏国富氢能技术装备股份有限公司常务副总裁丁镭哲内蒙古华电 _绿氢_ _氢能产业_ _国富氢能 内蒙古达茂旗政府与国富氢能签约!** 12月24日达茂旗人民政府与内蒙古蒙发国富氢能科技有限公司举行座谈会并签署合作框架协议。达茂旗旗委副书记、旗长黄龙副旗长刘熙臣零碳园区管委会主任孙日清蒙发能源控股集团总裁、内蒙古蒙发国富氢能科技有限公司董事长高三雄江苏国富氢能技术装备股份有限公司常务副总裁丁镭哲内蒙古华电 _绿氢_ _氢能产业_ _国富氢能",
"https://news.bjx.com.cn/html/20241226/1419244.shtml": "重大进展全球首台30MW级纯氢燃气轮机点火成功** 12月22日全球首台30MW级纯氢燃气轮机“木星一号”整机试验首次点火成功。此次点火的纯氢燃气轮机是目前全球单机功率最大的纯氢发电机组纯氢点火试验取得预期效果验证了纯氢燃气轮机系统可靠性和安全性标志着我国大功率氢燃气轮机和氢储能技术取得重大进展新型长时储能有了新技术、新产品。“ _绿氢_ _氢燃气轮机_ _电解水制氢",
"https://news.bjx.com.cn/html/20241225/1419113.shtml": "大安风光制绿氢合成氨一体化项目** 一、企业基本情况大安吉电绿氢能源有限公司2021年10月27日成立注册地为吉林省白城市大安市两家子镇吉林西部大安清洁能源化工产业园公司注册资本11.9亿元,是吉电股份的全资子公司,主要承载吉电股份吉林西部千万千瓦新能源制氢基地开发建设任务。二、项目基本情况大安风光制绿氢合成氨一体化 _绿氢_ _绿氨_ _氢氨一体化",
"https://news.bjx.com.cn/html/20241225/1419078.shtml": "苏伊士运河经济区已吸引640亿美元的绿氢投资** 近日苏伊士运河经济区通过12项框架协议吸引了640亿美元的绿氢投资。12个最终确定的框架协议预计每年生产1800万吨绿氢。除了绿色氢能外经济区还提供其他重要投资机会包括电动汽车行业的电池与轮胎制造、制药、建筑材料及纺织品生产等领域。 _绿氢_ _氢能_ _电解水制氢",
"https://news.bjx.com.cn/html/20241225/1419033.shtml": "家氢能公司成立!** 近日西安氢碳绿能能源科技有限公司与富氢储氢福建科技研究院有限公司相继成立西安氢碳绿能能源科技有限公司成立于2024年12月23日注册资本1000万元法定代表人为潘珂郡。经营范围包括一般项目温室气体排放控制技术研发资源循环利用服务技术咨询信息技术咨询服务气体、液体分离及纯净 _储氢_ _氢能_ _绿氢",
"https://news.bjx.com.cn/html/20241225/1418949.shtml": "隆基氢能与HydrogenPro签订合作协议携手开拓欧洲绿氢市场** 12月23日隆基氢能与HydrogenProASAHydrogenPro签署了一项投资协议。HydrogenPro是绿氢技术及系统领域的领军企业隆基氢能将与HydrogenPro现有股东安德里茨集团ANDRITZAG以及三菱重工业株式会社MitsubishiHeavyIndustries,Ltd一起向HydrogenPro投资约1.4亿挪威克朗。与此同时,双方还 _绿氢_ _电解槽_ _隆基氢能",
"https://news.bjx.com.cn/html/20241224/1418802.shtml": "内蒙古兴安盟金风绿氢制50万吨绿色甲醇项目最新进展** 北极星氢能网获悉近日中国化学工程第四建设有限公司江西公司承建的兴安盟金风绿氢制50万吨绿色甲醇项目一期公辅工程EPC项目锅炉实现点火一次成功。该项目位于内蒙古自治区兴安盟经济技术开发区是全国首个大规模风电绿甲醇项目创新融合“风—氢—储—生物质”多能协同技术高效利用波 _绿色甲醇_ _氢能_ _内蒙古氢能",
"https://news.bjx.com.cn/html/20241227/1419674.shtml": "羚牛氢能与泰达控股签订战略合作协议 共建“天津氢港”!** 12月25日羚牛氢能与天津泰达投资控股有限公司简称“泰达控股”携手合作共同签署了战略合作协议。双方将在资源共享、优势互补的基础上探索多领域合作机会共同推动氢能商用车的广泛应用为构建零碳物流生态体系注入强劲动力。泰达控股旗下的中国(北方)商用车产业园项目毗邻滨海新区核心区仅 _氢能汽车_ _氢燃料电池_ _羚牛氢能 羚牛氢能与泰达控股签订战略合作协议 共建“天津氢港”! 羚牛氢能与泰达控股签订战略合作协议 共建“天津氢港”! 羚牛氢能与泰达控股签订战略合作协议 共建“天津氢港”!",
"https://news.bjx.com.cn/html/20241227/1419671.shtml": "中国石化资本投资入股徐工汽车!** 近日,中国石化资本投资入股徐州徐工汽车制造有限公司(简称“徐工汽车”),混改项目正式完成。中国石化资本投资入股徐工汽车,将进一步推动徐工集团和中国石化在氢能交通、润滑油、充换电、销售网络及后市场服务等领域的务实合作,协同构建新能源产业链“闭环式”生态圈,促进我国绿色低碳产业高质量 _氢能交通_ _氢能汽车_ _中国石化 中国石化资本投资入股徐工汽车! 中国石化资本投资入股徐工汽车! 中国石化资本投资入股徐工汽车!",
"https://news.bjx.com.cn/html/20241227/1419616.shtml": "辆氢能重卡三期采购项目中标候选人公示!** 12月25日氢能产业示范区氢能重卡三期采购项目中标候选人公示第一中标候选人天津临港海洋经济投资发展集团有限公司投标报价2150万。项目规模本次招标计划采购氢燃料电池49T半挂牵引车40辆。共分为2个包每个包20辆。具体内容详见招标文件和用户需求书。交货期签订合同后45个工作日交货。 _氢燃料电池_ _氢能重卡_ _氢能 辆氢能重卡三期采购项目中标候选人公示! 辆氢能重卡三期采购项目中标候选人公示!",
"https://news.bjx.com.cn/html/20241227/1419610.shtml": "中石化海水电解制氢中试装置招标!** 12月26日中石化天津石油化工有限公司海水电解制氢中试装置/316+20#10kg/h招标。 _电解水制氢_ _氢能_ _中石化 中石化海水电解制氢中试装置招标! 中石化海水电解制氢中试装置招标!",
"https://news.bjx.com.cn/html/20241227/1419541.shtml": "中能建氢能公司改革新步伐收录国资委《千帆竞渡》!** 由国务院国资委组织编写的《千帆竞渡:基层国有企业改革深化提升行动案例集(上、中、下)》已于近日面向全国发行。中能建氢能源有限公司作为引领国内氢能领域革新探索的典范之一,提炼实践经验,汇集成改革案例《系统实施深化改革,打造氢能领域行业领军企业》,被成功收录其中,鲜明地体现了基层国有 _氢能_ _氢能产业_ _中能建 中能建氢能公司改革新步伐收录国资委《千帆竞渡》! 中能建氢能公司改革新步伐收录国资委《千帆竞渡》!",
"https://news.bjx.com.cn/html/20241226/1419475.shtml": "氢燃料电池商用车整车项目签约落户西安!** 近日北极星氢能网获悉氢燃料电池商用车整车项目落户西安。该项目由陕西秦深盈创氢能科技有限公司投资建设总投资约2亿元建成达产后预计可实现年产值10亿元引进配套供应商10个以上。该项目主要聚焦氢燃料电池商用车整车研发、生产、销售。生产的车辆还将采用国内领先的动力配置实现双极板、 _氢燃料电池汽车_ _氢能产业_ _氢商用车",
"https://news.bjx.com.cn/html/20241226/1419471.shtml": "辆氢能源环卫作业车辆租赁项目中标结果公示!** 12月25日郑州市金水区城市管理局金水区氢能源环卫作业车辆租赁项目中标结果公示中标人郑州金水建设投资有限公司中标金额79214400.00元。采购内容:为积极响应国家节能减排政策号召,加快推进金水区氢能源产业的建设,郑州市金水区城市管理局计划对传统柴油环卫作业车辆逐步进行更新升级,先行采 _氢能环卫_ _氢能汽车_ _氢燃料电池",
"https://news.bjx.com.cn/html/20241226/1419453.shtml": "阳光氢能开启电解槽“数智”蝶变** 北极星氢能网获悉12月26日阳光氢能举办数智化大型电解槽新品发布会氢能上下游企业、协会、媒体等专家、领导齐聚一堂与数万观众通过线上直播共同见证了首台“数智化”电解槽的隆重发布。在数字能源转型与氢能产业快速发展的当下电解槽的“数智”蝶变将大力助推绿氢规模化发展和多元场景的 _电解槽_ _电解水制氢_ _阳光氢能",
"https://news.bjx.com.cn/html/20241223/1418467.shtml": "绿氢100万标方宁煤完成绿氢制绿氨示范** 近日宁夏煤业烯烃二公司累计引入宁东可再生氢碳减排示范区一期项目清水营制氢厂绿氢100万标方成功实现了煤化工行业可再生氢耦合示范填补我国光伏制可再生氢耦合煤制合成氨系统运营的空白进一步推动了宁夏煤业公司绿色发展。宁东可再生氢碳减排示范区一期项目是国家能源集团首个万方级光伏制可 _绿氢_ _可再生氢耦合_ _光伏制氢",
"https://news.bjx.com.cn/html/20241218/1417617.shtml": "内蒙古宝丰煤基绿氢与煤化工耦合碳减排创新示范项目开车投产!** 近日内蒙古宝丰煤基绿氢与煤化工耦合碳减排创新示范项目首系列各装置顺利开车投产标志着该项目正式进入试生产阶段。该项目建设年产300万吨烯烃是目前全球单厂规模最大的煤制烯烃项目也是全球唯一一个规模化用绿氢替代化石能源生产烯烃的项目广泛应用了多项具有我国自主知识产权、全球现代煤 _绿氢_ _绿氢项目_ _氢能",
"https://news.bjx.com.cn/html/20241217/1417357.shtml": "氧化碳耦合绿氢制甲醇 吉利创新中心牵手世界500强** 近日吉利创新中心与世界500强企业霍尼韦尔签署战略合作协议双方将基于各自优势在二氧化碳捕集、捕集二氧化碳耦合绿氢制取电子甲醇、电子甲醇制取可持续航空燃料SAF技术合作以及其他服务于高质量低碳可持续发展的先进技术等多个领域达成了合作意向。用绿色甲醇破解世界能源和双碳难题吉利创新中 _绿氢_ _绿色甲醇_ _绿色能源",
"https://news.bjx.com.cn/html/20240826/1396924.shtml": "国家电网巴西电力签约绿氢项目** 北极星氢能网获悉8月22日中国国家电网巴西电力CPFL公司以下简称国网巴电CPFL公司与巴西米祖水泥制造公司在巴西东北部北里奥格兰德州纳塔尔市举行绿氢科技项目签约仪式为当地能源转型注入新动力。根据签约内容双方将联合投资建设包括氢能制取、储运、应用在内的全链条1兆瓦绿色制氢站并就氢 _绿氢_ _氢电耦合_ _国家电网",
"https://news.bjx.com.cn/html/20240820/1395862.shtml": "年产1120万吨甲醇宝丰又一个绿氢耦合煤化工项目启动** 7月30日据国家级新疆准东经济技术开发区官网新疆宝丰煤炭清洁高效转化耦合植入绿氢制低碳化学品和新材料示范项目进入环境影响评价公众参与第一次公示阶段。项目位于将军庙矿区5号化工产业园区包括4×280万吨/年甲醇、4×100万吨/年甲醇制烯烃、4×110万吨/年烯烃分离、3×65万吨/年聚丙烯、3×65 _绿氢_ _绿色甲醇_ _氢能",
"https://news.bjx.com.cn/html/20240708/1387671.shtml": "全球单厂规模最大绿氢耦合煤制烯烃项目变电站成功送电** 6月30日内蒙古宝丰煤基新材料有限公司宝丰烯烃220千伏变电站成功送电该变电站成功送电不仅为宝丰煤基新材料项目的调试投产提供了坚实可靠的电力保障也为宝丰煤基新材料后续“风光融合”发绿电制取绿氢奠定了基础。当天上午10时15分在内蒙古宝丰煤基新材料有限公司年产300万吨烯烃项目随着主 _绿氢_ _甲醇_ _煤制烯烃",
"https://news.bjx.com.cn/html/20240703/1386708.shtml": "国家能源集团首例绿氢耦合煤化工项目建成中交** 北极星氢能网获悉6月28日由国家能源集团国华投资氢能公司宁夏分公司建设的宁东可再生氢碳减排示范区一期项目清水营制氢厂建成中交该厂是国家能源集团首例绿氢耦合煤化工项目所制氢气专供煤化工项目作为原料补充氢气实现绿氢与煤化工产业耦合示范应用。清水营制氢厂是国家能源集团首例“ _绿氢_ _制氢_ _国家能源集团",
"https://news.bjx.com.cn/html/20240423/1372981.shtml": "加强电氢耦合标准化建设 推动绿氢产业健康可持续发展** 今年的政府工作报告提出“强化能源资源安全保障”“加快建设新型能源体系”。氢作为一种绿色低碳、来源丰富、应用广泛的能源形式,既是重要的化工原料又是优质的能源载体,将在新型能源体系中承担重要角色。“双碳”目标要求下,新能源电解制氢(绿氢)是未来氢能发展的核心方向。新能源电解制氢规模的 _电氢耦合_ _新型电力系统_ _氢燃料电池技",
"https://news.bjx.com.cn/html/20240422/1372893.shtml": "苏州希倍优氢能源科技有限公司在离网风光耦合制绿氢研究取得重要进展** 光伏与风电是一种无碳排放的可再生能源,但是其能量具有波动性的特征,对电网系统的安全运行冲击较大,国内很多地区可供并网的风电、光伏装机容量已达上限。使用光伏、风能进行耦合离网电解水制氢,不但可以解决光伏、风电的消纳难题,还有助于国家在工业、交通运输及民用领域深度脱碳,实现国家“碳中 _电解水制氢_ _电解槽_ _希倍优氢能",
"https://news.bjx.com.cn/html/20240327/1368442.shtml": "中国电科院康建东:电氢耦合技术战略及发展趋势** 北极星氢能网获悉3月27日中国电力科学研究院有限公司技术战略研究中心室主任康建东在绿氢技术与氢电协同发展专题论坛上发表了《电氢耦合技术战略及发展趋势》的主题演讲。以下为康建东演讲全文各位嘉宾、各位同仁上午好。我是来自中国电力科学研究院的康建东我今天演讲的题目是电氢耦合技术绽 _绿氢技术_ _新型电力系统_ _电氢耦合技术",
"https://pubapinews.bjx.com.cn/kehuclickid=22330": "§to_be_recognized_by_visual_llm_https://img01.mybjx.net/webupload/image/20241219/3e0f324b240000e.png§",
"https://pubapinews.bjx.com.cn/kehuclickid=21678": "§to_be_recognized_by_visual_llm_https://img01.mybjx.net/webupload/image/20240318/3e0984389c00004.png§",
"https://pubapinews.bjx.com.cn/kehuclickid=18782": "§to_be_recognized_by_visual_llm_http://img01.mybjx.net/webupload/image/20220402/3ffac4d5dc00002.jpg§",
"https://news.bjx.com.cn/rankinglist/qingneng/": "更多> **新闻排行榜** _今日_ _本周_ _本月",
"https://news.bjx.com.cn/html/20241227/1419591.shtml": "华电取得首个绿氨掺烧项目! 华电取得首个绿氨掺烧项目! 华电取得首个绿氨掺烧项目!",
"https://news.bjx.com.cn/html/20241227/1419727.shtml": "绿色燃料,未来已来! 绿色燃料,未来已来! 绿色燃料,未来已来!",
"https://news.bjx.com.cn/html/20241227/1419693.shtml": "考克利尔获210MW电解槽订单支持美国12万吨电子甲醇项目 考克利尔获210MW电解槽订单支持美国12万吨电子甲醇项目 考克利尔获210MW电解槽订单支持美国12万吨电子甲醇项目",
"https://dljob.bjx.com.cn/specials/101813.html": "届秋招第二批次补录岗位首发 届秋招第二批次补录岗位首发",
"https://dljob.bjx.com.cn/specials/101817.html": "新一线城市25届校招专场 新一线城市25届校招专场",
"https://dljob.bjx.com.cn/specials/101814.html": "研后大型专场招聘会",
"https://hdjob.bjx.com.cn/specials/101816.html": "薪跳加速,职此进阶!火电行业集控岗位招聘专场",
"https://fdjob.bjx.com.cn/specials/101815.html": "“薪”潮澎湃,虚位以待! 河北地区 风电行业运维岗招聘专场",
"https://hr.bjx.com.cn/companys/9946/jobs_0_1/": "§to_be_recognized_by_visual_llm_https://hrusersidestatic.bjx.com.cn/enterprisenew/companylogo/9946/2024120509430700_872717.jpeg§",
"https://hr.bjx.com.cn/companys/24264/jobs_0_1/": "§to_be_recognized_by_visual_llm_https://static.bjx.com.cn/company-logo/2022/08/10/2022081012220780_img505306.png§",
"https://hr.bjx.com.cn/companys/29747/jobs_0_1/": "§to_be_recognized_by_visual_llm_https://static.bjx.com.cn/enterprisenew/companylogo/29747/2023042010472011_911364.jpeg§",
"https://hr.bjx.com.cn/companys/73962/jobs_0_1/": "§to_be_recognized_by_visual_llm_https://static.bjx.com.cn/enterprisenew/companylogo/58993/2020110317483297_984570.jpeg§",
"https://hr.bjx.com.cn/companys/104157/jobs_0_1/": "§to_be_recognized_by_visual_llm_https://static.bjx.com.cn/enterprisenew/companylogo/104157/2021022414564907_384017.jpeg§",
"https://edu.bjx.com.cn/s/67109308.html": "课程集中式光伏项目全流程支持性文件概述",
"https://edu.bjx.com.cn/s/67109294.html": "课程储能电站安全运维管理方案",
"https://edu.bjx.com.cn/s/67109353.html": "课程锂离子电池储能中的安全问题及应对技术(限时免费)",
"https://edu.bjx.com.cn/s/67109396.html": "课程光伏电站股权收购的流程和风险分析",
"https://edu.bjx.com.cn/s/67109299.html": "课程《零碳生态&绿色未来》系列公益公开课",
"https://edu.bjx.com.cn/s/67109355.html": "课程分布式光伏电站建设“避坑指南”",
"https://news.bjx.com.cn/html/20241227/1419527.shtml": "中国能建华北院中标鄂尔多斯市绿氢与煤化工耦合碳减排创新示范项目!",
"http://www.bjx.com.cn/about/about.html": "关于北极星",
"http://www.bjx.com.cn/about/baojia_index.html": "广告服务",
"https://companycenter.bjx.com.cn/common/hybz": "会员服务",
"http://www.bjx.com.cn/about/yxfa.html": "营销方案",
"http://www.bjx.com.cn/about/about.html=": "成功案例",
"http://www.bjx.com.cn/about/lxwm.html": "联系我们",
"http://www.beian.gov.cn/portal/registersysteminforecordcode=11010502034458": "京公网安备11010502034458号",
"https://www.bjx.com.cn/icp.html": "电子公告服务专项备案"
},
"text": "新闻\n\n招聘\n\n直播\n\n学社\n\n商务通\n\n社区\n\n会展\n\n专题\n\n搜索历史清空\n\n1. 企业注册[Ref_1]\n 2. 企业登录[Ref_2]\n 3. 个人注册\n 4. 个人登录\n\n您的位置 _电力[Ref_3]_ _氢能[Ref_4]_ _氢能综合利用[Ref_5]_ _其他[Ref_6]_ _企业[Ref_7]_ _正文_\n\n# 中国能建华北院中标鄂尔多斯市绿氢与煤化工耦合碳减排创新示范项目!\n\n2024-12-27 08:56来源中国能建关键词 _绿氢[Ref_8]_ _氢能[Ref_9]_ _绿氢耦合[Ref_10]_ 收藏点赞\n\n分享\n\n订阅\n\n投稿 _我要投稿_\n\n中国能建华北院中标内蒙古宝丰煤基新材料有限公司源网荷储一体化工程设计项目。\n\n项目位于乌审旗苏里格经济开发区是鄂尔多斯市绿氢[Ref_8]与煤化工耦合碳减排创新示范项目。项目依托乌审旗丰富的煤炭资源、土地资源、可再生风光资源、矿井疏干水资源和环境承载力空间通过配套独立建设100万千瓦新能源其中风电50万千瓦、光伏50万千瓦20万千瓦/60万千瓦时储能为260万吨/年煤制烯烃及配套40万吨/年植入绿氢耦合[Ref_10]制烯烃供电。\n\n项目建成后将实现绿电、绿氧、绿氢耦合减排并减少外购甲醇量稳定烯烃产能最终实现降碳减碳目标。\n\n投稿与新闻线索陈女士 微信/手机13693626116 邮箱chenchen#bjxmail.com请将#改成@\n\n订阅北极星周刊精彩内容不再错过\n\n_*_ 姓名\n\n_*_ 手机\n\n**发送验证码**\n\n_*_ 邮箱\n\n公司\n\n岗位\n\n订阅\n\n特别声明北极星转载其他网站内容出于传递更多信息而非盈利之目的同时并不代表赞成其观点或证实其描述内容仅供参考。版权归原作者所有若有侵权请联系我们删除。\n\n凡来源注明北极星*网的内容为北极星原创,转载需获授权。\n\n**阅读下一篇**\n\n安徽新力电业高技术有限责任公司在招副值/集控副值(垃圾发电)、值长、主值、副值等 _16_ 个职位[Ref_13]绩效奖金 | 带薪年假 | 交通补助\n\n信息产业电子第十一设计研究院科技工程股份有限公司华东分院在招光伏电站运维站长-祁东、光伏电站运维人员-青海海南州等 _30_ 个职位[Ref_14]午餐补助 | 绩效奖金 | 五险一金\n\n新疆天池能源有限责任公司在招20亿立方煤制气电解水车间主任、安全专工、设备专工、班长、20亿立方煤制气项目动力项目部工艺专工等 _27_ 个职位[Ref_15]定期体检 | 交通补助 | 绩效奖金\n\n光大环保能源衡南有限公司在招会计、等 _1_ 个职位[Ref_16]带薪年假 | 通讯津贴 | 定期体检\n\n阳光智维科技股份有限公司在招浙江光伏运维值班员、湖北荆州光伏运维值长等 _538_ 个职位[Ref_17]绩效奖金 | 通讯津贴 | 节日礼物\n\n**最新热点****企业风采**\n\n## 登录注册\n\n**继续扫码**\n\n**二维码已失效** 点击刷新\n\n请使用微信扫一扫\n\n关注公众号完成登录\n\n## 绑定账号\n\n_+86_\n\n发送验证码\n\n绑定\n\n## 想要获取更精准资讯推荐?建议您完善以下信息~\n\n姓名 \n--- \n性别 \n出生日期 \n邮箱 \n所在地区 \n行业类别 \n工作经验 \n学历 \n公司名称 \n任职岗位 \n \n提交\n\n## 订阅成功\n\n我们将会第一时间为您推送相关内容\n\n* _扫码下载APP_\n * _扫码关注公众号_\n\n便捷入口\n\n关于我们\n\n网站运营\n\n北京火山动力网络技术有限公司\n\n北京市朝阳区世通国际大厦C座12层\n\n广告合作崔女士 18911066791\n\n陈女士 17701031159\n\n合作投稿陈女士 13693626116\n\n会展合作齐女士 13381061157\n\n会员咨询李先生 17718308761\n\n法务邮箱fw@bjxmail.com\n\nCopyright © 2024 Bjx.com.cn All Rights Reserved. 北京火山动力网络技术有限公司[Ref_18] 版权所有",
"text_link_map": {
"Ref_1": "https://companycenter.bjx.com.cn/account/register/",
"Ref_2": "https://companycenter.bjx.com.cn/account/logon/",
"Ref_3": "https://news.bjx.com.cn/",
"Ref_4": "https://qn.bjx.com.cn/",
"Ref_5": "https://qn.bjx.com.cn/qmzhly/",
"Ref_6": "https://qn.bjx.com.cn/qnzhlyqt/",
"Ref_7": "https://qn.bjx.com.cn/mq/",
"Ref_8": "https://news.bjx.com.cn/topics/luqing/",
"Ref_9": "https://news.bjx.com.cn/topics/qingneng/",
"Ref_10": "https://news.bjx.com.cn/topics/luqingouhe/",
"Ref_11": "https://news.bjx.com.cn/topics/luqing/",
"Ref_12": "https://news.bjx.com.cn/topics/luqingouhe/",
"Ref_13": "https://hr.bjx.com.cn/companys/9946/jobs_0_1/",
"Ref_14": "https://hr.bjx.com.cn/companys/24264/jobs_0_1/",
"Ref_15": "https://hr.bjx.com.cn/companys/29747/jobs_0_1/",
"Ref_16": "https://hr.bjx.com.cn/companys/73962/jobs_0_1/",
"Ref_17": "https://hr.bjx.com.cn/companys/104157/jobs_0_1/",
"Ref_18": "http://img.mybjx.net/theme/default/images/common/renzeng/yingyezhizheng.jpg"
}
}