From 77c3914d127aac13c90103f11c7a499286aef34c Mon Sep 17 00:00:00 2001
From: bigbrother666sh
Date: Thu, 16 Jan 2025 10:56:57 +0800
Subject: [PATCH] method to separate links area from content

---
 core/llms/openai_wrapper.py   |   2 +-
 core/scrapers/deep_scraper.py | 108 +++++++++++++-------
 core/scrapers/mp_scraper.py   |  24 ++++-
 test/deep_scraper_test.py     |  18 +---
 test/get_info_test.py         | 185 +++++++---------------------
 test/prompts.py               |  20 ++--
 6 files changed, 143 insertions(+), 214 deletions(-)

diff --git a/core/llms/openai_wrapper.py b/core/llms/openai_wrapper.py
index c632c1e..4b40cf5 100644
--- a/core/llms/openai_wrapper.py
+++ b/core/llms/openai_wrapper.py
@@ -54,7 +54,7 @@ async def openai_llm(messages: list, model: str, logger=None, **kwargs) -> str:
     finally:
         semaphore.release()

-    if logger:
+    if logger and resp:
         logger.debug(f'result:\n {response.choices[0]}')
         logger.debug(f'usage:\n {response.usage}')
     return resp
diff --git a/core/scrapers/deep_scraper.py b/core/scrapers/deep_scraper.py
index b9135b6..a55bac7 100644
--- a/core/scrapers/deep_scraper.py
+++ b/core/scrapers/deep_scraper.py
@@ -49,34 +49,34 @@ def normalize_url(url: str, base_url: str) -> str:
         return _ss[0] + '//' + '/'.join(_ss[1:])


-def deep_scraper(raw_markdown: str, base_url: str, used_img: list[str]) -> tuple[dict, list[str], dict]:
+def deep_scraper(raw_markdown: str, base_url: str, used_img: list[str]) -> tuple[dict, list[str], list[str]]:
     link_dict = {}
     to_be_recognized_by_visual_llm = {}

-    def check_url_text(text):
-        # text = text.strip()
-        # for special url formate from crawl4ai 0.4.247
-        text = re.sub(r'', '', text).strip()
+    # for special url formate from crawl4ai 0.4.247
+    raw_markdown = re.sub(r'', '', raw_markdown).strip()

-        # 处理图片标记 ![alt](src)
-        img_pattern = r'(!\[(.*?)\]\((.*?)\))'
-        matches = re.findall(img_pattern, text)
-        for _sec,alt, src in matches:
-            # 替换为新格式 §alt||src§
-            text = text.replace(_sec, f'§{alt}||{src}§', 1)
-
+    # 处理图片标记 ![alt](src)
+    i_pattern = r'(!\[(.*?)\]\((.*?)\))'
+    matches = re.findall(i_pattern, raw_markdown, re.DOTALL)
+    for _sec, alt, src in matches:
+        # 替换为新格式 §alt||src§
+        raw_markdown = raw_markdown.replace(_sec, f'§{alt}||{src}§', 1)
+
+    def check_url_text(text) -> tuple[int, str]:
+        score = 0
+        _valid_len = len(text.strip())
         # 找到所有[part0](part1)格式的片段
         link_pattern = r'(\[(.*?)\]\((.*?)\))'
-        matches = re.findall(link_pattern, text)
+        matches = re.findall(link_pattern, text, re.DOTALL)
         for _sec, link_text, link_url in matches:
-            print("found link sec:", _sec)
             # 处理 \"***\" 格式的片段
             quote_pattern = r'\"(.*?)\"'
             # 提取所有引号包裹的内容
-            _title = ''.join(re.findall(quote_pattern, link_url))
+            _title = ''.join(re.findall(quote_pattern, link_url, re.DOTALL))
             # 分离§§内的内容和后面的内容
             img_marker_pattern = r'§(.*?)\|\|(.*?)§'
-            inner_matches = re.findall(img_marker_pattern, link_text)
+            inner_matches = re.findall(img_marker_pattern, link_text, re.DOTALL)
             for alt, src in inner_matches:
                 link_text = link_text.replace(f'§{alt}||{src}§', '')
             link_text = link_text.strip()
@@ -113,20 +113,21 @@ def deep_scraper(raw_markdown: str, base_url: str, used_img: list[str]) -> tuple
                 link_text = img_alt

             real_url_pattern = r'<(.*?)>'
-            real_url = re.search(real_url_pattern, link_url)
+            real_url = re.search(real_url_pattern, link_url, re.DOTALL)
             if real_url:
                 _url = real_url.group(1).strip()
             else:
-                _url = re.sub(quote_pattern, '', link_url).strip()
+                _url = re.sub(quote_pattern, '', link_url, re.DOTALL).strip()

             if not _url or _url.startswith(('#', 'javascript:')):
                 text = text.replace(_sec, link_text, 1)
                 continue
+            score += 1
+            _valid_len = _valid_len - len(_sec)
             url = normalize_url(_url, base_url)
             _key = f"[{len(link_dict)+1}]"
             link_dict[_key] = url
             text = text.replace(_sec, link_text + _key, 1)
-
             # 检查链接是否是常见文件类型或顶级域名
             # todo: 最后提取是否添加到 more_link时或者主流程时再处理
             """
@@ -137,17 +138,17 @@ def deep_scraper(raw_markdown: str, base_url: str, used_img: list[str]) -> tuple
             """
         # 处理文本中的其他图片标记
         img_pattern = r'(§(.*?)\|\|(.*?)§)'
-        matches = re.findall(img_pattern, text)
-        remained_text = re.sub(img_pattern, '', text).strip()
-        remained_text_len = len(remained_text )
+        matches = re.findall(img_pattern, text, re.DOTALL)
+        remained_text = re.sub(img_pattern, '', text, re.DOTALL).strip()
+        remained_text_len = len(remained_text)
         for _sec, alt, src in matches:
-            if not src or src.startswith('#'):
+            if not src or src.startswith('#') or src not in used_img:
                 text = text.replace(_sec, alt, 1)
                 continue
             img_src = normalize_url(src, base_url)
             if not img_src:
                 text = text.replace(_sec, alt, 1)
-            elif src not in used_img or remained_text_len > 5 or len(alt) > 2:
+            elif remained_text_len > 5 or len(alt) > 2:
                 _key = f"[img{len(link_dict)+1}]"
                 link_dict[_key] = img_src
                 text = text.replace(_sec, alt + _key, 1)
@@ -165,7 +166,6 @@ def deep_scraper(raw_markdown: str, base_url: str, used_img: list[str]) -> tuple
                 _key = f"[img{len(link_dict)+1}]"
                 link_dict[_key] = img_src
                 text = text.replace(_sec, to_be_recognized_by_visual_llm[img_src] + _key, 1)
-
         # 处理文本中的"野 url"
         url_pattern = r'((?:https?://|www\.)[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[-A-Za-z0-9+&@#/%=~_|])'
         matches = re.findall(url_pattern, text)
@@ -174,22 +174,52 @@ def deep_scraper(raw_markdown: str, base_url: str, used_img: list[str]) -> tuple
             _key = f"[{len(link_dict)+1}]"
             link_dict[_key] = url
             text = text.replace(url, _key, 1)
+            score += 1
+            _valid_len = _valid_len - len(url)
+        # 统计换行符数量
+        newline_count = text.count(' * ')
+        score += newline_count
+        ratio = _valid_len/score if score != 0 else 999

-        return text
+        return ratio, text

     sections = raw_markdown.split('# ')  # use '# ' to avoid # in url
-    texts = []
-    for i, section in enumerate(sections):
-        # filter the possible navigate section and footer section
-        section_remain = re.sub(r'\[.*?]\(.*?\)', '', section).strip()
+    if len(sections) > 2:
+        _sec = sections[0]
+        section_remain = re.sub(r'\[.*?]\(.*?\)', '', _sec, re.DOTALL).strip()
         section_remain_len = len(section_remain)
-        total_links = len(re.findall(r'\[.*?]\(.*?\)', section))
-        print(f"section {i}")
-        print(f"ratio: {total_links/section_remain_len}")
+        total_links = len(re.findall(r'\[.*?]\(.*?\)', _sec, re.DOTALL))
+        ratio = total_links / section_remain_len if section_remain_len != 0 else 1
+        if ratio > 0.05:
+            print('this is a navigation section, will be removed')
+            print(ratio)
+            print(section_remain)
+            print('-' * 50)
+            sections = sections[1:]

+        _sec = sections[-1]
+        section_remain = re.sub(r'\[.*?]\(.*?\)', '', _sec, re.DOTALL).strip()
+        section_remain_len = len(section_remain)
+        if section_remain_len < 198:
+            print('this is a footer section, will be removed')
+            print(section_remain_len)
+            print(section_remain)
+            print('-' * 50)
+            sections = sections[:-1]
-        processed_p = [check_url_text(p) for p in section.split('\n\n')]
-        processed_p = [p for p in processed_p if p.strip()]
-        texts.append('\n\n'.join(processed_p))
-
-    return link_dict, texts, to_be_recognized_by_visual_llm
-
\ No newline at end of file
+    links_parts = []
+    contents = []
+    for section in sections:
+        ratio, text = check_url_text(section)
+        if ratio < 70:
+            print('this is a links part')
+            print(ratio)
+            print(text)
+            print('-' * 50)
+            links_parts.append(text)
+        else:
+            print('this is a content part')
+            print(ratio)
+            print(text)
+            print('-' * 50)
+            contents.append(text)
+    return link_dict, links_parts, contents
diff --git a/core/scrapers/mp_scraper.py b/core/scrapers/mp_scraper.py
index 8957a39..0dfff67 100644
--- a/core/scrapers/mp_scraper.py
+++ b/core/scrapers/mp_scraper.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 from bs4 import BeautifulSoup
 import re
 from crawl4ai import CrawlResult
@@ -12,10 +14,21 @@ text_elements = {
 }


-def mp_scraper(fetch_result: CrawlResult) -> ScraperResultData:
-    url = fetch_result.url
-    raw_html = fetch_result.html
-    cleaned_html = fetch_result.cleaned_html
+def mp_scraper(fetch_result: CrawlResult | dict) -> ScraperResultData:
+    if isinstance(fetch_result, dict):
+        url = fetch_result['url']
+        raw_html = fetch_result['html']
+        cleaned_html = fetch_result['cleaned_html']
+        raw_markdown = fetch_result['markdown']
+        media = fetch_result['media']['images']
+    elif isinstance(fetch_result, CrawlResult):
+        url = fetch_result.url
+        raw_html = fetch_result.html
+        cleaned_html = fetch_result.cleaned_html
+        raw_markdown = fetch_result.markdown
+        media = fetch_result.media['images']
+    else:
+        raise TypeError('fetch_result must be a CrawlResult or a dict')

     content = ''
     images = []
@@ -232,7 +245,8 @@ def mp_scraper(fetch_result: CrawlResult) -> ScraperResultData:
         else:
             author = None
             publish_date = None
-            content = fetch_result['markdown']
+            content = raw_markdown
+            images = [d['src'] for d in media]

     elif num_sub_divs >= 2:
         # 2.2 如果包含两个及以上子块
diff --git a/test/deep_scraper_test.py b/test/deep_scraper_test.py
index 8a4435e..d7cb1f0 100644
--- a/test/deep_scraper_test.py
+++ b/test/deep_scraper_test.py
@@ -85,26 +85,18 @@ if __name__ == '__main__':
     for file in files:
         if not file.endswith('.json'):
             continue
-        #print(f"processing {file} ...")
+        print(f"processing {file} ...")
         try:
             with open(file, 'r') as f:
                 html_sample = json.load(f)
             _url = html_sample['url']
             if _url.startswith('https://mp.weixin.qq.com'):
                 result = mp_scraper(html_sample)
-                #print(f'url: {result.url}')
-                #print(f'content: {result.content}')
-                #print(f'links: {result.links}')
-                #print(f'author: {result.author}')
-                #print(f'publish_date: {result.publish_date}')
-                #print(f'images: {len(result.images)}')
-                #for img in result.images:
-                #    print(img)
                 raw_markdown = result.content
                 used_img = result.images
             else:
                 raw_markdown = html_sample['markdown']
-                used_img = {d['src']: d['alt'] for d in html_sample['media']['images']}
+                used_img = [d['src'] for d in html_sample['media']['images']]
         except Exception as e:
             print('sample format error, try to use craw4ai_fething.py to get sample')
             print(f"error: {e}")
@@ -117,14 +109,14 @@ if __name__ == '__main__':
             base_url = base_url.rsplit('/', 1)[0] + '/'

         time_start = time.time()
-        link_dict, texts, to_be_recognized_by_visual_llm = deep_scraper(raw_markdown, base_url, used_img)
+        link_dict, links_part, contents = deep_scraper(raw_markdown, base_url, used_img)
         time_end = time.time()
         #print(f"time cost for html: {time_end - time_start}s")

         result = {
             "link_dict": link_dict,
-            "texts": texts,
-            "to_be_recognized_by_visual_llm": to_be_recognized_by_visual_llm,
+            "links_part": links_part,
+            "contents": contents,
         }
         record_folder = file.replace('.json', '')
         os.makedirs(record_folder, exist_ok=True)
diff --git a/test/get_info_test.py b/test/get_info_test.py
index f82d073..a3bec50 100644
--- a/test/get_info_test.py
+++ b/test/get_info_test.py
@@ -4,168 +4,62 @@
 import json
 import asyncio
 import time
 from prompts import *
-# prompt 要加上今天是…………
+from datetime import datetime
+
 current_dir = os.path.dirname(os.path.abspath(__file__))
 project_root = os.path.dirname(current_dir)  # get parent dir
 sys.path.append(project_root)

 from core.llms.openai_wrapper import openai_llm as llm

-models = ['Qwen/Qwen2.5-14B-Instruct', 'Qwen/Qwen2.5-32B-Instruct', 'deepseek-ai/DeepSeek-V2.5', 'Qwen/Qwen2.5-72B-Instruct']
+models = ['Qwen/Qwen2.5-7B-Instruct', 'Qwen/Qwen2.5-14B-Instruct', 'Qwen/Qwen2.5-32B-Instruct', 'deepseek-ai/DeepSeek-V2.5']

-async def main(link_dict: dict, text: str, record_file: str, prompts: list, focus_points: list):
+async def main(texts: list[str], record_file: str, sys_prompt: str, focus_points: list):
     # first get more links
-    _to_be_processed = []
-    link_map = {}
-    for i, (url, des) in enumerate(link_dict.items()):
-        des = des.replace('\n', ' ')
-        _to_be_processed.append(f'//{des}//')
-        link_map[f' 2048:
-            content = f'\n{text_batch}\n\n{text_link_suffix}'
+    cache = []
+    while _texts:
+        t = _texts.pop(0)
+        text_batch = f'{text_batch}{t}# '
+        if len(text_batch) > 100 or len(_texts) == 0:
+            content = f'\n{text_batch}\n\n{get_info_suffix}'
             result = await llm(
-                [{'role': 'system', 'content': prompts[0]}, {'role': 'user', 'content': content}],
+                [{'role': 'system', 'content': sys_prompt}, {'role': 'user', 'content': content}],
                 model=model, temperature=0.1)
-            print(f"llm output\n{result}")
+            #print(f"llm output\n{result}")
             text_batch = ''
             result = re.findall(r'\"\"\"(.*?)\"\"\"', result, re.DOTALL)
-            result = result[-1]
-            for item in result.split('\n'):
-                if not item:
-                    continue
-                segs = item.split('>')
-                if len(segs) != 2:
-                    get_more_links_hallucination_times += 1
-                    continue
-                _index, focus = segs
-                _index = _index.strip()
-                focus = focus.strip().strip('//')
-                if focus == 'NA':
-                    continue
-                if focus not in focus_points or _index not in link_map:
-                    get_more_links_hallucination_times += 1
-                    continue
-                more_links.add(link_map[_index])
-
-    if text_batch:
-        content = f'\n{text_batch}\n\n{text_link_suffix}'
-        result = await llm(
-            [{'role': 'system', 'content': prompts[0]}, {'role': 'user', 'content': content}],
-            model=model, temperature=0.1)
-        print(f"llm output\n{result}")
-        result = re.findall(r'\"\"\"(.*?)\"\"\"', result, re.DOTALL)
-        result = result[-1]
-        for item in result.split('\n'):
-            if not item:
-                continue
-            segs = item.split('>')
-            if len(segs) != 2:
-                get_more_links_hallucination_times += 1
-                continue
-            _index, focus = segs
-            _index = _index.strip()
-            focus = focus.strip().strip('//')
-            if focus == 'NA':
-                continue
-            if focus not in focus_points or _index not in link_map:
-                get_more_links_hallucination_times += 1
-                continue
-            more_links.add(link_map[_index])
+            if result: cache.append(result[-1])

-    t1 = time.time()
-    get_more_links_time = t1 - start_time
-    print(f"get more links time: {get_more_links_time}")
-
-    # second get more infos
-    lines = text.split('\n')
-    cache = set()
-    text_batch = ''
-    for line in lines:
-        text_batch = f'{text_batch}{line}\n'
-        if len(text_batch) > 5000:
-            #print(f"text_batch\n{text_batch}")
-            content = f'\n{text_batch}\n\n{text_info_suffix}'
-            result = await llm(
-                [{'role': 'system', 'content': prompts[1]}, {'role': 'user', 'content': content}],
-                model=model, temperature=0.1)
-            print(f"llm output\n{result}")
-            result = re.findall(r'\"\"\"(.*?)\"\"\"', result, re.DOTALL)
-            cache.add(result[-1])
-
-    get_infos_hallucination_times = 0
     infos = []
     for item in cache:
         segs = item.split('//')
-        i = 0
-        while i < len(segs) - 1:
-            focus = segs[i].strip()
-            if not focus:
-                i += 1
-                continue
-            if focus not in focus_points:
-                get_infos_hallucination_times += 1
-                i += 1
-                continue
-            content = segs[i+1].strip().strip('摘要').strip(':').strip(':')
-            i += 2
-            if content and content != 'NA':
-                infos.append(f'{focus}: {content}')
-    """
-    maybe can use embedding retrieval to judge
-    """
-    t2 = time.time()
-    get_infos_time = t2 - t1
+        infos.extend([s.strip() for s in segs if s.strip()])
+    for content in infos:
+        if content not in judge_text:
+            print(f'not in raw content:\n{content}')
+            hallucination_times += 1
+
+    t1 = time.time()
+    get_infos_time = t1 - start_time
     print(f"get more infos time: {get_infos_time}")
-
-    # get author and publish date from text
-    if len(text) > 1024:
-        usetext = f'{text[:500]}......{text[-500:]}'
-    else:
-        usetext = text
-    content = f'\n{usetext}\n\n\n{text_ap_suffix}'
-    llm_output = await llm([{'role': 'system', 'content': text_ap_system}, {'role': 'user', 'content': content}],
-                           model=model, max_tokens=50, temperature=0.1)
-    print(f"llm output: {llm_output}")
-    ap_ = llm_output.strip().strip('"')
-
     print("*" * 12)
     print('\n\n')

-    more_links_to_record = [f'{link_dict[link]}:{link}' for link in more_links]
-    more_links_to_record = '\n'.join(more_links_to_record)
     infos_to_record = '\n'.join(infos)
     with open(record_file, 'a') as f:
         f.write(f"llm model: {model}\n")
-        f.write(f"get more links time: {get_more_links_time} s\n")
-        f.write(f"bad generate times during get more links: {get_more_links_hallucination_times}\n")
-        f.write(f"get more infos time: {get_infos_time} s\n")
-        f.write(f"bad generate times during get more infos: {get_infos_hallucination_times}\n")
-        f.write(f"total more links: {len(more_links)}\n")
-        f.write(f"total infos: {len(infos)}\n")
-        f.write(f"author and publish time: {ap_}\n")
-        f.write(f"infos: \n{infos_to_record}\n")
-        f.write(f"more links: \n{more_links_to_record}\n")
+        f.write(f"process time: {get_infos_time} s\n")
+        f.write(f"bad generate times: {hallucination_times}\n")
+        f.write(f"total segments: {len(infos)}\n")
+        f.write(f"segments: \n{infos_to_record}\n")
         f.write("*" * 12)
         f.write('\n\n')

@@ -190,9 +84,8 @@ if __name__ == '__main__':
         if expl:
             focus_statement = f"{focus_statement}解释:{expl}\n"

-    get_info_system = text_info_system.replace('{focus_statement}', focus_statement)
-    get_link_system = text_link_system.replace('{focus_statement}', focus_statement)
-    prompts = [get_link_system, get_info_system]
+    get_info_system = get_info_system.replace('{focus_statement}', focus_statement)
+    system_prompt = f"今天的日期是{datetime.now().strftime('%Y-%m-%d')},{get_info_system}"

     focus_points = [item["focuspoint"] for item in focus_points]
     time_stamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())
@@ -205,17 +98,11 @@ if __name__ == '__main__':
             continue
         _path = os.path.join(sample_dir, dirs)
         print(f'start testing {_path}')
-        if 'sample_recognized.json' not in os.listdir(_path):
-            print(f'{dirs} sample_recognized.json not found, use sample.json instead')
-            if 'sample.json' not in os.listdir(_path):
-                print(f'{dirs} sample.json not found, skip')
-                continue
-            sample_recognized = json.load(open(os.path.join(_path, 'sample.json'), 'r'))
-        else:
-            sample_recognized = json.load(open(os.path.join(_path, 'sample_recognized.json'), 'r'))
-
-        link_dict = sample_recognized['link_dict']
-        text = sample_recognized['text']
+        if 'sample.json' not in os.listdir(_path):
+            print(f'{dirs} sample.json not found, skip')
+            continue
+        sample = json.load(open(os.path.join(_path, 'sample.json'), 'r'))
+
         with open(record_file, 'a') as f:
             f.write(f"raw materials in: {dirs}\n\n")
-        asyncio.run(main(link_dict, text, record_file, prompts, focus_points))
+        asyncio.run(main(sample['texts'], record_file, system_prompt, focus_points))
diff --git a/test/prompts.py b/test/prompts.py
index 4cc8caa..0622f3c 100644
--- a/test/prompts.py
+++ b/test/prompts.py
@@ -1,15 +1,21 @@
-get_info_system = '''你将被给到一段使用标签包裹的网页文本,你的任务是从前到后仔细阅读文本,并提取出所有与如下关注点之一相关的部分。关注点列表及其解释如下:
+get_info_system = '''你将被给到一段使用标签包裹的网页文本,你的任务是从前到后仔细阅读文本,并摘抄与如下关注点相关的原文片段。关注点及其解释如下:
 {focus_statement}\n
 在进行提取时,请遵循以下原则:
-- 理解每个关注点的含义以及进一步的解释(如有),确保提取的内容与关注点强相关并符合解释(如有)的范围
-- 有必要的话,可以连同相关的上下文一并提取,从而保证提取出的内容信息完备、意思完整'''
+- 理解关注点的含义以及进一步的解释(如有),确保提取的内容与关注点强相关并符合解释(如有)的范围
+- 在满足上面原则的前提下,摘抄出全部相关片段
+- 摘抄出的原文片段务必保持原文原样,包括标点符号都不要更改,尤其注意保留类似"[3]"这样的引用标记'''

-get_info_suffix = '''如果网页文本中包含关注点相关的部分,请按照以下json格式输出:
-"""{"focus": 关注点, "content": 提取的内容}"""
-
-如果有多个相关部分,请逐条输出,每一条都用三引号包裹,三引号内不要有其他内容。'''
+get_info_suffix = '''请将摘抄出的原文片段用"//"分隔,并整体用三引号包裹后输出。三引号内不要有其他内容,如果文本中不包含任何与关注点相关的内容则保持三引号内为空。
+如下是输出格式示例::
+"""
+原文片段1
+//
+原文片段2
+//
+...
+"""'''

 text_info_system = '''你将被给到一段使用标签包裹的网页文本,请分别按如下关注点对网页文本提炼摘要。关注点列表及其解释如下: