mirror of https://github.com/TeamWiseFlow/wiseflow.git (synced 2025-02-02 18:28:46 +08:00)

commit 77c3914d12 (parent aa49216acb)

    method to separate links area from content
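The core of this commit is a link-density heuristic in deep_scraper: check_url_text now returns a (ratio, text) pair, where ratio is roughly how many characters of plain text remain per extracted link, and whole '# '-delimited sections with a low ratio are collected as link areas rather than content. The following is a minimal standalone sketch of that idea, not the committed implementation: the function name and sample strings are invented for illustration, the 70/999 constants mirror the diff, and the extra scoring the real check_url_text does for bullet markers and bare URLs is deliberately omitted.

import re

def link_density_split(markdown_sections: list[str], threshold: float = 70.0) -> tuple[list[str], list[str]]:
    """Toy illustration: sections dominated by links go to links_parts, the rest to contents."""
    link_pattern = r'\[(.*?)\]\((.*?)\)'
    links_parts, contents = [], []
    for section in markdown_sections:
        links = re.findall(link_pattern, section, re.DOTALL)
        # "valid text" is whatever remains once the link markup is stripped out
        remained = re.sub(link_pattern, '', section, flags=re.DOTALL).strip()
        score = len(links)
        ratio = len(remained) / score if score else 999
        (links_parts if ratio < threshold else contents).append(section)
    return links_parts, contents

if __name__ == '__main__':
    nav = "[Home](/) [About](/about) [Blog](/blog) [Contact](/contact)"
    body = "This paragraph is mostly prose, long enough that stripping its single [reference](https://example.com) still leaves well over seventy characters of ordinary text behind."
    print(link_density_split([nav, body]))  # nav lands in links_parts, body in contents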
@@ -54,7 +54,7 @@ async def openai_llm(messages: list, model: str, logger=None, **kwargs) -> str:
     finally:
         semaphore.release()
 
-    if logger:
+    if logger and resp:
         logger.debug(f'result:\n {response.choices[0]}')
         logger.debug(f'usage:\n {response.usage}')
     return resp
@@ -49,34 +49,34 @@ def normalize_url(url: str, base_url: str) -> str:
     return _ss[0] + '//' + '/'.join(_ss[1:])
 
 
-def deep_scraper(raw_markdown: str, base_url: str, used_img: list[str]) -> tuple[dict, list[str], dict]:
+def deep_scraper(raw_markdown: str, base_url: str, used_img: list[str]) -> tuple[dict, list[str], list[str]]:
     link_dict = {}
     to_be_recognized_by_visual_llm = {}
-    def check_url_text(text):
-        # text = text.strip()
-        # for special url formate from crawl4ai 0.4.247
-        text = re.sub(r'<javascript:.*?>', '<javascript:>', text).strip()
-
-        # 处理图片标记 ![alt](src)
-        img_pattern = r'(!\[(.*?)\]\((.*?)\))'
-        matches = re.findall(img_pattern, text)
-        for _sec, alt, src in matches:
-            # 替换为新格式 §alt||src§
-            text = text.replace(_sec, f'§{alt}||{src}§', 1)
+    # for special url formate from crawl4ai 0.4.247
+    raw_markdown = re.sub(r'<javascript:.*?>', '<javascript:>', raw_markdown).strip()
+
+    # 处理图片标记 ![alt](src)
+    i_pattern = r'(!\[(.*?)\]\((.*?)\))'
+    matches = re.findall(i_pattern, raw_markdown, re.DOTALL)
+    for _sec, alt, src in matches:
+        # 替换为新格式 §alt||src§
+        raw_markdown = raw_markdown.replace(_sec, f'§{alt}||{src}§', 1)
 
+    def check_url_text(text) -> tuple[int, str]:
+        score = 0
+        _valid_len = len(text.strip())
         # 找到所有[part0](part1)格式的片段
         link_pattern = r'(\[(.*?)\]\((.*?)\))'
-        matches = re.findall(link_pattern, text)
+        matches = re.findall(link_pattern, text, re.DOTALL)
         for _sec, link_text, link_url in matches:
-            print("found link sec:", _sec)
             # 处理 \"***\" 格式的片段
             quote_pattern = r'\"(.*?)\"'
             # 提取所有引号包裹的内容
-            _title = ''.join(re.findall(quote_pattern, link_url))
+            _title = ''.join(re.findall(quote_pattern, link_url, re.DOTALL))
 
             # 分离§§内的内容和后面的内容
             img_marker_pattern = r'§(.*?)\|\|(.*?)§'
-            inner_matches = re.findall(img_marker_pattern, link_text)
+            inner_matches = re.findall(img_marker_pattern, link_text, re.DOTALL)
             for alt, src in inner_matches:
                 link_text = link_text.replace(f'§{alt}||{src}§', '')
             link_text = link_text.strip()
@@ -113,20 +113,21 @@ def deep_scraper(raw_markdown: str, base_url: str, used_img: list[str]) -> tuple
                 link_text = img_alt
 
             real_url_pattern = r'<(.*?)>'
-            real_url = re.search(real_url_pattern, link_url)
+            real_url = re.search(real_url_pattern, link_url, re.DOTALL)
             if real_url:
                 _url = real_url.group(1).strip()
             else:
-                _url = re.sub(quote_pattern, '', link_url).strip()
+                _url = re.sub(quote_pattern, '', link_url, re.DOTALL).strip()
 
             if not _url or _url.startswith(('#', 'javascript:')):
                 text = text.replace(_sec, link_text, 1)
                 continue
+            score += 1
+            _valid_len = _valid_len - len(_sec)
             url = normalize_url(_url, base_url)
             _key = f"[{len(link_dict)+1}]"
             link_dict[_key] = url
             text = text.replace(_sec, link_text + _key, 1)
 
            # 检查链接是否是常见文件类型或顶级域名
            # todo: 最后提取是否添加到 more_link时或者主流程时再处理
            """
@@ -137,17 +138,17 @@ def deep_scraper(raw_markdown: str, base_url: str, used_img: list[str]) -> tuple
            """
        # 处理文本中的其他图片标记
        img_pattern = r'(§(.*?)\|\|(.*?)§)'
-        matches = re.findall(img_pattern, text)
-        remained_text = re.sub(img_pattern, '', text).strip()
+        matches = re.findall(img_pattern, text, re.DOTALL)
+        remained_text = re.sub(img_pattern, '', text, re.DOTALL).strip()
        remained_text_len = len(remained_text)
        for _sec, alt, src in matches:
-            if not src or src.startswith('#'):
+            if not src or src.startswith('#') or src not in used_img:
                text = text.replace(_sec, alt, 1)
                continue
            img_src = normalize_url(src, base_url)
            if not img_src:
                text = text.replace(_sec, alt, 1)
-            elif src not in used_img or remained_text_len > 5 or len(alt) > 2:
+            elif remained_text_len > 5 or len(alt) > 2:
                _key = f"[img{len(link_dict)+1}]"
                link_dict[_key] = img_src
                text = text.replace(_sec, alt + _key, 1)
@@ -165,7 +166,6 @@ def deep_scraper(raw_markdown: str, base_url: str, used_img: list[str]) -> tuple
                _key = f"[img{len(link_dict)+1}]"
                link_dict[_key] = img_src
                text = text.replace(_sec, to_be_recognized_by_visual_llm[img_src] + _key, 1)
-
        # 处理文本中的"野 url"
        url_pattern = r'((?:https?://|www\.)[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[-A-Za-z0-9+&@#/%=~_|])'
        matches = re.findall(url_pattern, text)
@@ -174,22 +174,52 @@ def deep_scraper(raw_markdown: str, base_url: str, used_img: list[str]) -> tuple
            _key = f"[{len(link_dict)+1}]"
            link_dict[_key] = url
            text = text.replace(url, _key, 1)
+            score += 1
+            _valid_len = _valid_len - len(url)
+        # 统计换行符数量
+        newline_count = text.count(' * ')
+        score += newline_count
+        ratio = _valid_len/score if score != 0 else 999
 
-        return text
+        return ratio, text
 
     sections = raw_markdown.split('# ') # use '# ' to avoid # in url
-    texts = []
-    for i, section in enumerate(sections):
-        # filter the possible navigate section and footer section
-        section_remain = re.sub(r'\[.*?]\(.*?\)', '', section).strip()
-        section_remain_len = len(section_remain)
-        total_links = len(re.findall(r'\[.*?]\(.*?\)', section))
-        print(f"section {i}")
-        print(f"ratio: {total_links/section_remain_len}")
-        processed_p = [check_url_text(p) for p in section.split('\n\n')]
-        processed_p = [p for p in processed_p if p.strip()]
-        texts.append('\n\n'.join(processed_p))
-
-    return link_dict, texts, to_be_recognized_by_visual_llm
+    if len(sections) > 2:
+        _sec = sections[0]
+        section_remain = re.sub(r'\[.*?]\(.*?\)', '', _sec, re.DOTALL).strip()
+        section_remain_len = len(section_remain)
+        total_links = len(re.findall(r'\[.*?]\(.*?\)', _sec, re.DOTALL))
+        ratio = total_links / section_remain_len if section_remain_len != 0 else 1
+        if ratio > 0.05:
+            print('this is a navigation section, will be removed')
+            print(ratio)
+            print(section_remain)
+            print('-' * 50)
+            sections = sections[1:]
+        _sec = sections[-1]
+        section_remain = re.sub(r'\[.*?]\(.*?\)', '', _sec, re.DOTALL).strip()
+        section_remain_len = len(section_remain)
+        if section_remain_len < 198:
+            print('this is a footer section, will be removed')
+            print(section_remain_len)
+            print(section_remain)
+            print('-' * 50)
+            sections = sections[:-1]
+
+    links_parts = []
+    contents = []
+    for section in sections:
+        ratio, text = check_url_text(section)
+        if ratio < 70:
+            print('this is a links part')
+            print(ratio)
+            print(text)
+            print('-' * 50)
+            links_parts.append(text)
+        else:
+            print('this is a content part')
+            print(ratio)
+            print(text)
+            print('-' * 50)
+            contents.append(text)
+    return link_dict, links_parts, contents
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 from bs4 import BeautifulSoup
 import re
 from crawl4ai import CrawlResult
@@ -12,10 +14,21 @@ text_elements = {
 }
 
 
-def mp_scraper(fetch_result: CrawlResult) -> ScraperResultData:
-    url = fetch_result.url
-    raw_html = fetch_result.html
-    cleaned_html = fetch_result.cleaned_html
+def mp_scraper(fetch_result: CrawlResult | dict) -> ScraperResultData:
+    if isinstance(fetch_result, dict):
+        url = fetch_result['url']
+        raw_html = fetch_result['html']
+        cleaned_html = fetch_result['cleaned_html']
+        raw_markdown = fetch_result['markdown']
+        media = fetch_result['media']['images']
+    elif isinstance(fetch_result, CrawlResult):
+        url = fetch_result.url
+        raw_html = fetch_result.html
+        cleaned_html = fetch_result.cleaned_html
+        raw_markdown = fetch_result.markdown
+        media = fetch_result.media['images']
+    else:
+        raise TypeError('fetch_result must be a CrawlResult or a dict')
 
     content = ''
     images = []
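The dict branch added here reads the same keys the repo's test scripts dump when they cache crawl4ai fetch results, so a saved sample can be replayed through mp_scraper without rebuilding a CrawlResult. A hedged sketch of the expected shape follows; every field value is a placeholder, and the commented call only indicates which branch such a dict would take.

# Placeholder sample mirroring the keys the new dict branch reads;
# real files of this shape come from the crawl4ai fetching test script.
html_sample = {
    'url': 'https://mp.weixin.qq.com/s/placeholder',
    'html': '<html>...</html>',
    'cleaned_html': '<div>...</div>',
    'markdown': '# title\n\nbody text',
    'media': {'images': [{'src': 'https://example.com/cover.png', 'alt': 'cover'}]},
}
# result = mp_scraper(html_sample)  # would dispatch into the isinstance(fetch_result, dict) branch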
@@ -232,7 +245,8 @@ def mp_scraper(fetch_result: CrawlResult) -> ScraperResultData:
        else:
            author = None
            publish_date = None
-            content = fetch_result['markdown']
+            content = raw_markdown
+            images = [d['src'] for d in media]
 
    elif num_sub_divs >= 2:
        # 2.2 如果包含两个及以上子块
@@ -85,26 +85,18 @@ if __name__ == '__main__':
     for file in files:
         if not file.endswith('.json'): continue
 
-        #print(f"processing {file} ...")
+        print(f"processing {file} ...")
         try:
             with open(file, 'r') as f:
                 html_sample = json.load(f)
             _url = html_sample['url']
             if _url.startswith('https://mp.weixin.qq.com'):
                 result = mp_scraper(html_sample)
-                #print(f'url: {result.url}')
-                #print(f'content: {result.content}')
-                #print(f'links: {result.links}')
-                #print(f'author: {result.author}')
-                #print(f'publish_date: {result.publish_date}')
-                #print(f'images: {len(result.images)}')
-                #for img in result.images:
-                #    print(img)
                 raw_markdown = result.content
                 used_img = result.images
             else:
                 raw_markdown = html_sample['markdown']
-                used_img = {d['src']: d['alt'] for d in html_sample['media']['images']}
+                used_img = [d['src'] for d in html_sample['media']['images']]
         except Exception as e:
             print('sample format error, try to use craw4ai_fething.py to get sample')
             print(f"error: {e}")
@@ -117,14 +109,14 @@ if __name__ == '__main__':
         base_url = base_url.rsplit('/', 1)[0] + '/'
 
         time_start = time.time()
-        link_dict, texts, to_be_recognized_by_visual_llm = deep_scraper(raw_markdown, base_url, used_img)
+        link_dict, links_part, contents = deep_scraper(raw_markdown, base_url, used_img)
         time_end = time.time()
         #print(f"time cost for html: {time_end - time_start}s")
 
         result = {
             "link_dict": link_dict,
-            "texts": texts,
-            "to_be_recognized_by_visual_llm": to_be_recognized_by_visual_llm,
+            "links_part": links_part,
+            "contents": contents,
         }
         record_folder = file.replace('.json', '')
         os.makedirs(record_folder, exist_ok=True)
@@ -4,168 +4,62 @@ import json
 import asyncio
 import time
 from prompts import *
-# prompt 要加上今天是…………
+from datetime import datetime
 
 current_dir = os.path.dirname(os.path.abspath(__file__))
 project_root = os.path.dirname(current_dir) # get parent dir
 sys.path.append(project_root)
 
 from core.llms.openai_wrapper import openai_llm as llm
 
-models = ['Qwen/Qwen2.5-14B-Instruct', 'Qwen/Qwen2.5-32B-Instruct', 'deepseek-ai/DeepSeek-V2.5', 'Qwen/Qwen2.5-72B-Instruct']
+models = ['Qwen/Qwen2.5-7B-Instruct', 'Qwen/Qwen2.5-14B-Instruct', 'Qwen/Qwen2.5-32B-Instruct', 'deepseek-ai/DeepSeek-V2.5']
 
-async def main(link_dict: dict, text: str, record_file: str, prompts: list, focus_points: list):
+async def main(texts: list[str], record_file: str, sys_prompt: str, focus_points: list):
     # first get more links
-    _to_be_processed = []
-    link_map = {}
-    for i, (url, des) in enumerate(link_dict.items()):
-        des = des.replace('\n', ' ')
-        _to_be_processed.append(f'<t{i+1}>//{des}//')
-        link_map[f'<t{i+1}'] = url
+    judge_text = ''.join(texts)
 
     for model in models:
+        _texts = texts.copy()
         print(f"running {model} ...")
         start_time = time.time()
-        get_more_links_hallucination_times = 0
-        more_links = set()
+        hallucination_times = 0
         text_batch = ''
-        for t in _to_be_processed:
-            text_batch = f'{text_batch}{t}\n'
-            if len(text_batch) > 2048:
-                content = f'<text>\n{text_batch}</text>\n\n{text_link_suffix}'
+        cache = []
+        while _texts:
+            t = _texts.pop(0)
+            text_batch = f'{text_batch}{t}# '
+            if len(text_batch) > 100 or len(_texts) == 0:
+                content = f'<text>\n{text_batch}</text>\n\n{get_info_suffix}'
                 result = await llm(
-                    [{'role': 'system', 'content': prompts[0]}, {'role': 'user', 'content': content}],
+                    [{'role': 'system', 'content': sys_prompt}, {'role': 'user', 'content': content}],
                     model=model, temperature=0.1)
-                print(f"llm output\n{result}")
+                #print(f"llm output\n{result}")
                 text_batch = ''
                 result = re.findall(r'\"\"\"(.*?)\"\"\"', result, re.DOTALL)
-                result = result[-1]
-                for item in result.split('\n'):
-                    if not item:
-                        continue
-                    segs = item.split('>')
-                    if len(segs) != 2:
-                        get_more_links_hallucination_times += 1
-                        continue
-                    _index, focus = segs
-                    _index = _index.strip()
-                    focus = focus.strip().strip('//')
-                    if focus == 'NA':
-                        continue
-                    if focus not in focus_points or _index not in link_map:
-                        get_more_links_hallucination_times += 1
-                        continue
-                    more_links.add(link_map[_index])
-
-        if text_batch:
-            content = f'<text>\n{text_batch}</text>\n\n{text_link_suffix}'
-            result = await llm(
-                [{'role': 'system', 'content': prompts[0]}, {'role': 'user', 'content': content}],
-                model=model, temperature=0.1)
-            print(f"llm output\n{result}")
-            result = re.findall(r'\"\"\"(.*?)\"\"\"', result, re.DOTALL)
-            result = result[-1]
-            for item in result.split('\n'):
-                if not item:
-                    continue
-                segs = item.split('>')
-                if len(segs) != 2:
-                    get_more_links_hallucination_times += 1
-                    continue
-                _index, focus = segs
-                _index = _index.strip()
-                focus = focus.strip().strip('//')
-                if focus == 'NA':
-                    continue
-                if focus not in focus_points or _index not in link_map:
-                    get_more_links_hallucination_times += 1
-                    continue
-                more_links.add(link_map[_index])
-
-        t1 = time.time()
-        get_more_links_time = t1 - start_time
-        print(f"get more links time: {get_more_links_time}")
-
-        # second get more infos
-        lines = text.split('\n')
-        cache = set()
-        text_batch = ''
-        for line in lines:
-            text_batch = f'{text_batch}{line}\n'
-            if len(text_batch) > 5000:
-                #print(f"text_batch\n{text_batch}")
-                content = f'<text>\n{text_batch}</text>\n\n{text_info_suffix}'
-                result = await llm(
-                    [{'role': 'system', 'content': prompts[1]}, {'role': 'user', 'content': content}],
-                    model=model, temperature=0.1)
-                print(f"llm output\n{result}")
-                text_batch = ''
-                result = re.findall(r'\"\"\"(.*?)\"\"\"', result, re.DOTALL)
-                cache.add(result[-1])
-
-        if text_batch:
-            #print(f"text_batch\n{text_batch}")
-            content = f'<text>\n{text_batch}</text>\n\n{text_info_suffix}'
-            result = await llm(
-                [{'role': 'system', 'content': prompts[1]}, {'role': 'user', 'content': content}],
-                model=model, temperature=0.1)
-            print(f"llm output\n{result}")
-            result = re.findall(r'\"\"\"(.*?)\"\"\"', result, re.DOTALL)
-            cache.add(result[-1])
-
-        get_infos_hallucination_times = 0
+                if result: cache.append(result[-1])
+
         infos = []
         for item in cache:
             segs = item.split('//')
-            i = 0
-            while i < len(segs) - 1:
-                focus = segs[i].strip()
-                if not focus:
-                    i += 1
-                    continue
-                if focus not in focus_points:
-                    get_infos_hallucination_times += 1
-                    i += 1
-                    continue
-                content = segs[i+1].strip().strip('摘要').strip(':').strip(':')
-                i += 2
-                if content and content != 'NA':
-                    infos.append(f'{focus}: {content}')
-                """
-                maybe can use embedding retrieval to judge
-                """
-        t2 = time.time()
-        get_infos_time = t2 - t1
+            infos.extend([s.strip() for s in segs if s.strip()])
+        for content in infos:
+            if content not in judge_text:
+                print(f'not in raw content:\n{content}')
+                hallucination_times += 1
+
+        t1 = time.time()
+        get_infos_time = t1 - start_time
         print(f"get more infos time: {get_infos_time}")
 
-        # get author and publish date from text
-        if len(text) > 1024:
-            usetext = f'{text[:500]}......{text[-500:]}'
-        else:
-            usetext = text
-        content = f'<text>\n{usetext}\n</text>\n\n{text_ap_suffix}'
-        llm_output = await llm([{'role': 'system', 'content': text_ap_system}, {'role': 'user', 'content': content}],
-                               model=model, max_tokens=50, temperature=0.1)
-        print(f"llm output: {llm_output}")
-        ap_ = llm_output.strip().strip('"')
-
         print("*" * 12)
         print('\n\n')
 
-        more_links_to_record = [f'{link_dict[link]}:{link}' for link in more_links]
-        more_links_to_record = '\n'.join(more_links_to_record)
         infos_to_record = '\n'.join(infos)
 
         with open(record_file, 'a') as f:
             f.write(f"llm model: {model}\n")
-            f.write(f"get more links time: {get_more_links_time} s\n")
-            f.write(f"bad generate times during get more links: {get_more_links_hallucination_times}\n")
-            f.write(f"get more infos time: {get_infos_time} s\n")
-            f.write(f"bad generate times during get more infos: {get_infos_hallucination_times}\n")
-            f.write(f"total more links: {len(more_links)}\n")
-            f.write(f"total infos: {len(infos)}\n")
-            f.write(f"author and publish time: {ap_}\n")
-            f.write(f"infos: \n{infos_to_record}\n")
-            f.write(f"more links: \n{more_links_to_record}\n")
+            f.write(f"process time: {get_infos_time} s\n")
+            f.write(f"bad generate times: {hallucination_times}\n")
+            f.write(f"total segments: {len(infos)}\n")
+            f.write(f"segments: \n{infos_to_record}\n")
             f.write("*" * 12)
             f.write('\n\n')
@@ -190,9 +84,8 @@ if __name__ == '__main__':
         if expl:
             focus_statement = f"{focus_statement}解释:{expl}\n"
 
-    get_info_system = text_info_system.replace('{focus_statement}', focus_statement)
-    get_link_system = text_link_system.replace('{focus_statement}', focus_statement)
-    prompts = [get_link_system, get_info_system]
+    get_info_system = get_info_system.replace('{focus_statement}', focus_statement)
+    system_prompt = f"今天的日期是{datetime.now().strftime('%Y-%m-%d')},{get_info_system}"
     focus_points = [item["focuspoint"] for item in focus_points]
 
     time_stamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())
@@ -205,17 +98,11 @@ if __name__ == '__main__':
             continue
         _path = os.path.join(sample_dir, dirs)
         print(f'start testing {_path}')
-        if 'sample_recognized.json' not in os.listdir(_path):
-            print(f'{dirs} sample_recognized.json not found, use sample.json instead')
-            if 'sample.json' not in os.listdir(_path):
-                print(f'{dirs} sample.json not found, skip')
-                continue
-            sample_recognized = json.load(open(os.path.join(_path, 'sample.json'), 'r'))
-        else:
-            sample_recognized = json.load(open(os.path.join(_path, 'sample_recognized.json'), 'r'))
+        if 'sample.json' not in os.listdir(_path):
+            print(f'{dirs} sample.json not found, skip')
+            continue
+        sample = json.load(open(os.path.join(_path, 'sample.json'), 'r'))
 
-        link_dict = sample_recognized['link_dict']
-        text = sample_recognized['text']
         with open(record_file, 'a') as f:
             f.write(f"raw materials in: {dirs}\n\n")
-        asyncio.run(main(link_dict, text, record_file, prompts, focus_points))
+        asyncio.run(main(sample['texts'], record_file, system_prompt, focus_points))
@@ -1,15 +1,21 @@
-get_info_system = '''你将被给到一段使用<text></text>标签包裹的网页文本,你的任务是从前到后仔细阅读文本,并提取出所有与如下关注点之一相关的部分。关注点列表及其解释如下:
+get_info_system = '''你将被给到一段使用<text></text>标签包裹的网页文本,你的任务是从前到后仔细阅读文本,并摘抄与如下关注点相关的原文片段。关注点及其解释如下:
 
 {focus_statement}\n
 在进行提取时,请遵循以下原则:
-- 理解每个关注点的含义以及进一步的解释(如有),确保提取的内容与关注点强相关并符合解释(如有)的范围
-- 有必要的话,可以连同相关的上下文一并提取,从而保证提取出的内容信息完备、意思完整'''
+- 理解关注点的含义以及进一步的解释(如有),确保提取的内容与关注点强相关并符合解释(如有)的范围
+- 在满足上面原则的前提下,摘抄出全部相关片段
+- 摘抄出的原文片段务必保持原文原样,包括标点符号都不要更改,尤其注意保留类似"[3]"这样的引用标记'''
 
-get_info_suffix = '''如果网页文本中包含关注点相关的部分,请按照以下json格式输出:
-"""{"focus": 关注点, "content": 提取的内容}"""
-如果有多个相关部分,请逐条输出,每一条都用三引号包裹,三引号内不要有其他内容。'''
+get_info_suffix = '''请将摘抄出的原文片段用"//"分隔,并整体用三引号包裹后输出。三引号内不要有其他内容,如果文本中不包含任何与关注点相关的内容则保持三引号内为空。
+如下是输出格式示例::
+"""
+原文片段1
+//
+原文片段2
+//
+...
+"""'''
 
 text_info_system = '''你将被给到一段使用<text></text>标签包裹的网页文本,请分别按如下关注点对网页文本提炼摘要。关注点列表及其解释如下:
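The reworked get_info_suffix asks the model for a single triple-quoted block with '//' between excerpts, which is exactly what the updated test harness parses with re.findall and split('//'). A small standalone sketch of that parsing follows; the reply string is a made-up example, while the regex and the split mirror the test code in this commit.

import re

# Hypothetical raw model reply following the new get_info_suffix format
llm_reply = '回答如下:\n"""\n原文片段1\n//\n原文片段2\n"""'

blocks = re.findall(r'\"\"\"(.*?)\"\"\"', llm_reply, re.DOTALL)  # same pattern the test script uses
segments = []
if blocks:
    # keep only non-empty excerpts from the last triple-quoted block
    segments = [s.strip() for s in blocks[-1].split('//') if s.strip()]
print(segments)  # ['原文片段1', '原文片段2']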
|
Loading…
Reference in New Issue
Block a user