add test for v0.3.6

bigbrother666sh 2024-12-23 10:12:52 +08:00
parent 1416ab29c8
commit fd9d9f9a4e
10 changed files with 657 additions and 304 deletions

View File

@ -117,7 +117,6 @@ url2
return result['source'], extract_and_convert_dates(result['publish_date'])
async def get_more_related_urls(self, link_dict: dict, og_url: str) -> set[str]:
"""
if not link_dict:
return set()
self.logger.debug(f'{len(link_dict)} items to analyze')
@ -155,8 +154,6 @@ url2
self.logger.warning(f"{hallucination_urls} not in link_dict, it's model's Hallucination")
return urls & raw_urls
"""
return set()
async def get_info(self, text: str, info_pre_fix: str, link_dict: dict) -> list[dict]:
if not text:

View File

@ -5,6 +5,7 @@ export PB_API_AUTH="test@example.com|1234567890" ##your pb superuser account and
##below is optional, set as you need
#export VERBOSE="true" ##for detailed log info. If not needed, remove this item.
#export PRIMARY_MODEL="Qwen/Qwen2.5-14B-Instruct"
#export SECONDARY_MODEL="THUDM/glm-4-9b-chat"
#export SECONDARY_MODEL="Qwen/Qwen2.5-7B-Instruct"
#export VL_MODEL="OpenGVLab/InternVL2-26B"
export PROJECT_DIR="work_dir"
#export PB_API_BASE="" ##only use if your pb not run on 127.0.0.1:8090
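A minimal sketch of how these variables might be consumed on the Python side (the names match the sample above; the loader itself is illustrative, not the project's actual config code):

```python
import os

# illustrative only: read the optional overrides with the defaults suggested in the sample above
PRIMARY_MODEL = os.environ.get("PRIMARY_MODEL", "Qwen/Qwen2.5-14B-Instruct")
SECONDARY_MODEL = os.environ.get("SECONDARY_MODEL", "Qwen/Qwen2.5-7B-Instruct")
VL_MODEL = os.environ.get("VL_MODEL", "OpenGVLab/InternVL2-26B")
PROJECT_DIR = os.environ.get("PROJECT_DIR", "work_dir")
PB_API_AUTH = os.environ.get("PB_API_AUTH", "")  # "email|password" of the pb superuser
PB_API_BASE = os.environ.get("PB_API_BASE", "http://127.0.0.1:8090")
VERBOSE = os.environ.get("VERBOSE", "").lower() in ("true", "1")

os.makedirs(PROJECT_DIR, exist_ok=True)
```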

View File

@ -4,6 +4,6 @@ download https://github.com/pocketbase/pocketbase/releases/download/v0.23.4/
cd pb
xattr -d com.apple.quarantine pocketbase # for macOS
./pocketbase migrate up # for first run
./pocketbase --dev admin create test@example.com 1234567890 # If you don't have an initial account, please use this command to create it
./pocketbase --dev superuser create "test@example.com" "1234567890" # If you don't have an initial account, please use this command to create it
./pocketbase serve
```
```

View File

@ -1 +1,12 @@
![alt text](image.png)
| Model | Prompt language | Missed text | Ignored instructions | Recognition errors | Hallucinations | Total | Rating |
|------|----------|------|------------|----------|------|------|------|
| Qwen/Qwen2-VL-72B-Instruct | cn prompt | 2 | 1 | 3 | 0 | 6 | |
| | en prompt | 2 | 1 | 1 | 0 | 4 | 👍 |
| OpenGVLab/InternVL2-26B | cn prompt | 1 | 0 | 2 | 0 | 3 | 👍👍 |
| | en prompt | 0 | 2 | 3 | 0 | 5 | |
| Pro/Qwen/Qwen2-VL-7B-Instruct | cn prompt | 1 | 1 | 2 | 1 | 5 | |
| | en prompt | 0 | 2 | 3 | 0 | 5 | |
| Pro/OpenGVLab/InternVL2-8B | cn prompt | 3 | 2 | 2 | 0 | 7 | |
| | en prompt | 2 | 2 | 4 | 1 | 9 | |
| deepseek-ai/deepseek-vl2 | cn prompt | 1 | 1 | 1 | 1 | 4 | 👍 |
| | en prompt | 3 | 0 | 1 | 4 | 8 | |
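For reference, the Total column is simply the sum of the four error counts in each row (e.g. 2 + 1 + 3 + 0 = 6 for the first row; it matches every row above), with lower being better. A tiny helper that reproduces that scoring rule:

```python
def total_score(missed: int, ignored_instructions: int, recognition_errors: int, hallucinations: int) -> int:
    # lower is better: the table sums the four error counts per model/prompt combination
    return missed + ignored_instructions + recognition_errors + hallucinations

assert total_score(2, 1, 3, 0) == 6  # Qwen/Qwen2-VL-72B-Instruct, cn prompt
assert total_score(1, 0, 2, 0) == 3  # OpenGVLab/InternVL2-26B, cn prompt
```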

View File

@ -9,10 +9,10 @@ from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingCont
from datetime import timedelta
sites = ["https://cryptopanic.com/news/"]
sites = ["https://www.gd121.cn/zx/qxzx/list.shtml",
]
os.environ['CRAWLEE_STORAGE_DIR'] = 'test/webpage_samples/crawlee_storage'
os.environ['CRAWLEE_STORAGE_DIR'] = 'webpage_samples/crawlee_storage'
save_dir = 'webpage_samples'
async def main(sites: list):
@ -25,80 +25,340 @@ async def main(sites: list):
@crawler.pre_navigation_hook
async def log_navigation_url(context: PlaywrightPreNavigationContext) -> None:
context.log.info(f'navigeting {context.request.url} ...')
context.log.info(f'navigating {context.request.url} ...')
@crawler.router.default_handler
async def request_handler(context: PlaywrightCrawlingContext) -> None:
# context.log.info(f'Processing {context.request.url} ...')
# Handle dialogs (alerts, confirms, prompts)
await context.page.wait_for_load_state('networkidle')
await context.page.wait_for_timeout(2000)
# Handle dialogs (alerts, confirms, prompts)
async def handle_dialog(dialog):
context.log.info(f'Closing dialog: {dialog.message}')
await dialog.accept()
context.page.on('dialog', handle_dialog)
# try to find and click an "Accept" button
button_texts = ['Accept', 'Allow', 'Close']
button_selectors = ['.close-btn', '.accept-button', '.allow-button']
# wait for any popup to appear and try to close it
for text in button_texts:
try:
context.log.info(f'waiting for button "{text}" to be visible...')
await context.page.wait_for_selector(f'button:text("{text}")', state='visible', timeout=5000) # wait up to 5 seconds
await context.page.locator(f'button:text("{text}")').click()
context.log.info(f'clicked button: {text}')
await context.page.wait_for_timeout(1000)
except Exception as e:
context.log.error(f'failed to click button: {text}, error: {e}')
for selector in button_selectors:
try:
context.log.info(f'waiting for selector "{selector}" to be visible...')
await context.page.wait_for_selector(selector, state='visible', timeout=5000) # wait up to 5 seconds
await context.page.locator(selector).click()
context.log.info(f'clicked selector: {selector}')
await context.page.wait_for_timeout(1000)
except Exception as e:
context.log.error(f'failed to click selector: {selector}, error: {e}')
context.log.info('successfully finish fetching')
folder = os.path.join(save_dir, f"{hashlib.sha256(context.request.url.encode()).hexdigest()[-6:]}")
os.makedirs(folder, exist_ok=True)
html = await context.page.inner_html('head')
soup = BeautifulSoup(html, 'html.parser')
web_title = soup.find('title')
if web_title:
web_title = web_title.get_text().strip()
else:
web_title = ''
base_tag = soup.find('base', href=True)
if base_tag and base_tag.get('href'):
base_url = base_tag['href']
else:
# if no base tag, use the current url as base url
parsed_url = urlparse(context.request.url)
domain = parsed_url.netloc
base_url = f"{parsed_url.scheme}://{domain}"
html = await context.page.inner_html('body')
context.log.info('successfully finish fetching')
existing_urls = set()
parsed_url = urlparse(context.request.url)
domain = parsed_url.netloc
text = await context.page.inner_text('body')
with open(os.path.join(folder, 'text.txt'), 'w') as f:
f.write(text)
# to use a custom scraper here
soup = BeautifulSoup(html, 'html.parser')
links = soup.find_all('a', href=True)
base_url = f"{parsed_url.scheme}://{domain}"
# remove common elements such as navigation, header, and footer
for selector in ['div#nav', 'div.header', 'div#footer', 'nav', 'header', 'footer']:
elements = soup.select(selector)
for element in elements:
element.decompose()
action_dict = {}
for form in soup.find_all('form', recursive=True):
form_dict = {}
for input_elem in form.find_all('input'):
input_type = input_elem.get('type', 'text')
input_name = input_elem.get('name', f'input_{len(action_dict)}')
input_value = ' '.join([f"{k}={v}" for k, v in input_elem.attrs.items() if k not in ['type', 'name', 'form']])
input_dict = {
"type": input_type,
"values": [input_value] if input_value else []
}
# handle datalist
if input_elem.get('list'):
datalist = soup.find('datalist', id=input_elem['list'])
if datalist:
options = [opt.get('value', opt.text.strip()) for opt in datalist.find_all('option')]
input_dict = {
"type": "text",
"values": [f"one of followings: {options}"]
}
form_dict[input_name] = input_dict
for select in form.find_all('select'):
select_name = select.get('name', f'select_{len(form_dict)}')
options = [opt.get('value', opt.text.strip()) for opt in select.find_all('option')]
form_dict[select_name] = {
"type": "select",
"values": options
}
for textarea in form.find_all('textarea'):
textarea_name = textarea.get('name', f'textarea_{len(form_dict)}')
form_dict[textarea_name] = {
"type": "textarea",
"values": [textarea.text.strip()]
}
if form_dict:
form_id = form.get('id', f'form_{len(action_dict)}')
action_dict[form_id] = form_dict
form.decompose()
# handle input elements that are not in any form
for input_elem in soup.find_all('input', recursive=True):
if input_elem.find_parent('form') is None:
# check if the input is associated with a form by form attribute
form_ids = input_elem.get('form', '').split()
# handle input element
input_type = input_elem.get('type', 'text')
input_name = input_elem.get('name', f'input_{len(action_dict)}')
input_value = ' '.join([f"{k}={v}" for k, v in input_elem.attrs.items() if k not in ['type', 'name', 'form']])
input_dict = {
"type": input_type,
"values": [input_value] if input_value else []
}
# handle datalist
if input_elem.get('list'):
datalist = soup.find('datalist', id=input_elem['list'])
if datalist:
options = [opt.get('value', opt.text.strip()) for opt in datalist.find_all('option')]
input_dict = {
"type": "text",
"values": [f"one of followings: {options}"]
}
# decide the placement of the input element based on form attribute
if form_ids:
for form_id in form_ids:
if form_id in action_dict:
action_dict[form_id][input_name] = input_dict
else:
action_dict[form_id] = {input_name: input_dict}
else:
action_dict[input_name] = {"input": input_dict}
input_elem.decompose()
for button in soup.find_all(['button', 'input[type="button"]', 'input[type="submit"]'], recursive=True):
button_name = button.get('name', '') or button.get('id', '') or button.text.strip()
if not button_name:
button_name = f'button_{len(action_dict)}'
button_type = button.get('type', 'button')
button_value = button.get('value', button.text.strip())
action_dict[button_name] = {
"button": {
"type": button_type,
"values": [button_value] if button_value else []
}
}
button.decompose()
for command in soup.find_all('command', recursive=True):
command_name = command.get('name', '') or command.get('id', '') or command.text.strip()
if not command_name:
command_name = f'command_{len(action_dict)}'
command_type = command.get('type', 'command')
command_value = command.get('value', command.text.strip())
action_dict[command_name] = {
"command": {
"type": command_type,
"values": [command_value] if command_value else []
}
}
command.decompose()
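# At this point action_dict maps each form id (or standalone control name) to its controls,
# each control being a {"type": ..., "values": [...]} entry covering inputs, selects,
# textareas, buttons and commands; it is written out to action_dict.json further below.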
link_dict = {}
for a in links:
new_url = a.get('href')
if new_url.startswith('javascript:') or new_url.startswith('#') or new_url.startswith('mailto:'):
continue
if new_url in [context.request.url, base_url]:
continue
if new_url in existing_urls:
continue
t = a.text.strip()
if new_url and t:
link_dict[t] = urljoin(base_url, new_url)
existing_urls.add(new_url)
for img in soup.find_all('img', src=True, recursive=True):
src = img.get('src')
if src.startswith('#') or src.startswith('about:blank'):
src = None
text = img.get('alt', '').strip()
if src:
if not src.startswith(('http://', 'https://')):
src = urljoin(base_url, src)
key = f"url{len(link_dict)}"
link_dict[key] = src
text = f"{text}<img>[{key}]"
# find all area urls related to this img
area_urls = set()
if img.get('usemap'):
# remove the # at the beginning of the map name
map_name = img.get('usemap').lstrip('#')
# find the map tag
map_tag = soup.find('map', {'name': map_name})
if map_tag:
# get all area tags under the map
for area in map_tag.find_all('area', href=True):
area_href = area.get('href')
if area_href.startswith('javascript:') or area_href.startswith('#') or area_href.startswith('mailto:') or area_href.startswith('data:') or area_href.startswith('about:blank'):
area_href = None
if area_href:
if not area_href.startswith(('http://', 'https://')):
area_href = urljoin(base_url, area_href)
area_urls.add(area_href)
area.decompose()
# delete the whole map tag
map_tag.decompose()
for area_url in area_urls:
if area_url in [context.request.url, base_url]:
continue
key = f"url{len(link_dict)}"
link_dict[key] = area_url
text = f"{text}[{key}]"
img.replace_with(f"-{text}")
for media in soup.find_all(['video', 'audio', 'source', 'embed', 'iframe', 'figure'], src=True, recursive=True):
src = media.get('src')
if src.startswith('javascript:') or src.startswith('#') or src.startswith('mailto:') or src.startswith('data:') or src.startswith('about:blank'):
src = None
text = media.get('alt', '').strip() or media.get_text().strip()
if src:
# convert relative path to full url
if not src.startswith(('http://', 'https://')):
src = urljoin(context.request.url, src)
key = f"url{len(link_dict)}"
link_dict[key] = src
ext = os.path.splitext(src)[1].lstrip('.') or media.name
text = f"{text}<{ext}>[{key}]"
media.replace_with(f"-{text}")
for obj in soup.find_all('object', data=True, recursive=True):
data = obj.get('data')
if data.startswith('javascript:') or data.startswith('#') or data.startswith('mailto:') or data.startswith('data:') or data.startswith('about:blank'):
data = None
text = obj.get('title', '').strip() or obj.get_text().strip()
if data:
# convert relative path to full url
if not data.startswith(('http://', 'https://')):
data = urljoin(context.request.url, data)
key = f"url{len(link_dict)}"
link_dict[key] = data
ext = os.path.splitext(data)[1].lstrip('.') or 'object'
text = f"{text}<{ext}>[{key}]"
obj.replace_with(f"-{text}")
# process links at last, so that we can keep the image and media info in the link
for a in soup.find_all('a', href=True, recursive=True):
href = a.get('href')
if href.startswith('javascript:') or href.startswith('#') or href.startswith('mailto:') or href.startswith('data:') or href.startswith('about:blank'):
href = None
if href:
text = a.get_text().strip() or '-'
if not href.startswith(('http://', 'https://')):
href = urljoin(context.request.url, href)
if href in [context.request.url, base_url]:
continue
key = f"url{len(link_dict)}"
link_dict[key] = href
a.replace_with(f"{text}[{key}]")
# handle headings
for i in range(1, 7): # h1 to h6
for heading in soup.find_all(f'h{i}', recursive=False):
text = heading.get_text().strip()
heading.replace_with(f"{'#' * i} {text}\n")
# replace all <br> and <br/> tags with newlines
for br in soup.find_all(['br', 'br/', 'br /', 'hr', 'hr/', 'hr /', 'wbr'], recursive=True):
br.replace_with('\n')
# handle lists
for list_tag in soup.find_all(['ul', 'ol'], recursive=True):
list_text = []
for idx, item in enumerate(list_tag.find_all('li')):
list_text.append(f"{idx + 1}. {item.get_text().strip()}")
list_text = '\t'.join(list_text)
list_tag.replace_with(f"{list_text}\n")
# handle spans - merge span text with surrounding text
for span in soup.find_all('span', recursive=True):
span.replace_with(span.get_text().strip())
# handle strikethrough text
for del_tag in soup.find_all(['del', 's'], recursive=True):
del_text = del_tag.get_text().strip()
if del_text:
del_tag.replace_with(f"{del_text}(maybe_outdated)")
else:
del_tag.decompose()
# handle tables
for table in soup.find_all('table', recursive=True):
table_text = []
# handle caption
caption = table.find('caption')
if caption:
table_text.append(caption.get_text().strip())
# get headers
headers = []
for th in table.find_all('th'):
headers.append(th.get_text().strip())
# handle all rows (including tbody and tfoot)
for row in table.find_all('tr'):
# get the first cell value
# try to find th as first_val
first_cell = row.find(['th', 'td'])
if not first_cell:
continue
first_val = first_cell.get_text().strip()
cells = row.find_all('td')
if not cells:
continue
# handle remaining cells
for idx, cell in enumerate(cells):
cell_text = cell.get_text().strip()
if not cell_text or cell_text == first_val:
continue
header_text = headers[idx] if idx < len(headers) else ''
cell_str = f"{first_val}-{header_text}-{cell_text}"
table_text.append(cell_str)
# replace the table with the processed text
table_text = '\n'.join(table_text)
table.replace_with(f"\n{table_text}\n")
html_text = soup.get_text(strip=False, separator='\n')
with open(os.path.join(folder, 'text.txt'), 'w') as f:
f.write(html_text)
with open(os.path.join(folder, 'link_dict.json'), 'w', encoding='utf-8') as f:
json.dump(link_dict, f, indent=4, ensure_ascii=False)
with open(os.path.join(folder, 'action_dict.json'), 'w', encoding='utf-8') as f:
json.dump(action_dict, f, indent=4, ensure_ascii=False)
links_number_from_html = len(link_dict)
print(f"links number from html: {links_number_from_html}")
screenshot_file = os.path.join(folder, 'screenshot.jpg')
await context.page.screenshot(path=screenshot_file, full_page=True)
# screenshot_file = os.path.join(folder, 'screenshot.jpg')
# await context.page.screenshot(path=screenshot_file, full_page=True)
await crawler.run(sites)
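The handler above rewrites images, media and links in the body into placeholders such as `<img>[url12]` and stores their targets in link_dict.json, while the flattened page text goes to text.txt. A minimal sketch of resolving those tags back to absolute URLs downstream (the file names match what the handler writes; the helper itself is illustrative):

```python
import json
import os
import re

def resolve_url_tags(folder: str) -> dict[str, str]:
    """Map every [urlN] tag appearing in text.txt back to the absolute URL stored in link_dict.json."""
    with open(os.path.join(folder, 'link_dict.json'), encoding='utf-8') as f:
        link_dict = json.load(f)
    with open(os.path.join(folder, 'text.txt'), encoding='utf-8') as f:
        text = f.read()
    return {tag: link_dict[tag] for tag in re.findall(r'\[(url\d+)]', text) if tag in link_dict}
```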

View File

@ -0,0 +1,76 @@
# -*- coding: utf-8 -*-
import os, re
import json
import time
sample_dir = 'webpage_samples'
list_judge_threshold = 0.007
valid_list_min_length = 10
min_content_length = 420
common_file_exts = [
'jpg', 'jpeg', 'png', 'gif', 'pdf', 'doc', 'docx', 'svg', 'm3u8',
'mp4', 'mp3', 'wav', 'avi', 'mov', 'wmv', 'flv', 'webp', 'webm',
'zip', 'rar', '7z', 'tar', 'gz', 'bz2',
'txt', 'csv', 'xls', 'xlsx', 'ppt', 'pptx',
'json', 'xml', 'yaml', 'yml', 'css', 'js', 'php', 'asp', 'jsp'
]
common_tlds = [
'.com', '.cn', '.net', '.org', '.edu', '.gov', '.io', '.co',
'.info', '.biz', '.me', '.tv', '.cc', '.xyz', '.app', '.dev',
'.cloud', '.ai', '.tech', '.online', '.store', '.shop', '.site',
'.top', '.vip', '.pro', '.ltd', '.group', '.team', '.work'
]
def find_article_or_list(link_dict, text) -> tuple[bool, bool, str]:
lines = [l.strip() for l in text.split('\n') if l.strip()]
text = '\n'.join(lines)
text_no_tags = re.sub(r'<\w{1,5}>', '', text)
text_no_urls = re.sub(r'\[url\d+]', '', text_no_tags)
content_length = len(text_no_urls)
valid_url_count = 0
for url in link_dict.values():
url_lower = url.lower()
has_common_ext = any(url_lower.endswith(ext) for ext in common_file_exts)
has_common_tld = any(url_lower.endswith(tld) for tld in common_tlds)
if not has_common_ext and not has_common_tld:
valid_url_count += 1
valid_url_rate = valid_url_count / content_length if content_length else 0
is_list = valid_url_rate > list_judge_threshold and valid_url_count > valid_list_min_length
need_more_info = content_length < min_content_length
return is_list, need_more_info, text
if __name__ == '__main__':
dirs = os.listdir(sample_dir)
for _dir in dirs:
if not _dir.startswith('task'):
continue
_path = os.path.join(sample_dir, _dir)
if not os.path.isdir(_path):
continue
samples = os.listdir(_path)
time_stamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())
record_file = os.path.join(_path, f'article_or_list_judge.txt')
for sample in samples:
if not os.path.isdir(os.path.join(_path, sample)):
continue
files = os.listdir(os.path.join(_path, sample))
if 'link_dict.json' not in files or 'text.txt' not in files:
print(f'{sample} files not complete, skip')
continue
link_dict = json.load(open(os.path.join(_path, sample, 'link_dict.json'), 'r'))
text = open(os.path.join(_path, sample, 'text.txt'), 'r').read()
is_list, need_more_info, text = find_article_or_list(link_dict, text)
with open(record_file, 'a') as f:
f.write(f"raw materials: {sample}\n\n")
f.write(f"cleaned text: \n{text}\n\n")
f.write("list\n" if is_list else "article\n")
f.write("need more info\n" if need_more_info else "no need more info\n")
f.write("*" * 12)
f.write('\n\n')
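A quick, self-contained illustration of the heuristic above: a short, link-dense page is flagged as a list that also needs more info, while a long article with only navigational links is not (the inputs are made up purely to exercise the thresholds defined at the top of the script):

```python
from find_article_or_list import find_article_or_list  # the module shown above

# link-heavy, little text: expected (True, True)
list_links = {f"headline {i}": f"https://example.com/news/{i}.html" for i in range(20)}
list_text = "\n".join(f"headline {i} [url{i}]" for i in range(20))
print(find_article_or_list(list_links, list_text)[:2])

# long text, only a TLD-style link: expected (False, False)
article_links = {"home": "https://example.com"}
article_text = "some paragraph text " * 50
print(find_article_or_list(article_links, article_text)[:2])
```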

View File

@ -3,113 +3,140 @@
import os, re
import json
import asyncio
import time, base64
from info_test_prompts import *
import time
from prompts import *
import json_repair
from llms.openai_wrapper import openai_llm as llm
from openai_wrapper import openai_llm as llm
from find_article_or_list import find_article_or_list, common_tlds, common_file_exts
sample_dir = 'test/webpage_samples'
sample_dir = 'webpage_samples'
models = ['deepseek-ai/DeepSeek-V2.5', 'Qwen/Qwen2.5-Coder-32B-Instruct', 'Qwen/Qwen2.5-32B-Instruct', 'Qwen/Qwen2.5-14B-Instruct', 'Qwen/Qwen2.5-Coder-7B-Instruct']
vl_models = ['Qwen/Qwen2-VL-72B-Instruct', 'OpenGVLab/InternVL2-26B', 'TeleAI/TeleMM', 'Pro/Qwen/Qwen2-VL-7B-Instruct', 'Pro/OpenGVLab/InternVL2-8B', 'OpenGVLab/InternVL2-Llama3-76B']
secondary_model = 'Qwen/Qwen2.5-7B-Instruct'
vl_model = ''
async def generate_results(text, model, system_prompt, suffix_prompt) -> set:
lines = text.split('\n')
cache = set()
text_batch = ''
for line in lines:
text_batch = f'{text_batch}\n{line}'
if len(text_batch) > 1024:
content = f'<text>\n{text_batch}\n</text>\n\n{suffix_prompt}'
result = await llm(
[{'role': 'system', 'content': system_prompt}, {'role': 'user', 'content': content}],
model=model, temperature=0.1)
print(f"llm output: {result}")
result = re.findall(r'\"\"\"(.*?)\"\"\"', result, re.DOTALL)
if not result:
print(f"warning: bad generate result")
text_batch = ''
continue
result = result[0].strip()
result = result.split('\n')
cache.update(result)
text_batch = ''
if text_batch:
content = f'<text>\n{text_batch}\n</text>\n\n{suffix_prompt}'
result = await llm(
[{'role': 'system', 'content': system_prompt}, {'role': 'user', 'content': content}],
model=model, temperature=0.1)
print(f"llm output: {result}")
result = re.findall(r'\"\"\"(.*?)\"\"\"', result, re.DOTALL)
if not result:
print(f"warning: bad generate result")
return cache
result = result[0].strip()
result = result.split('\n')
cache.update(result)
return cache
async def extract_info_from_img(text, link_dict) -> str:
cache = {}
pattern = r'<img>\[url\d+\]'
matches = re.findall(pattern, text)
for match in matches:
key = match.split('[url')[1][:-1]
url = link_dict.get(f'url{key}', '')
if not url:
continue
if url in cache:
replace_text = cache[url]
else:
if any(url.lower().endswith(tld) for tld in common_tlds):
continue
if any(url.lower().endswith(ext) for ext in common_file_exts if ext not in ['jpg', 'jpeg', 'png']):
continue
llm_output = await llm([{"role": "user",
"content": [{"type": "image_url", "image_url": {"url": url, "detail": "high"}},
{"type": "text", "text": image_system}]}], model='OpenGVLab/InternVL2-26B')
print(f"vl model output: \n{llm_output}\n")
replace_text = llm_output
cache[url] = replace_text
text = text.replace(match, f'{replace_text}{match}', 1)
return text
async def main(link_dict, text, record_file, prompts):
is_list, need_more_info, text = find_article_or_list(link_dict, text)
if is_list:
print("may be a article list page, get more urls ...")
system_prompt = prompts[1]
suffix_prompt = text_link_suffix
else:
if need_more_info:
print("may be a article page need to get more text from images...")
text = await extract_info_from_img(text, link_dict)
print(f"extended text: \n{text}\n")
system_prompt = prompts[0]
suffix_prompt = text_info_suffix
async def main(link_dict, text, screenshot_file, record_file, prompts):
for model in models:
print(f"running {model} ...")
start_time = time.time()
hallucination_times = 0
# got more links from text
# more_urls = set()
more_url_text = set()
content = ''
for key in link_dict.keys():
content = f"{content}{key}\n"
if len(content) > 512:
result = await llm([{'role': 'system', 'content': prompts[1]},
{'role': 'user', 'content': f'<text>\n{content}\n</text>\n\n{text_link_suffix}'}],
model=model, temperature=0.1)
print(f"llm output: {result}")
result = re.findall(r'"""(.*?)"""', result, re.DOTALL)
if result:
result = result[0].strip()
result = result.split('\n')
# more_urls.update({link_dict[t] for t in result if t in link_dict})
more_url_text.update({f"{t}: {link_dict[t]}" for t in result if t in link_dict})
else:
hallucination_times += len(result) - len({t for t in result if t in link_dict})
content = ''
if content:
result = await llm([{'role': 'system', 'content': prompts[1]},
{'role': 'user', 'content': f'<text>\n{content}\n</text>\n\n{text_link_suffix}'}],
model=model, temperature=0.1)
print(f"llm output: {result}")
result = re.findall(r'"""(.*?)"""', result, re.DOTALL)
if result:
result = result[0].strip()
result = result.split('\n')
# more_urls.update({link_dict[t] for t in result if t in link_dict})
more_url_text.update({f"{t}: {link_dict[t]}" for t in result if t in link_dict})
else:
hallucination_times += len(result) - len({t for t in result if t in link_dict})
more_url_text = '\n'.join(more_url_text)
print(f"time spent: {time.time() - start_time}")
# get infos from text
infos = []
lines = text.split('\n')
cache = ''
for line in lines:
cache = f'{cache}{line}'
if len(cache) > 2048:
content = f'<text>\n{cache}\n</text>\n\n{text_info_suffix}'
result = await llm(
[{'role': 'system', 'content': prompts[0]}, {'role': 'user', 'content': content}],
model=model, temperature=0.1, response_format={"type": "json_object"})
print(f"llm output: {result}")
cache = ''
if not result:
hallucination_times += 1
continue
result = json_repair.repair_json(result, return_objects=True)
if not isinstance(result, list):
hallucination_times += 1
continue
if not result:
hallucination_times += 1
continue
infos.extend(result)
if cache:
content = f'<text>\n{cache}\n</text>\n\n{text_info_suffix}'
result = await llm([{'role': 'system', 'content': prompts[0]}, {'role': 'user', 'content': content}],
model=model, temperature=0.1, response_format={"type": "json_object"})
print(f"llm output: {result}")
if not result:
hallucination_times += 1
result = json_repair.repair_json(result, return_objects=True)
if not isinstance(result, list):
hallucination_times += 1
if not result:
hallucination_times += 1
infos.extend(result)
final_infos = []
for item in infos:
if 'focus' not in item or 'content' not in item:
hallucination_times += 1
continue
if not item['content']:
hallucination_times += 1
continue
if item['content'] in link_dict:
continue
raw_result = await generate_results(text, model, system_prompt, suffix_prompt)
final_result = set()
for item in raw_result:
if is_list:
if '[url' not in item:
hallucination_times += 1
continue
# extract the url tag inside [] from the item
url_tag = re.search(r'\[(.*?)]', item).group(1)
if url_tag not in link_dict:
hallucination_times += 1
continue
result_url = link_dict[url_tag]
if any(result_url.lower().endswith(tld) for tld in common_tlds):
continue
if any(result_url.lower().endswith(ext) for ext in common_file_exts):
continue
final_result.add(item)
else:
result = json_repair.repair_json(item, return_objects=True)
if not isinstance(result, dict):
hallucination_times += 1
continue
if not result:
hallucination_times += 1
continue
if 'focus' not in result or 'content' not in result:
hallucination_times += 1
continue
if not result['content'].strip() or not result['focus'].strip():
hallucination_times += 1
continue
if result['focus'].startswith('#'):
result['focus'] = result['focus'][1:]
final_result.add(result)
final_infos.append(f"{item['focus']}: {item['content']}")
final_infos = '\n'.join(final_infos)
print(f"time spent: {time.time() - start_time}")
final_infos = '\n'.join(final_result)
# get author and publish date from text
if len(text) > 1024:
@ -142,7 +169,7 @@ async def main(link_dict, text, screenshot_file, record_file, prompts):
f.write(f"total analysis time: {total_analysis_time}\n\n")
f.write(f"author and publish time(not formated): {ap_}\n")
f.write(f"infos(not formated): \n{final_infos}\n")
f.write(f"more urls: \n{more_url_text}\n\n")
#f.write(f"more urls: \n{more_url_text}\n\n")
f.write("*" * 12)
f.write('\n\n')
@ -150,7 +177,7 @@ async def main(link_dict, text, screenshot_file, record_file, prompts):
if __name__ == '__main__':
dirs = os.listdir(sample_dir)
for _dir in dirs:
if not _dir.startswith('task'):
if not _dir.startswith('task0'):
continue
_path = os.path.join(sample_dir, _dir)
if not os.path.isdir(_path):
@ -168,11 +195,8 @@ if __name__ == '__main__':
focus_statement = f"{focus_statement}解释:{expl}\n"
print(f'start testing {_dir}')
print(f"focus statement: {focus_statement}")
get_info_system = text_info_system.replace('{focus_statement}', focus_statement)
get_link_system = text_link_system.replace('{focus_statement}', focus_statement)
#get_info_system = image_info_system.replace('{focus_statement}', focus_statement)
#get_link_system = image_link_system.replace('{focus_statement}', focus_statement)
prompts = [get_info_system, get_link_system]
samples = os.listdir(_path)
@ -184,130 +208,11 @@ if __name__ == '__main__':
if not os.path.isdir(os.path.join(_path, sample)):
continue
files = os.listdir(os.path.join(_path, sample))
if 'link_dict.json' not in files or 'text.txt' not in files or 'screenshot.jpg' not in files:
if 'link_dict.json' not in files or 'text.txt' not in files:
print(f'{sample} files not complete, skip')
continue
link_dict = json.load(open(os.path.join(_path, sample, 'link_dict.json'), 'r'))
text = open(os.path.join(_path, sample, 'text.txt'), 'r').read()
screenshot_file = os.path.join(_path, sample, 'screenshot.jpg')
with open(record_file, 'a') as f:
f.write(f"raw materials: {sample}\n\n")
asyncio.run(main(link_dict, text, screenshot_file, record_file, prompts))
"""
with open(screenshot_file, "rb") as image_file:
base64_image = base64.b64encode(image_file.read()).decode('utf-8')
print(f"run {model} testing...")
start_time = time.time()
hallucination_times = 0
# get infos from image
_infos = []
llm_output = await llm([{"role": "system", "content": [{"type": "text", "text": image_info_system}]},
{"role": "user", "content": [{"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}",
"detail": "high"}},
{"type": "text", "text": image_info_suffix}]}],
model=model,
temperature=0.1)
print(f"vl model output: \n{llm_output}")
if not llm_output:
hallucination_times += 1
result = []
else:
result = json_repair.repair_json(llm_output, return_objects=True)
if not isinstance(result, list):
hallucination_times += 1
result = []
if not result:
hallucination_times += 1
_infos.extend(result)
final_infos = []
for item in _infos:
if 'focus' not in item or 'content' not in item:
hallucination_times += 1
continue
if not item['content']:
hallucination_times += 1
continue
if item['content'] in link_dict:
continue
judge = await llm([{'role': 'system', 'content': verified_system},
{'role': 'user',
'content': f'<info>\n{item["content"]}\n</info>\n\n<text>\n{text}\n</text>\n\n{verified_suffix}'}],
model="THUDM/glm-4-9b-chat", temperature=0.1)
if not judge:
print('secondary model cannot judge')
final_infos.append(item)
continue
to_save = False
for i in range(min(7, len(judge))):
char = judge[-1 - i]
if char == '':
to_save = True
break
elif char == '':
break
if not to_save:
hallucination_times += 1
continue
final_infos.append(item)
print(f"final infos from image: {final_infos}")
print(f"image hallucination times: {hallucination_times}")
print(f"time used: {time.time() - start_time}")
# get links from image
more_links = set()
llm_output = await llm([{"role": "system", "content": [{"type": "text", "text": image_link_system}]},
{"role": "user", "content": [{"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}",
"detail": "high"}},
{"type": "text", "text": image_link_suffix}]}],
model=model,
temperature=0.1)
print(f"vl model output: \n{llm_output}")
result = re.findall(r'\"\"\"(.*?)\"\"\"', llm_output, re.DOTALL)
if result:
result = result[0].strip()
else:
hallucination_times += 1
result = []
more_links = [link_dict[_t] for _t in result if _t in link_dict]
print(f"more urls by image: {more_links}")
print(f"image hallucination times: {hallucination_times}")
print(f"time used: {time.time() - start_time}")
# get author and publish date from image
llm_output = await llm([{"role": "system", "content": [{"type": "text", "text": image_ap_system}]},
{"role": "user", "content": [{"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}",
"detail": "high"}},
{"type": "text", "text": image_ap_suffix}]}],
model=model,
max_tokens=50, temperature=0.1)
print(f"vl model output: \n{llm_output}")
if not llm_output:
hallucination_times += 1
ap = {}
else:
result = json_repair.repair_json(llm_output, return_objects=True)
if not isinstance(result, dict):
hallucination_times += 1
ap = {}
else:
ap = result
print(f"ap from image: {ap}")
print(f"image hallucination times: {hallucination_times}")
total_analysis_time = time.time() - start_time
print(f"image analysis finished, total time used: {total_analysis_time}")
"""
asyncio.run(main(link_dict, text, record_file, prompts))

test/openai_wrapper.py Normal file
View File

@ -0,0 +1,48 @@
import os
from openai import OpenAI
from openai import RateLimitError
import asyncio
base_url = os.environ.get('LLM_API_BASE', "")
token = os.environ.get('LLM_API_KEY', "")
if not base_url and not token:
raise ValueError("LLM_API_BASE or LLM_API_KEY must be set")
elif base_url and not token:
client = OpenAI(base_url=base_url, api_key="not_use")
elif not base_url and token:
client = OpenAI(api_key=token)
else:
client = OpenAI(api_key=token, base_url=base_url)
llm_lock = asyncio.Lock()
async def openai_llm(messages: list, model: str, logger=None, **kwargs) -> str:
if logger:
logger.debug(f'messages:\n {messages}')
logger.debug(f'model: {model}')
logger.debug(f'kwargs:\n {kwargs}')
async with llm_lock:
try:
response = client.chat.completions.create(messages=messages, model=model, **kwargs)
except RateLimitError as e:
if logger: logger.warning(f'{e}\nRetrying in 60 seconds...')
await asyncio.sleep(60)
response = client.chat.completions.create(messages=messages, model=model, **kwargs)
# the openai client raises on transport errors, so checking choices is sufficient here
if response.choices:
return response.choices[0].message.content
else:
if logger: logger.error(f'llm error after retry: {response}')
return ""
except Exception as e:
if logger:
logger.error(f'openai_llm error: {e}')
return ''
if logger:
logger.debug(f'result:\n {response.choices[0]}')
logger.debug(f'usage:\n {response.usage}')
return response.choices[0].message.content
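A minimal usage sketch for the wrapper above (the model name is only an example; LLM_API_BASE / LLM_API_KEY must be exported first, as in the sample env file):

```python
import asyncio

from openai_wrapper import openai_llm

async def demo():
    reply = await openai_llm(
        [{"role": "user", "content": "Say hello in one short sentence."}],
        model="Qwen/Qwen2.5-7B-Instruct",  # any chat model served by the configured endpoint
        temperature=0.1,
    )
    print(reply)

asyncio.run(demo())
```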

View File

@ -1,30 +1,40 @@
text_info_system = '''作为信息提取助手,你的任务是从给定的网页文本中提取与以下用户兴趣点相关的内容。兴趣点列表及其解释如下:
text_info_system = '''作为信息提取助手,你的任务是从给定的网页文本中抽取任何与下列关注点之一相关的信息。关注点列表及其解释如下:
{focus_statement}\n
在进行信息提取时请遵循以下原则
- 理解每个兴趣点的含义确保提取的内容与之相关
- 如果兴趣点有进一步的解释确保提取的内容符合这些解释的范围
- 忠于原文你的任务是从网页文本中识别和提取与各个兴趣点相关的信息并不是总结和提炼
- 理解每个关注点的含义确保提取的内容至少与其中之一相关
- 如果关注点有进一步的解释确保提取的内容符合这些解释的范围
- 忠于原文你的任务是从网页文本中抽取相关信息而不是提炼总结和改写
- 对于最终输出的信息请保证主体时间地点等关键要素的清晰明确为此可能需要综合上下文进行提取
- 如果提取的内容中包括类似<mp4>[url1]这样的片段务必保留'''
另外请注意给定的网页文本是通过爬虫程序从html代码中提取出来的所以请忽略里面不必要的空格换行符等'''
text_info_suffix = '''请先复述一遍关注点及其解释,再对原文逐行进行分析。
如果网页文本中包含关注点相关的内容请按照以下json格式输出提取的信息
{"focus": 关注点名称, "content": 提取的内容}
text_info_suffix = '''如果上述网页文本中包含兴趣点相关的内容请按照以下json格式输出提取的信息文本中可能包含多条有用信息请不要遗漏
[{"focus": 兴趣点名称, "content": 提取的内容}]
如果有多条相关信息请按一行一条的格式输出最终输出的结果整体用三引号包裹三引号内不要有其他内容如下是输出格式示例
"""
{"focus": 关注点1名称, "content": 提取内容1}
{"focus": 关注点2名称, "content": 提取内容2}
...
"""
示例
[{"focus": "旅游景点", "content": "北京故宫地址北京市东城区景山前街4号开放时间8:30-17:00"}, {"focus": "美食推荐", "content": "来王府井小吃街必吃北京烤鸭、炸酱面"}]
如果网页文本中不包含任何相关的信息请保证三引号内为空'''
如果网页文本中不包含任何与兴趣点相关的信息请仅输出[]'''
text_link_system = '''你将被给到一段处理过的网页文本在这些文本中所有的url链接都已经被替换为类似"[url120]"这样的标签,并置于与其关联的文本后面。
你的任务是从网页文本中抽取任何与下列关注点之一相关的文本片段关注点列表及其解释如下
text_link_system = '''作为一位高效的信息筛选助手,你将被给到一组链接对应的文本,请从中挑选出跟兴趣点有关的文本。兴趣点及其解释如下:\n\n{focus_statement}\n
在进行信息提取时请遵循以下原则
{focus_statement}\n
在进行抽取时请遵循以下原则
- 理解每个兴趣点的含义确保提取的文本与之相关
- 如果兴趣点有进一步的解释确保提取的文本符合这些解释的范围'''
- 理解每个关注点的含义确保提取的内容至少与其中之一相关
- 如果关注点有进一步的解释确保提取的内容符合这些解释的范围
- 只抽取以标签类似"[url120]"这样结尾的文本片段
- 维持抽取出的文本片段的原样尤其不要遗漏其后的标签'''
text_link_suffix = '''请一步步思考,最终将挑选出的文本按一行一条的格式输出,并整体用三引号包裹,三引号内不要有其他内容,如下是输出格式示例:
text_link_suffix = '''先复述一遍关注点及其解释,再对原文逐行进行抽取,最终将挑选出的文本片段按一行一条的格式输出,并整体用三引号包裹,三引号内不要有其他内容,如下是输出格式示例:
"""
文本1
文本2
@ -57,7 +67,7 @@ image_info_suffix = '''如果网页截屏中包含兴趣点相关的内容,请
示例
[{"focus": "旅游景点", "content": "北京故宫地址北京市东城区景山前街4号开放时间8:30-17:00"}, {"focus": "美食推荐", "content": "来王府井小吃街必吃北京烤鸭、炸酱面"}]
如果截屏中不包含任何与兴趣点相关的信息请仅输出[]'''
如果截屏中不包含任何与兴趣点相关的信息或者你判断这是一个文章列表页面请仅输出[]'''
image_link_system = "作为一位高效的信息筛选助手,你的任务是根据给定的兴趣点,从给定的网页截屏中挑选出最值得关注的链接推荐给用户进一步点击查看。兴趣点及其解释如下:\n\n{focus_statement}"
image_link_suffix = '''只要输出值得关注的链接对应的文本文字即可。按一行一条的格式输出,最终输出的列表整体用三引号包裹,三引号内不要有其他内容,如下是输出格式示例:
@ -71,3 +81,5 @@ image_ap_system = "As an information extraction assistant, your task is to accur
image_ap_suffix = '''Please output the extracted information in the following JSON format:
{"source": source or article author (use "NA" if this information cannot be found), "publish_date": publication date (keep only the year, month, and day; use "NA" if this information cannot be found)}'''
image_system = "提取图片中的所有文字如果图片不包含文字或者文字很少或者你判断图片仅是网站logo、商标、图标等则输出NA。注意请仅输出提取出的文字不要输出别的任何内容。"
image_system_en = "Extract all text from the image. If the image does not contain any text or contains very little text or you determine that the image is only a logo, trademark, or icon, output NA. Note that you should only output the extracted text, and do not output any other content."

test/vl_pic_test.py Normal file
View File

@ -0,0 +1,43 @@
import asyncio
import time
from prompts import image_system, image_system_en
from openai_wrapper import openai_llm as llm
vl_models = ['Qwen/Qwen2-VL-72B-Instruct', 'OpenGVLab/InternVL2-26B', 'Pro/Qwen/Qwen2-VL-7B-Instruct', 'Pro/OpenGVLab/InternVL2-8B', 'deepseek-ai/deepseek-vl2']
pic_url_test_list = ["http://wx.qlogo.cn/mmhead/Q3auHgzwzM55VjAUib4ibtDJzRJYl2Cn7gptSxwhmyyvdBwkS9SwUQtQ/0",
"http://mmbiz.qpic.cn/mmbiz_png/Oe1ibnzkdE2PQc84CVcyiaW9Cw7KssCq2dGXrHsRxscWHySXrTkaLBJ5Jw7ztaRE9d3l5yayXfDAYmDXRFuqyLAA/0?wx_fmt=png",
"http://mmbiz.qpic.cn/mmbiz_jpg/DhKPeHFI5HhgEgSGl8CMdNgo3dovxjhnCKLukmF18OtpHDE9IcwlyNT0xTQ28oFrfa4tDW4yQSicOpFY3SNCd5w/0?wx_fmt=jpeg",
"http://mmbiz.qpic.cn/mmbiz_png/CM7KBM0HLAiaj8f0bEAIa9EfPtI8Kd374zjaiaRTiaz8z2CMyJZDtnaAekuK4bEBllicqiclPUh87SeeAcfEvpUWgYA/0?wx_fmt=png",
"http://wx.qlogo.cn/mmhead/Q3auHgzwzM4Rq0U14VV5UicYPnWw8I9eZ8g6TJ2ltAROQcBxbsxwVKg/0",
"http://mmbiz.qpic.cn/sz_mmbiz_png/Bcdib1U6AjwVmSic6l8qbibZfvensdLfcjmNlpz8wjm3cgwJibwXaAgzuGU7vYXDnsJ3fbgOUFHtNQH4iaBGBm43iccg/0?wx_fmt=png",
"https://mmbiz.qpic.cn/mmbiz_png/fRg3eJSOt2ur70INyK0A4etnkPmZnicOhKcT07w4keGiahyh7RbMgwATwNTUxjVypeKkd6C9syHmwE1WFIrXedcg/640?wxfrom=12&tp=wxpic&usePicPrefetch=1&wx_fmt=png&amp;from=appmsg",
"https://img.36krcdn.com/hsossms/20241221/v2_40c28bcceafc4905b8612d6dce7a6a2a@000000_oswg116731oswg1280oswg545_img_000?x-oss-process=image/resize,m_mfit,w_600,h_400,limit_0/crop,w_600,h_400,g_center",
"http://mmbiz.qpic.cn/mmbiz_png/K85bvE9rzFOgDvibAsz4S0sZqv4O8spfH2mhvOMWicLDRMib7xiaWTMhGnAmXK7qoxQafrSw4XH0r88XbJ6aVAydqw/300?wx_fmt=png",
"https://bootcdn.xuexi.cn/18600410326/bd19863611993ad460d1c23fa910fc00.png",
"https://bootcdn.xuexi.cn/18600410326/69830c9e173b5374aa9b6de43a912e4d.png",
"https://bootcdn.xuexi.cn/18600410326/0458c43bba70d60ca77d6f158835dd6c.png",
"https://bootcdn.xuexi.cn/18600410326/1398b93f1f4273536e56e8899ad46d17.png",
"https://bootcdn.xuexi.cn/18600410326/963274d57bd3c27e3c262984887c9e48.png",
]
async def extract_text_from_url(url):
for model in vl_models:
print(f"running {model} ...\n")
start_time = time.time()
llm_output = await llm([{"role": "user",
"content": [{"type": "image_url", "image_url": {"url": url, "detail": "high"}},
{"type": "text", "text": image_system}]}], model=model)
print(f"cn prompt output: \n{llm_output}\n")
print(f"time spent: {time.time() - start_time}\n")
start_time = time.time()
llm_output = await llm([{"role": "user",
"content": [{"type": "image_url", "image_url": {"url": url, "detail": "high"}},
{"type": "text", "text": image_system_en}]}], model=model)
print(f"en prompt output: \n{llm_output}\n")
print(f"time spent: {time.time() - start_time}\n")
if __name__ == '__main__':
for url in pic_url_test_list:
print(f"testing {url} ...\n")
asyncio.run(extract_text_from_url(url))