diff --git a/core/agents/get_info.py b/core/agents/get_info.py index 609d6b7..96a87fe 100644 --- a/core/agents/get_info.py +++ b/core/agents/get_info.py @@ -117,7 +117,6 @@ url2 return result['source'], extract_and_convert_dates(result['publish_date']) async def get_more_related_urls(self, link_dict: dict, og_url: str) -> set[str]: - """ if not link_dict: return set() self.logger.debug(f'{len(link_dict)} items to analyze') @@ -155,8 +154,6 @@ url2 self.logger.warning(f"{hallucination_urls} not in link_dict, it's model's Hallucination") return urls & raw_urls - """ - return set() async def get_info(self, text: str, info_pre_fix: str, link_dict: dict) -> list[dict]: if not text: diff --git a/env_sample b/env_sample index 23313d8..40c6801 100755 --- a/env_sample +++ b/env_sample @@ -5,6 +5,7 @@ export PB_API_AUTH="test@example.com|1234567890" ##your pb superuser account and ##belowing is optional, go as you need #export VERBOSE="true" ##for detail log info. If not need, remove this item. #export PRIMARY_MODEL="Qwen/Qwen2.5-14B-Instruct" -#export SECONDARY_MODEL="THUDM/glm-4-9b-chat" +#export SECONDARY_MODEL="Qwen/Qwen2.5-7B-Instruct" +#export VL_MODEL="OpenGVLab/InternVL2-26B" export PROJECT_DIR="work_dir" #export PB_API_BASE="" ##only use if your pb not run on 127.0.0.1:8090 \ No newline at end of file diff --git a/pb/README.md b/pb/README.md index 4cdf40c..0e84fc8 100755 --- a/pb/README.md +++ b/pb/README.md @@ -4,6 +4,6 @@ download https://github.com/pocketbase/pocketbase/releases/download/v0.23.4/ cd pb xattr -d com.apple.quarantine pocketbase # for Macos ./pocketbase migrate up # for first run -./pocketbase --dev admin create test@example.com 1234567890 # If you don't have an initial account, please use this command to create it +./pocketbase --dev superuser create "test@example.com" "1234567890" # If you don't have an initial account, please use this command to create it ./pocketbase serve -``` \ No newline at end of file +``` diff --git a/test/README.md b/test/README.md index c9ee143..4e2166b 100644 --- a/test/README.md +++ b/test/README.md @@ -1 +1,12 @@ -![alt text](image.png) \ No newline at end of file +| 模型 | 提示语言 | 漏字 | 不遵守指令 | 识别错误 | 幻觉 | 总分 | 评价 | +|------|----------|------|------------|----------|------|------|------| +| Qwen/Qwen2-VL-72B-Instruct | cn prompt | 2 | 1 | 3 | 0 | 6 | | +| | en prompt | 2 | 1 | 1 | 0 | 4 | 👍 | +| OpenGVLab/InternVL2-26B | cn prompt | 1 | 0 | 2 | 0 | 3 | 👍👍 | +| | en prompt | 0 | 2 | 3 | 0 | 5 | | +| Pro/Qwen/Qwen2-VL-7B-Instruct | cn prompt | 1 | 1 | 2 | 1 | 5 | | +| | en prompt | 0 | 2 | 3 | 0 | 5 | | +| Pro/OpenGVLab/InternVL2-8B | cn prompt | 3 | 2 | 2 | 0 | 7 | | +| | en prompt | 2 | 2 | 4 | 1 | 9 | | +| deepseek-ai/deepseek-vl2 | cn prompt | 1 | 1 | 1 | 1 | 4 | 👍 | +| | en prompt | 3 | 0 | 1 | 4 | 8 | | \ No newline at end of file diff --git a/test/fetching_for_sample.py b/test/fetching_for_sample.py index 71442ae..04cbdec 100644 --- a/test/fetching_for_sample.py +++ b/test/fetching_for_sample.py @@ -9,10 +9,10 @@ from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingCont from datetime import timedelta -sites = ["https://cryptopanic.com/news/"] +sites = ["https://www.gd121.cn/zx/qxzx/list.shtml", +] - -os.environ['CRAWLEE_STORAGE_DIR'] = 'test/webpage_samples/crawlee_storage' +os.environ['CRAWLEE_STORAGE_DIR'] = 'webpage_samples/crawlee_storage' save_dir = 'webpage_samples' async def main(sites: list): @@ -25,80 +25,340 @@ async def main(sites: list): @crawler.pre_navigation_hook async def log_navigation_url(context: 
PlaywrightPreNavigationContext) -> None: - context.log.info(f'navigeting {context.request.url} ...') + context.log.info(f'navigating {context.request.url} ...') @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: - # context.log.info(f'Processing {context.request.url} ...') - # Handle dialogs (alerts, confirms, prompts) await context.page.wait_for_load_state('networkidle') await context.page.wait_for_timeout(2000) - + + # Handle dialogs (alerts, confirms, prompts) async def handle_dialog(dialog): context.log.info(f'Closing dialog: {dialog.message}') await dialog.accept() context.page.on('dialog', handle_dialog) - - # 尝试查找并点击 "Accept" 按钮 - button_texts = ['Accept', 'Allow', 'Close'] - button_selectors = ['.close-btn', '.accept-button', '.allow-button'] - - # 等待弹窗出现并尝试关闭 - for text in button_texts: - try: - context.log.info(f'等待按钮: {text} 可见...') - await context.page.wait_for_selector(f'button:text("{text}")', state='visible', timeout=5000) # 等待最多5秒 - await context.page.locator(f'button:text("{text}")').click() - context.log.info(f'点击按钮: {text}') - await context.page.wait_for_timeout(1000) - except Exception as e: - context.log.error(f'未能点击按钮: {text},错误: {e}') - - for selector in button_selectors: - try: - context.log.info(f'等待选择器: {selector} 可见...') - await context.page.wait_for_selector(selector, state='visible', timeout=5000) # 等待最多5秒 - await context.page.locator(selector).click() - context.log.info(f'点击选择器: {selector}') - await context.page.wait_for_timeout(1000) - except Exception as e: - context.log.error(f'未能点击选择器: {selector},错误: {e}') + + context.log.info('successfully finish fetching') folder = os.path.join(save_dir, f"{hashlib.sha256(context.request.url.encode()).hexdigest()[-6:]}") os.makedirs(folder, exist_ok=True) + + html = await context.page.inner_html('head') + soup = BeautifulSoup(html, 'html.parser') + web_title = soup.find('title') + if web_title: + web_title = web_title.get_text().strip() + else: + web_title = '' + + base_tag = soup.find('base', href=True) + if base_tag and base_tag.get('href'): + base_url = base_tag['href'] + else: + # if no base tag, use the current url as base url + parsed_url = urlparse(context.request.url) + domain = parsed_url.netloc + base_url = f"{parsed_url.scheme}://{domain}" + html = await context.page.inner_html('body') - context.log.info('successfully finish fetching') - existing_urls = set() - parsed_url = urlparse(context.request.url) - domain = parsed_url.netloc - text = await context.page.inner_text('body') - with open(os.path.join(folder, 'text.txt'), 'w') as f: - f.write(text) + + # to use a customer scaper here soup = BeautifulSoup(html, 'html.parser') - links = soup.find_all('a', href=True) - base_url = f"{parsed_url.scheme}://{domain}" + + # 移除导航、页眉、页脚等通用元素 + for selector in ['div#nav', 'div.header', 'div#footer', 'nav', 'header', 'footer']: + elements = soup.select(selector) + for element in elements: + element.decompose() + + action_dict = {} + for form in soup.find_all('form', recursive=True): + form_dict = {} + for input_elem in form.find_all('input'): + input_type = input_elem.get('type', 'text') + input_name = input_elem.get('name', f'input_{len(action_dict)}') + input_value = ' '.join([f"{k}={v}" for k, v in input_elem.attrs.items() if k not in ['type', 'name', 'form']]) + input_dict = { + "type": input_type, + "values": [input_value] if input_value else [] + } + + # handle datalist + if input_elem.get('list'): + datalist = soup.find('datalist', id=input_elem['list']) + if 
datalist: + options = [opt.get('value', opt.text.strip()) for opt in datalist.find_all('option')] + input_dict = { + "type": "text", + "values": [f"one of followings: {options}"] + } + + form_dict[input_name] = input_dict + + for select in form.find_all('select'): + select_name = select.get('name', f'select_{len(form_dict)}') + options = [opt.get('value', opt.text.strip()) for opt in select.find_all('option')] + form_dict[select_name] = { + "type": "select", + "values": options + } + + for textarea in form.find_all('textarea'): + textarea_name = textarea.get('name', f'textarea_{len(form_dict)}') + form_dict[textarea_name] = { + "type": "textarea", + "values": [textarea.text.strip()] + } + + if form_dict: + form_id = form.get('id', f'form_{len(action_dict)}') + action_dict[form_id] = form_dict + + form.decompose() + + # handle input elements that are not in any form + for input_elem in soup.find_all('input', recursive=True): + if input_elem.find_parent('form') is None: + # check if the input is associated with a form by form attribute + form_ids = input_elem.get('form', '').split() + + # handle input element + input_type = input_elem.get('type', 'text') + input_name = input_elem.get('name', f'input_{len(action_dict)}') + input_value = ' '.join([f"{k}={v}" for k, v in input_elem.attrs.items() if k not in ['type', 'name', 'form']]) + input_dict = { + "type": input_type, + "values": [input_value] if input_value else [] + } + + # handle datalist + if input_elem.get('list'): + datalist = soup.find('datalist', id=input_elem['list']) + if datalist: + options = [opt.get('value', opt.text.strip()) for opt in datalist.find_all('option')] + input_dict = { + "type": "text", + "values": [f"one of followings: {options}"] + } + # decide the placement of the input element based on form attribute + if form_ids: + for form_id in form_ids: + if form_id in action_dict: + action_dict[form_id][input_name] = input_dict + else: + action_dict[form_id] = {input_name: input_dict} + else: + action_dict[input_name] = {"input": input_dict} + + input_elem.decompose() + + for button in soup.find_all(['button', 'input[type="button"]', 'input[type="submit"]'], recursive=True): + button_name = button.get('name', '') or button.get('id', '') or button.text.strip() + if not button_name: + button_name = f'button_{len(action_dict)}' + + button_type = button.get('type', 'button') + button_value = button.get('value', button.text.strip()) + + action_dict[button_name] = { + "button": { + "type": button_type, + "values": [button_value] if button_value else [] + } + } + + button.decompose() + + for command in soup.find_all('command', recursive=True): + command_name = command.get('name', '') or command.get('id', '') or command.text.strip() + if not command_name: + command_name = f'command_{len(action_dict)}' + + command_type = command.get('type', 'command') + command_value = command.get('value', command.text.strip()) + + action_dict[command_name] = { + "command": { + "type": command_type, + "values": [command_value] if command_value else [] + } + } + + command.decompose() + link_dict = {} - for a in links: - new_url = a.get('href') - if new_url.startswith('javascript:') or new_url.startswith('#') or new_url.startswith('mailto:'): - continue - if new_url in [context.request.url, base_url]: - continue - if new_url in existing_urls: - continue - t = a.text.strip() - if new_url and t: - link_dict[t] = urljoin(base_url, new_url) - existing_urls.add(new_url) + for img in soup.find_all('img', src=True, recursive=True): + src = img.get('src') + 
if src.startswith('#') or src.startswith('about:blank'): + src = None + text = img.get('alt', '').strip() + if src: + if not src.startswith(('http://', 'https://')): + src = urljoin(base_url, src) + key = f"url{len(link_dict)}" + link_dict[key] = src + text = f"{text}[{key}]" + + # find all area urls related to this img + area_urls = set() + if img.get('usemap'): + # remove the # at the beginning of the map name + map_name = img.get('usemap').lstrip('#') + # find the map tag + map_tag = soup.find('map', {'name': map_name}) + if map_tag: + # get all area tags under the map + for area in map_tag.find_all('area', href=True): + area_href = area.get('href') + if area_href.startswith('javascript:') or area_href.startswith('#') or area_href.startswith('mailto:') or area_href.startswith('data:') or area_href.startswith('about:blank'): + area_href = None + if area_href: + if not area_href.startswith(('http://', 'https://')): + area_href = urljoin(base_url, area_href) + area_urls.add(area_href) + area.decompose() + # delete the whole map tag + map_tag.decompose() + for area_url in area_urls: + if area_url in [context.request.url, base_url]: + continue + key = f"url{len(link_dict)}" + link_dict[key] = area_url + text = f"{text}[{key}]" + + img.replace_with(f"-{text}") + + for media in soup.find_all(['video', 'audio', 'source', 'embed', 'iframe', 'figure'], src=True, recursive=True): + src = media.get('src') + if src.startswith('javascript:') or src.startswith('#') or src.startswith('mailto:') or src.startswith('data:') or src.startswith('about:blank'): + src = None + text = media.get('alt', '').strip() or media.get_text().strip() + if src: + # convert relative path to full url + if not src.startswith(('http://', 'https://')): + src = urljoin(context.request.url, src) + key = f"url{len(link_dict)}" + link_dict[key] = src + ext = os.path.splitext(src)[1].lstrip('.') or media.name + text = f"{text}<{ext}>[{key}]" + + media.replace_with(f"-{text}") + + for obj in soup.find_all('object', data=True, recursive=True): + data = obj.get('data') + if data.startswith('javascript:') or data.startswith('#') or data.startswith('mailto:') or data.startswith('data:') or data.startswith('about:blank'): + data = None + text = obj.get('title', '').strip() or obj.get_text().strip() + if data: + # convert relative path to full url + if not data.startswith(('http://', 'https://')): + data = urljoin(context.request.url, data) + key = f"url{len(link_dict)}" + link_dict[key] = data + ext = os.path.splitext(data)[1].lstrip('.') or 'object' + text = f"{text}<{ext}>[{key}]" + + obj.replace_with(f"-{text}") + + # process links at last, so that we can keep the image and media info in the link + for a in soup.find_all('a', href=True, recursive=True): + href = a.get('href') + if href.startswith('javascript:') or href.startswith('#') or href.startswith('mailto:') or href.startswith('data:') or href.startswith('about:blank'): + href = None + if href: + text = a.get_text().strip() or '-' + if not href.startswith(('http://', 'https://')): + href = urljoin(context.request.url, href) + if href in [context.request.url, base_url]: + continue + key = f"url{len(link_dict)}" + link_dict[key] = href + a.replace_with(f"{text}[{key}]") + + # handle headings + for i in range(1, 7): # h1 到 h6 + for heading in soup.find_all(f'h{i}', recursive=False): + text = heading.get_text().strip() + heading.replace_with(f"{'#' * i} {text}\n") + + # replace all
<br> and <hr>
tags with newlines + for br in soup.find_all(['br', 'br/', 'br /', 'hr', 'hr/', 'hr /', 'wbr'], recursive=True): + br.replace_with('\n') + + # handle lists + for list_tag in soup.find_all(['ul', 'ol'], recursive=True): + list_text = [] + for idx, item in enumerate(list_tag.find_all('li')): + list_text.append(f"{idx + 1}. {item.get_text().strip()}") + list_text = '\t'.join(list_text) + list_tag.replace_with(f"{list_text}\n") + + # handle spans - merge span text with surrounding text + for span in soup.find_all('span', recursive=True): + span.replace_with(span.get_text().strip()) + + # handle strikethrough text + for del_tag in soup.find_all(['del', 's'], recursive=True): + del_text = del_tag.get_text().strip() + if del_text: + del_tag.replace_with(f"{del_text}(maybe_outdated)") + else: + del_tag.decompose() + + # handle tables + for table in soup.find_all('table', recursive=True): + table_text = [] + + # handle caption + caption = table.find('caption') + if caption: + table_text.append(caption.get_text().strip()) + + # get headers + headers = [] + for th in table.find_all('th'): + headers.append(th.get_text().strip()) + + # handle all rows (including tbody and tfoot) + for row in table.find_all('tr'): + # get the first cell value + # try to find th as first_val + first_cell = row.find(['th', 'td']) + if not first_cell: + continue + first_val = first_cell.get_text().strip() + cells = row.find_all('td') + if not cells: + continue + + # handle remaining cells + for idx, cell in enumerate(cells): + cell_text = cell.get_text().strip() + if not cell_text or cell_text == first_val: + continue + + header_text = headers[idx] if idx < len(headers) else '' + cell_str = f"{first_val}-{header_text}-{cell_text}" + table_text.append(cell_str) + + # replace the table with the processed text + table_text = '\n'.join(table_text) + table.replace_with(f"\n{table_text}\n") + + html_text = soup.get_text(strip=False, separator='\n') + + with open(os.path.join(folder, 'text.txt'), 'w') as f: + f.write(html_text) + with open(os.path.join(folder, 'link_dict.json'), 'w', encoding='utf-8') as f: json.dump(link_dict, f, indent=4, ensure_ascii=False) + + with open(os.path.join(folder, 'action_dict.json'), 'w', encoding='utf-8') as f: + json.dump(action_dict, f, indent=4, ensure_ascii=False) - links_number_from_html = len(link_dict) - print(f"links number from html: {links_number_from_html}") - - screenshot_file = os.path.join(folder, 'screenshot.jpg') - await context.page.screenshot(path=screenshot_file, full_page=True) + # screenshot_file = os.path.join(folder, 'screenshot.jpg') + # await context.page.screenshot(path=screenshot_file, full_page=True) await crawler.run(sites) diff --git a/test/find_article_or_list.py b/test/find_article_or_list.py new file mode 100644 index 0000000..047c3e6 --- /dev/null +++ b/test/find_article_or_list.py @@ -0,0 +1,76 @@ +# -*- coding: utf-8 -*- + +import os, re +import json +import time + + +sample_dir = 'webpage_samples' +list_judge_threshold = 0.007 +valid_list_min_length = 10 +min_content_length = 420 + +common_file_exts = [ + 'jpg', 'jpeg', 'png', 'gif', 'pdf', 'doc', 'docx', 'svg', 'm3u8', + 'mp4', 'mp3', 'wav', 'avi', 'mov', 'wmv', 'flv', 'webp', 'webm', + 'zip', 'rar', '7z', 'tar', 'gz', 'bz2', + 'txt', 'csv', 'xls', 'xlsx', 'ppt', 'pptx', + 'json', 'xml', 'yaml', 'yml', 'css', 'js', 'php', 'asp', 'jsp' +] +common_tlds = [ + '.com', '.cn', '.net', '.org', '.edu', '.gov', '.io', '.co', + '.info', '.biz', '.me', '.tv', '.cc', '.xyz', '.app', '.dev', + '.cloud', '.ai', '.tech', 
'.online', '.store', '.shop', '.site', + '.top', '.vip', '.pro', '.ltd', '.group', '.team', '.work' +] + +def find_article_or_list(link_dict, text) -> (bool, bool, str): + lines = [l.strip() for l in text.split('\n') if l.strip()] + text = '\n'.join(lines) + text_no_tags = re.sub(r'<\w{1,5}>', '', text) + text_no_urls = re.sub(r'\[url\d+]', '', text_no_tags) + content_length = len(text_no_urls) + + valid_url_count = 0 + for url in link_dict.values(): + url_lower = url.lower() + has_common_ext = any(url_lower.endswith(ext) for ext in common_file_exts) + has_common_tld = any(url_lower.endswith(tld) for tld in common_tlds) + if not has_common_ext and not has_common_tld: + valid_url_count += 1 + + valid_url_rate = valid_url_count / content_length + is_list = valid_url_rate > 0.007 and valid_url_count > valid_list_min_length + need_more_info = content_length < min_content_length + return is_list, need_more_info, text + + +if __name__ == '__main__': + dirs = os.listdir(sample_dir) + for _dir in dirs: + if not _dir.startswith('task'): + continue + _path = os.path.join(sample_dir, _dir) + if not os.path.isdir(_path): + continue + + samples = os.listdir(_path) + time_stamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime()) + record_file = os.path.join(_path, f'article_or_list_judge.txt') + for sample in samples: + if not os.path.isdir(os.path.join(_path, sample)): + continue + files = os.listdir(os.path.join(_path, sample)) + if 'link_dict.json' not in files or 'text.txt' not in files: + print(f'{sample} files not complete, skip') + continue + link_dict = json.load(open(os.path.join(_path, sample, 'link_dict.json'), 'r')) + text = open(os.path.join(_path, sample, 'text.txt'), 'r').read() + is_list, need_more_info, text = find_article_or_list(link_dict, text) + with open(record_file, 'a') as f: + f.write(f"raw materials: {sample}\n\n") + f.write(f"cleaned text: \n{text}\n\n") + f.write("list\n" if is_list else "article\n") + f.write("need more info\n" if need_more_info else "no need more info\n") + f.write("*" * 12) + f.write('\n\n') diff --git a/test/get_info_test.py b/test/get_info_test.py index 3423023..d95e23c 100644 --- a/test/get_info_test.py +++ b/test/get_info_test.py @@ -3,113 +3,140 @@ import os, re import json import asyncio -import time, base64 -from info_test_prompts import * +import time +from prompts import * import json_repair -from llms.openai_wrapper import openai_llm as llm +from openai_wrapper import openai_llm as llm +from find_article_or_list import find_article_or_list, common_tlds, common_file_exts - -sample_dir = 'test/webpage_samples' +sample_dir = 'webpage_samples' models = ['deepseek-ai/DeepSeek-V2.5', 'Qwen/Qwen2.5-Coder-32B-Instruct', 'Qwen/Qwen2.5-32B-Instruct', 'Qwen/Qwen2.5-14B-Instruct', 'Qwen/Qwen2.5-Coder-7B-Instruct'] -vl_models = ['Qwen/Qwen2-VL-72B-Instruct', 'OpenGVLab/InternVL2-26B', 'TeleAI/TeleMM', 'Pro/Qwen/Qwen2-VL-7B-Instruct', 'Pro/OpenGVLab/InternVL2-8B', 'OpenGVLab/InternVL2-Llama3-76B'] +secondary_mpdel = 'Qwen/Qwen2.5-7B-Instruct' +vl_model = '' + +async def generate_results(text, model, system_prompt, suffix_prompt) -> set: + lines = text.split('\n') + cache = set() + text_batch = '' + for line in lines: + text_batch = f'{text_batch}\n{line}' + if len(text_batch) > 1024: + content = f'\n{text_batch}\n\n\n{suffix_prompt}' + result = await llm( + [{'role': 'system', 'content': system_prompt}, {'role': 'user', 'content': content}], + model=model, temperature=0.1) + print(f"llm output: {result}") + result = re.findall(r'\"\"\"(.*?)\"\"\"', result, 
re.DOTALL) + if not result: + print(f"warning: bad generate result") + text_batch = '' + continue + result = result[0].strip() + result = result.split('\n') + cache.update(result) + text_batch = '' + + if text_batch: + content = f'\n{text_batch}\n\n\n{suffix_prompt}' + result = await llm( + [{'role': 'system', 'content': system_prompt}, {'role': 'user', 'content': content}], + model=model, temperature=0.1) + print(f"llm output: {result}") + result = re.findall(r'\"\"\"(.*?)\"\"\"', result, re.DOTALL) + if not result: + print(f"warning: bad generate result") + return cache + result = result[0].strip() + result = result.split('\n') + cache.update(result) + return cache + + +async def extract_info_from_img(text, link_dict) -> str: + cache = {} + pattern = r'\[url\d+\]' + matches = re.findall(pattern, text) + for match in matches: + key = match.split('[url')[1][:-1] + url = link_dict.get(f'url{key}', '') + if not url: + continue + + if url in cache: + replace_text = cache[url] + else: + if any(url.lower().endswith(tld) for tld in common_tlds): + continue + if any(url.lower().endswith(ext) for ext in common_file_exts if ext not in ['jpg', 'jpeg', 'png']): + continue + llm_output = await llm([{"role": "user", + "content": [{"type": "image_url", "image_url": {"url": url, "detail": "high"}}, + {"type": "text", "text": image_system}]}], model='OpenGVLab/InternVL2-26B') + print(f"vl model output: \n{llm_output}\n") + replace_text = llm_output + cache[url] = replace_text + text = text.replace(match, f'{replace_text}{match}', 1) + return text + + +async def main(link_dict, text, record_file, prompts): + is_list, need_more_info, text = find_article_or_list(link_dict, text) + + if is_list: + print("may be a article list page, get more urls ...") + system_prompt = prompts[1] + suffix_prompt = text_link_suffix + else: + if need_more_info: + print("may be a article page need to get more text from images...") + text = await extract_info_from_img(text, link_dict) + print(f"extended text: \n{text}\n") + + system_prompt = prompts[0] + suffix_prompt = text_info_suffix -async def main(link_dict, text, screenshot_file, record_file, prompts): for model in models: print(f"running {model} ...") start_time = time.time() hallucination_times = 0 - # got more links from text - # more_urls = set() - more_url_text = set() - content = '' - for key in link_dict.keys(): - content = f"{content}{key}\n" - if len(content) > 512: - result = await llm([{'role': 'system', 'content': prompts[1]}, - {'role': 'user', 'content': f'\n{content}\n\n\n{text_link_suffix}'}], - model=model, temperature=0.1) - print(f"llm output: {result}") - result = re.findall(r'"""(.*?)"""', result, re.DOTALL) - if result: - result = result[0].strip() - result = result.split('\n') - # more_urls.update({link_dict[t] for t in result if t in link_dict}) - more_url_text.update({f"{t}: {link_dict[t]}" for t in result if t in link_dict}) - else: - hallucination_times += len(result) - len({t for t in result if t in link_dict}) - content = '' - - if content: - result = await llm([{'role': 'system', 'content': prompts[1]}, - {'role': 'user', 'content': f'\n{content}\n\n\n{text_link_suffix}'}], - model=model, temperature=0.1) - print(f"llm output: {result}") - result = re.findall(r'"""(.*?)"""', result, re.DOTALL) - if result: - result = result[0].strip() - result = result.split('\n') - # more_urls.update({link_dict[t] for t in result if t in link_dict}) - more_url_text.update({f"{t}: {link_dict[t]}" for t in result if t in link_dict}) - else: - hallucination_times 
+= len(result) - len({t for t in result if t in link_dict}) - - more_url_text = '\n'.join(more_url_text) - print(f"time spent: {time.time() - start_time}") - - # get infos from text - infos = [] - lines = text.split('\n') - cache = '' - for line in lines: - cache = f'{cache}{line}' - if len(cache) > 2048: - content = f'\n{cache}\n\n\n{text_info_suffix}' - result = await llm( - [{'role': 'system', 'content': prompts[0]}, {'role': 'user', 'content': content}], - model=model, temperature=0.1, response_format={"type": "json_object"}) - print(f"llm output: {result}") - cache = '' - if not result: - hallucination_times += 1 - continue - result = json_repair.repair_json(result, return_objects=True) - if not isinstance(result, list): - hallucination_times += 1 - continue - if not result: - hallucination_times += 1 - continue - infos.extend(result) - - if cache: - content = f'\n{cache}\n\n\n{text_info_suffix}' - result = await llm([{'role': 'system', 'content': prompts[0]}, {'role': 'user', 'content': content}], - model=model, temperature=0.1, response_format={"type": "json_object"}) - print(f"llm output: {result}") - if not result: - hallucination_times += 1 - result = json_repair.repair_json(result, return_objects=True) - if not isinstance(result, list): - hallucination_times += 1 - if not result: - hallucination_times += 1 - infos.extend(result) - final_infos = [] - for item in infos: - if 'focus' not in item or 'content' not in item: - hallucination_times += 1 - continue - if not item['content']: - hallucination_times += 1 - continue - if item['content'] in link_dict: - continue + raw_result = await generate_results(text, model, system_prompt, suffix_prompt) + final_result = set() + for item in raw_result: + if is_list: + if '[url' not in item: + hallucination_times += 1 + continue + # 从item中提取[]中的url标记 + url_tag = re.search(r'\[(.*?)]', item).group(1) + if url_tag not in link_dict: + hallucination_times += 1 + continue + result_url = link_dict[url_tag] + if any(result_url.lower().endswith(tld) for tld in common_tlds): + continue + if any(result_url.lower().endswith(ext) for ext in common_file_exts): + continue + final_result.add(item) + else: + result = json_repair.repair_json(item, return_objects=True) + if not isinstance(result, dict): + hallucination_times += 1 + continue + if not result: + hallucination_times += 1 + continue + if 'focus' not in result or 'content' not in result: + hallucination_times += 1 + continue + if not result['content'].strip() or not result['focus'].strip(): + hallucination_times += 1 + continue + if result['focus'].startswith('#'): + result['focus'] = result['focus'][1:] + final_result.add(result) - final_infos.append(f"{item['focus']}: {item['content']}") - - final_infos = '\n'.join(final_infos) - print(f"time spent: {time.time() - start_time}") + final_infos = '\n'.join(final_result) # get author and publish date from text if len(text) > 1024: @@ -142,7 +169,7 @@ async def main(link_dict, text, screenshot_file, record_file, prompts): f.write(f"total analysis time: {total_analysis_time}\n\n") f.write(f"author and publish time(not formated): {ap_}\n") f.write(f"infos(not formated): \n{final_infos}\n") - f.write(f"more urls: \n{more_url_text}\n\n") + #f.write(f"more urls: \n{more_url_text}\n\n") f.write("*" * 12) f.write('\n\n') @@ -150,7 +177,7 @@ async def main(link_dict, text, screenshot_file, record_file, prompts): if __name__ == '__main__': dirs = os.listdir(sample_dir) for _dir in dirs: - if not _dir.startswith('task'): + if not _dir.startswith('task0'): 
continue _path = os.path.join(sample_dir, _dir) if not os.path.isdir(_path): @@ -168,11 +195,8 @@ if __name__ == '__main__': focus_statement = f"{focus_statement}解释:{expl}\n" print(f'start testing {_dir}') - print(f"focus statement: {focus_statement}") get_info_system = text_info_system.replace('{focus_statement}', focus_statement) get_link_system = text_link_system.replace('{focus_statement}', focus_statement) - #get_info_system = image_info_system.replace('{focus_statement}', focus_statement) - #get_link_system = image_link_system.replace('{focus_statement}', focus_statement) prompts = [get_info_system, get_link_system] samples = os.listdir(_path) @@ -184,130 +208,11 @@ if __name__ == '__main__': if not os.path.isdir(os.path.join(_path, sample)): continue files = os.listdir(os.path.join(_path, sample)) - if 'link_dict.json' not in files or 'text.txt' not in files or 'screenshot.jpg' not in files: + if 'link_dict.json' not in files or 'text.txt' not in files: print(f'{sample} files not complete, skip') continue link_dict = json.load(open(os.path.join(_path, sample, 'link_dict.json'), 'r')) text = open(os.path.join(_path, sample, 'text.txt'), 'r').read() - screenshot_file = os.path.join(_path, sample, 'screenshot.jpg') with open(record_file, 'a') as f: f.write(f"raw materials: {sample}\n\n") - asyncio.run(main(link_dict, text, screenshot_file, record_file, prompts)) -""" - with open(screenshot_file, "rb") as image_file: - base64_image = base64.b64encode(image_file.read()).decode('utf-8') - print(f"run {model} testing...") - start_time = time.time() - hallucination_times = 0 - - # get infos from image - _infos = [] - llm_output = await llm([{"role": "system", "content": [{"type": "text", "text": image_info_system}]}, - {"role": "user", "content": [{"type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{base64_image}", - "detail": "high"}}, - {"type": "text", "text": image_info_suffix}]}], - model=model, - temperature=0.1) - - print(f"vl model output: \n{llm_output}") - if not llm_output: - hallucination_times += 1 - result = [] - else: - result = json_repair.repair_json(llm_output, return_objects=True) - if not isinstance(result, list): - hallucination_times += 1 - result = [] - if not result: - hallucination_times += 1 - _infos.extend(result) - - final_infos = [] - for item in _infos: - if 'focus' not in item or 'content' not in item: - hallucination_times += 1 - continue - if not item['content']: - hallucination_times += 1 - continue - - if item['content'] in link_dict: - continue - - judge = await llm([{'role': 'system', 'content': verified_system}, - {'role': 'user', - 'content': f'\n{item["content"]}\n\n\n\n{text}\n\n\n{verified_suffix}'}], - model="THUDM/glm-4-9b-chat", temperature=0.1) - if not judge: - print('scondary model cannot judge') - final_infos.append(item) - continue - - to_save = False - for i in range(min(7, len(judge))): - char = judge[-1 - i] - if char == '是': - to_save = True - break - elif char == '否': - break - if not to_save: - hallucination_times += 1 - continue - final_infos.append(item) - - print(f"final infos from image: {final_infos}") - print(f"image hallucination times: {hallucination_times}") - print(f"time used: {time.time() - start_time}") - - # get links from image - more_links = set() - llm_output = await llm([{"role": "system", "content": [{"type": "text", "text": image_link_system}]}, - {"role": "user", "content": [{"type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{base64_image}", - "detail": "high"}}, - 
{"type": "text", "text": image_link_suffix}]}], - model=model, - temperature=0.1) - print(f"vl model output: \n{llm_output}") - result = re.findall(r'\"\"\"(.*?)\"\"\"', llm_output, re.DOTALL) - if result: - result = result[0].strip() - else: - hallucination_times += 1 - result = [] - - more_links = [link_dict[_t] for _t in result if _t in link_dict] - print(f"more urls by image: {more_links}") - print(f"image hallucination times: {hallucination_times}") - print(f"time used: {time.time() - start_time}") - - # get author and publish date from image - llm_output = await llm([{"role": "system", "content": [{"type": "text", "text": image_ap_system}]}, - {"role": "user", "content": [{"type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{base64_image}", - "detail": "high"}}, - {"type": "text", "text": image_ap_suffix}]}], - model=model, - max_tokens=50, temperature=0.1) - print(f"vl model output: \n{llm_output}") - if not llm_output: - hallucination_times += 1 - ap = {} - else: - result = json_repair.repair_json(llm_output, return_objects=True) - if not isinstance(result, dict): - hallucination_times += 1 - ap = {} - else: - ap = result - - print(f"ap from image: {ap}") - print(f"image hallucination times: {hallucination_times}") - total_analysis_time = time.time() - start_time - print(f"image analysis finished, total time used: {total_analysis_time}") -""" \ No newline at end of file + asyncio.run(main(link_dict, text, record_file, prompts)) diff --git a/test/openai_wrapper.py b/test/openai_wrapper.py new file mode 100644 index 0000000..0457d28 --- /dev/null +++ b/test/openai_wrapper.py @@ -0,0 +1,48 @@ +import os +from openai import OpenAI +from openai import RateLimitError +import asyncio + + +base_url = os.environ.get('LLM_API_BASE', "") +token = os.environ.get('LLM_API_KEY', "") + +if not base_url and not token: + raise ValueError("LLM_API_BASE or LLM_API_KEY must be set") +elif base_url and not token: + client = OpenAI(base_url=base_url, api_key="not_use") +elif not base_url and token: + client = OpenAI(api_key=token) +else: + client = OpenAI(api_key=token, base_url=base_url) + +llm_lock = asyncio.Lock() + +async def openai_llm(messages: list, model: str, logger=None, **kwargs) -> str: + if logger: + logger.debug(f'messages:\n {messages}') + logger.debug(f'model: {model}') + logger.debug(f'kwargs:\n {kwargs}') + + async with llm_lock: + try: + response = client.chat.completions.create(messages=messages, model=model, **kwargs) + except RateLimitError as e: + logger.warning(f'{e}\nRetrying in 60 second...') + await asyncio.sleep(60) + response = client.chat.completions.create(messages=messages, model=model, **kwargs) + if response.status_code == 200 and response.choices: + return response.choices[0].message.content + else: + logger.error(f'after many try, llm error: {response}') + return "" + except Exception as e: + if logger: + logger.error(f'openai_llm error: {e}') + return '' + + if logger: + logger.debug(f'result:\n {response.choices[0]}') + logger.debug(f'usage:\n {response.usage}') + + return response.choices[0].message.content diff --git a/test/info_test_prompts.py b/test/prompts.py similarity index 54% rename from test/info_test_prompts.py rename to test/prompts.py index 3b67ad0..c118611 100644 --- a/test/info_test_prompts.py +++ b/test/prompts.py @@ -1,30 +1,40 @@ -text_info_system = '''作为信息提取助手,你的任务是从给定的网页文本中提取与以下用户兴趣点相关的内容。兴趣点列表及其解释如下: +text_info_system = '''作为信息提取助手,你的任务是从给定的网页文本中抽取任何与下列关注点之一相关的信息。关注点列表及其解释如下: {focus_statement}\n 在进行信息提取时,请遵循以下原则: -- 
理解每个兴趣点的含义,确保提取的内容与之相关。 -- 如果兴趣点有进一步的解释,确保提取的内容符合这些解释的范围。 -- 忠于原文,你的任务是从网页文本中识别和提取与各个兴趣点相关的信息,并不是总结和提炼。 +- 理解每个关注点的含义,确保提取的内容至少与其中之一相关 +- 如果关注点有进一步的解释,确保提取的内容符合这些解释的范围 +- 忠于原文,你的任务是从网页文本中抽取相关信息,而不是提炼、总结和改写 +- 对于最终输出的信息,请保证主体、时间、地点等关键要素的清晰明确,为此可能需要综合上下文进行提取 +- 如果提取的内容中包括类似“”、“[url1]”这样的片段,务必保留''' -另外请注意给定的网页文本是通过爬虫程序从html代码中提取出来的,所以请忽略里面不必要的空格、换行符等。''' +text_info_suffix = '''请先复述一遍关注点及其解释,再对原文逐行进行分析。 +如果网页文本中包含关注点相关的内容,请按照以下json格式输出提取的信息: +{"focus": 关注点名称, "content": 提取的内容} -text_info_suffix = '''如果上述网页文本中包含兴趣点相关的内容,请按照以下json格式输出提取的信息(文本中可能包含多条有用信息,请不要遗漏): -[{"focus": 兴趣点名称, "content": 提取的内容}] +如果有多条相关信息,请按一行一条的格式输出,最终输出的结果整体用三引号包裹,三引号内不要有其他内容,如下是输出格式示例: +""" +{"focus": 关注点1名称, "content": 提取内容1} +{"focus": 关注点2名称, "content": 提取内容2} +... +""" -示例: -[{"focus": "旅游景点", "content": "北京故宫,地址:北京市东城区景山前街4号,开放时间:8:30-17:00"}, {"focus": "美食推荐", "content": "来王府井小吃街必吃北京烤鸭、炸酱面"}] +如果网页文本中不包含任何相关的信息,请保证三引号内为空。''' -如果网页文本中不包含任何与兴趣点相关的信息,请仅输出:[]。''' +text_link_system = '''你将被给到一段处理过的网页文本,在这些文本中所有的url链接都已经被替换为类似"[url120]"这样的标签,并置于与其关联的文本后面。 +你的任务是从网页文本中抽取任何与下列关注点之一相关的文本片段。关注点列表及其解释如下: -text_link_system = '''作为一位高效的信息筛选助手,你将被给到一组链接对应的文本,请从中挑选出跟兴趣点有关的文本。兴趣点及其解释如下:\n\n{focus_statement}\n -在进行信息提取时,请遵循以下原则: +{focus_statement}\n +在进行抽取时,请遵循以下原则: -- 理解每个兴趣点的含义,确保提取的文本与之相关。 -- 如果兴趣点有进一步的解释,确保提取的文本符合这些解释的范围。''' +- 理解每个关注点的含义,确保提取的内容至少与其中之一相关 +- 如果关注点有进一步的解释,确保提取的内容符合这些解释的范围 +- 只抽取以标签(类似"[url120]"这样)结尾的文本片段 +- 维持抽取出的文本片段的原样,尤其不要遗漏其后的标签''' -text_link_suffix = '''请一步步思考,最终将挑选出的文本按一行一条的格式输出,并整体用三引号包裹,三引号内不要有其他内容,如下是输出格式示例: +text_link_suffix = '''请先复述一遍关注点及其解释,再对原文逐行进行抽取,最终将挑选出的文本片段按一行一条的格式输出,并整体用三引号包裹,三引号内不要有其他内容,如下是输出格式示例: """ 文本1 文本2 @@ -57,7 +67,7 @@ image_info_suffix = '''如果网页截屏中包含兴趣点相关的内容,请 示例: [{"focus": "旅游景点", "content": "北京故宫,地址:北京市东城区景山前街4号,开放时间:8:30-17:00"}, {"focus": "美食推荐", "content": "来王府井小吃街必吃北京烤鸭、炸酱面"}] -如果截屏中不包含任何与兴趣点相关的信息,请仅输出:[]。''' +如果截屏中不包含任何与兴趣点相关的信息或者你判断这是一个文章列表页面,请仅输出:[]。''' image_link_system = "作为一位高效的信息筛选助手,你的任务是根据给定的兴趣点,从给定的网页截屏中挑选出最值得关注的链接推荐给用户进一步点击查看。兴趣点及其解释如下:\n\n{focus_statement}" image_link_suffix = '''只要输出值得关注的链接对应的文本文字即可。按一行一条的格式输出,最终输出的列表整体用三引号包裹,三引号内不要有其他内容,如下是输出格式示例: @@ -71,3 +81,5 @@ image_ap_system = "As an information extraction assistant, your task is to accur image_ap_suffix = '''Please output the extracted information in the following JSON format: {"source": source or article author (use "NA" if this information cannot be found), "publish_date": publication date (keep only the year, month, and day; use "NA" if this information cannot be found)}''' +image_system = "提取图片中的所有文字,如果图片不包含文字或者文字很少或者你判断图片仅是网站logo、商标、图标等,则输出NA。注意请仅输出提取出的文字,不要输出别的任何内容。" +image_system_en = "Extract all text from the image. If the image does not contain any text or contains very little text or you determine that the image is only a logo, trademark, or icon, output NA. Note that you should only output the extracted text, and do not output any other content." 
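The reworked text_info_suffix and text_link_suffix above ask the model to wrap its whole answer in triple quotes, one item per line, with each info item emitted as a small JSON object. A minimal parsing sketch of that contract, mirroring the regex and json_repair handling already used in test/get_info_test.py (the helper names parse_triple_quoted and parse_info_lines are illustrative, not part of this change):

```python
import re
import json_repair

def parse_triple_quoted(llm_output: str) -> list:
    """Pull out the lines the model wrapped in triple quotes."""
    blocks = re.findall(r'"""(.*?)"""', llm_output, re.DOTALL)
    if not blocks:
        return []  # the test harness treats a missing block as a bad generation
    return [line.strip() for line in blocks[0].split('\n') if line.strip()]

def parse_info_lines(lines: list) -> list[dict]:
    """Each info line is expected to be a JSON object with 'focus' and 'content'."""
    results = []
    for line in lines:
        obj = json_repair.repair_json(line, return_objects=True)
        if isinstance(obj, dict) and obj.get('focus') and obj.get('content'):
            results.append(obj)
    return results
```

Lines that fail to parse, or that lack a non-empty focus/content pair, are exactly what the test harness counts toward hallucination_times.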
diff --git a/test/vl_pic_test.py b/test/vl_pic_test.py new file mode 100644 index 0000000..8b99ed3 --- /dev/null +++ b/test/vl_pic_test.py @@ -0,0 +1,43 @@ +import asyncio +import time +from prompts import image_system, image_system_en +from openai_wrapper import openai_llm as llm + + +vl_models = ['Qwen/Qwen2-VL-72B-Instruct', 'OpenGVLab/InternVL2-26B', 'Pro/Qwen/Qwen2-VL-7B-Instruct', 'Pro/OpenGVLab/InternVL2-8B', 'deepseek-ai/deepseek-vl2'] +pic_url_test_list = ["http://wx.qlogo.cn/mmhead/Q3auHgzwzM55VjAUib4ibtDJzRJYl2Cn7gptSxwhmyyvdBwkS9SwUQtQ/0", +"http://mmbiz.qpic.cn/mmbiz_png/Oe1ibnzkdE2PQc84CVcyiaW9Cw7KssCq2dGXrHsRxscWHySXrTkaLBJ5Jw7ztaRE9d3l5yayXfDAYmDXRFuqyLAA/0?wx_fmt=png", +"http://mmbiz.qpic.cn/mmbiz_jpg/DhKPeHFI5HhgEgSGl8CMdNgo3dovxjhnCKLukmF18OtpHDE9IcwlyNT0xTQ28oFrfa4tDW4yQSicOpFY3SNCd5w/0?wx_fmt=jpeg", +"http://mmbiz.qpic.cn/mmbiz_png/CM7KBM0HLAiaj8f0bEAIa9EfPtI8Kd374zjaiaRTiaz8z2CMyJZDtnaAekuK4bEBllicqiclPUh87SeeAcfEvpUWgYA/0?wx_fmt=png", +"http://wx.qlogo.cn/mmhead/Q3auHgzwzM4Rq0U14VV5UicYPnWw8I9eZ8g6TJ2ltAROQcBxbsxwVKg/0", +"http://mmbiz.qpic.cn/sz_mmbiz_png/Bcdib1U6AjwVmSic6l8qbibZfvensdLfcjmNlpz8wjm3cgwJibwXaAgzuGU7vYXDnsJ3fbgOUFHtNQH4iaBGBm43iccg/0?wx_fmt=png", +"https://mmbiz.qpic.cn/mmbiz_png/fRg3eJSOt2ur70INyK0A4etnkPmZnicOhKcT07w4keGiahyh7RbMgwATwNTUxjVypeKkd6C9syHmwE1WFIrXedcg/640?wxfrom=12&tp=wxpic&usePicPrefetch=1&wx_fmt=png&from=appmsg", +"https://img.36krcdn.com/hsossms/20241221/v2_40c28bcceafc4905b8612d6dce7a6a2a@000000_oswg116731oswg1280oswg545_img_000?x-oss-process=image/resize,m_mfit,w_600,h_400,limit_0/crop,w_600,h_400,g_center", +"http://mmbiz.qpic.cn/mmbiz_png/K85bvE9rzFOgDvibAsz4S0sZqv4O8spfH2mhvOMWicLDRMib7xiaWTMhGnAmXK7qoxQafrSw4XH0r88XbJ6aVAydqw/300?wx_fmt=png", +"https://bootcdn.xuexi.cn/18600410326/bd19863611993ad460d1c23fa910fc00.png", +"https://bootcdn.xuexi.cn/18600410326/69830c9e173b5374aa9b6de43a912e4d.png", +"https://bootcdn.xuexi.cn/18600410326/0458c43bba70d60ca77d6f158835dd6c.png", +"https://bootcdn.xuexi.cn/18600410326/1398b93f1f4273536e56e8899ad46d17.png", +"https://bootcdn.xuexi.cn/18600410326/963274d57bd3c27e3c262984887c9e48.png", +] + +async def extract_text_from_url(url): + for model in vl_models: + print(f"running {model} ...\n") + start_time = time.time() + llm_output = await llm([{"role": "user", + "content": [{"type": "image_url", "image_url": {"url": url, "detail": "high"}}, + {"type": "text", "text": image_system}]}], model=model) + print(f"cn prompt output: \n{llm_output}\n") + print(f"time spent: {time.time() - start_time}\n") + start_time = time.time() + llm_output = await llm([{"role": "user", + "content": [{"type": "image_url", "image_url": {"url": url, "detail": "high"}}, + {"type": "text", "text": image_system_en}]}], model=model) + print(f"en prompt output: \n{llm_output}\n") + print(f"time spent: {time.time() - start_time}\n") + +if __name__ == '__main__': + for url in pic_url_test_list: + print(f"testing {url} ...\n") + asyncio.run(extract_text_from_url(url))
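Both test scripts drive models through the new test/openai_wrapper.py client in the same way: plain chat messages for the text models, and an image_url content part plus the image_system prompt for the VL models, with LLM_API_BASE / LLM_API_KEY read from the environment. A minimal usage sketch under those assumptions (the system prompt text and image URL below are placeholders, and the model names are simply two of the ones already listed in this diff):

```python
import asyncio
from openai_wrapper import openai_llm
from prompts import image_system

async def demo():
    # plain text completion, as in get_info_test.py
    answer = await openai_llm(
        [{'role': 'system', 'content': 'You are a helpful assistant.'},
         {'role': 'user', 'content': 'ping'}],
        model='Qwen/Qwen2.5-7B-Instruct', temperature=0.1)
    print(answer)

    # vision call, as in vl_pic_test.py: the image is passed by URL
    ocr = await openai_llm(
        [{'role': 'user',
          'content': [{'type': 'image_url',
                       'image_url': {'url': 'https://example.com/sample.png', 'detail': 'high'}},
                      {'type': 'text', 'text': image_system}]}],
        model='OpenGVLab/InternVL2-26B')
    print(ocr)

if __name__ == '__main__':
    asyncio.run(demo())
```

Because the wrapper serializes every call behind an asyncio.Lock and retries once after a RateLimitError, the test scripts effectively run their model calls sequentially, which keeps the per-model timing numbers roughly comparable.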