# wiseflow/test/get_visual_info_for_samples.py

import os, sys
import asyncio

# Put the parent of this script's directory on the import path; presumably
# this is what lets the `core` package imported below resolve when the script
# is run directly.
current_dir = os.path.split(os.path.abspath(__file__))[0]
project_root = os.path.split(current_dir)[0]
sys.path.append(project_root)

from core.llms.openai_wrapper import openai_llm as llm

async def main(task: list):
    """Ask the vision model to read the text in each image URL.

    The model name is taken from the ``VL_MODEL`` environment variable. When
    it is unset or empty, a message is printed and the process exits with
    status 1. URLs are handled one at a time, in the order given.

    Args:
        task: image URLs to send to the model.

    Returns:
        A dict mapping each URL to whatever the model call returned for it.
    """
    model_name = os.environ.get("VL_MODEL", "")
    if not model_name:
        print("错误: VL_MODEL not set, will skip extracting info from img, some info may be lost!")
        sys.exit(1)

    # Instruction sent with every image: extract all text, or answer "NA" for
    # images with little/no text or that are just logos, trademarks or icons.
    prompt = "提取图片中的所有文字，如果图片不包含文字或者文字很少或者你判断图片仅是网站logo、商标、图标等，则输出NA。注意请仅输出提取出的文字，不要输出别的任何内容。"

    def build_messages(image_url):
        # One user turn carrying the image reference followed by the prompt.
        image_part = {"type": "image_url", "image_url": {"url": image_url, "detail": "high"}}
        text_part = {"type": "text", "text": prompt}
        return [{"role": "user", "content": [image_part, text_part]}]

    recognized = {}
    for image_url in task:
        recognized[image_url] = await llm(build_messages(image_url), model=model_name)
    return recognized


if __name__ == '__main__':
    # Command-line driver.
    #
    # For every input file whose name ends with 'sample.json' (given directly
    # via --test_file and/or found in --sample_dir), find the
    # '§to_be_recognized_by_visual_llm_...§' markers in the sample's link
    # descriptions and text, ask the vision model (via main()) for the text in
    # each referenced image, substitute the answers in, and write the result
    # next to the input as '<name>_recognized.json'.
    import argparse
    import time
    import json
    import re

    parser = argparse.ArgumentParser()
    parser.add_argument('--test_file', '-F', type=str, default='')
    parser.add_argument('--sample_dir', '-D', type=str, default='')
    args = parser.parse_args()

    test_file = args.test_file
    sample_dir = args.sample_dir

    files = []
    if test_file:
        files.append(test_file)

    if sample_dir:
        files.extend(os.path.join(sample_dir, name) for name in os.listdir(sample_dir))

    # Compiled once; the same marker format is used for every file.
    pattern = re.compile(r'§to_be_recognized_by_visual_llm_(.*?)§')

    for file in files:
        if not file.endswith('sample.json'):
            continue

        # Read as UTF-8 explicitly: the output below is written as UTF-8, and
        # the platform default encoding is not guaranteed to match.
        with open(file, 'r', encoding='utf-8') as f:
            sample = json.load(f)

        # Work on a copy so the original mapping stays available for the
        # length sanity check further down.
        link_dict = sample['link_dict'].copy()
        text = sample['text']

        # Maps image URL -> list of places it has to be substituted into:
        # link_dict keys, and/or the sentinel "content" for the main text.
        to_be_replaces = {}
        for url, des in link_dict.items():
            # dict.fromkeys de-duplicates while keeping first-seen order, so a
            # marker repeated in one description registers its target once.
            for img_url in dict.fromkeys(pattern.findall(des)):
                # Replace the marker in the original description with the bare URL.
                des = des.replace(f'§to_be_recognized_by_visual_llm_{img_url}§', img_url)
                to_be_replaces.setdefault(img_url, []).append(url)
            link_dict[url] = des

        for img_url in dict.fromkeys(pattern.findall(text)):
            # NOTE(review): markers in the text are expanded with an extra
            # leading 'h' (unlike those in link_dict) - presumably the producer
            # strips the first character of the URL there; confirm upstream.
            full_url = f'h{img_url}'
            text = text.replace(f'§to_be_recognized_by_visual_llm_{img_url}§', full_url)
            to_be_replaces.setdefault(full_url, []).append("content")

        start_time = time.time()
        print("开始提取图片信息")
        result = asyncio.run(main(list(to_be_replaces)))
        end_time = time.time()
        print(f"提取图片信息完成，耗时: {end_time - start_time}秒")

        # Substitute longest URLs first so a URL that is a prefix of another
        # (e.g. the same image with and without a query string) cannot
        # clobber part of the longer one before it has been replaced.
        for img_url in sorted(result, key=len, reverse=True):
            content = result[img_url]
            for url in to_be_replaces[img_url]:
                if url == "content":
                    text = text.replace(img_url, content)
                else:
                    link_dict[url] = link_dict[url].replace(img_url, content)

        if len(link_dict) != len(sample['link_dict']):
            print(f"提取图片信息后，link_dict长度发生变化，原长度: {len(sample['link_dict'])}, 新长度: {len(link_dict)}")

        sample['text'] = text
        sample['link_dict'] = link_dict
        # Only touch the file extension; str.replace would also rewrite any
        # '.json' that happens to appear earlier in the path.
        new_file = os.path.splitext(file)[0] + '_recognized.json'

        with open(new_file, 'w', encoding='utf-8') as f:
            json.dump(sample, f, indent=4, ensure_ascii=False)