from llms.openai_wrapper import openai_llm # from llms.siliconflow_wrapper import sfa_llm import re from utils.general_utils import get_logger_level from loguru import logger from utils.pb_api import PbTalker import os import locale get_info_model = os.environ.get("GET_INFO_MODEL", "gpt-3.5-turbo") rewrite_model = os.environ.get("REWRITE_MODEL", "gpt-3.5-turbo") project_dir = os.environ.get("PROJECT_DIR", "") if project_dir: os.makedirs(project_dir, exist_ok=True) logger_file = os.path.join(project_dir, 'insights.log') dsw_log = get_logger_level() logger.add( logger_file, level=dsw_log, backtrace=True, diagnose=True, rotation="50 MB" ) pb = PbTalker(logger) focus_data = pb.read(collection_name='tags', filter=f'activated=True') focus_list = [item["name"] for item in focus_data if item["name"]] focus_dict = {item["name"]: item["id"] for item in focus_data if item["name"]} sys_language, _ = locale.getdefaultlocale() if sys_language == 'zh_CN': system_prompt = f'''请仔细阅读用户输入的新闻内容,并根据所提供的类型列表进行分析。类型列表如下: {focus_list} 如果新闻中包含上述任何类型的信息,请使用以下格式标记信息的类型,并提供仅包含时间、地点、人物和事件的一句话信息摘要: 类型名称仅包含时间、地点、人物和事件的一句话信息摘要 如果新闻中包含多个信息,请逐一分析并按一条一行的格式输出,如果新闻不涉及任何类型的信息,则直接输出:无。 务必注意:1、严格忠于新闻原文,不得提供原文中不包含的信息;2、对于同一事件,仅选择一个最贴合的tag,不要重复输出;3、仅用一句话做信息摘要,且仅包含时间、地点、人物和事件;4、严格遵循给定的格式输出。''' rewrite_prompt = '''请综合给到的内容,提炼总结为一个新闻摘要。给到的内容会用XML标签分隔。请仅输出总结出的摘要,不要输出其他的信息。''' else: system_prompt = f'''Please carefully read the user-inputted news content and analyze it based on the provided list of categories: {focus_list} If the news contains any information related to the above categories, mark the type of information using the following format and provide a one-sentence summary containing only the time, location, who involved, and the event: Category Name One-sentence summary including only time, location, who, and event. If the news includes multiple pieces of information, analyze each one separately and output them in a line-by-line format. If the news does not involve any of the listed categories, simply output: N/A. Important guidelines to follow: 1) Adhere strictly to the original news content, do not provide information not contained in the original text; 2) For the same event, select only the most fitting tag, avoiding duplicate outputs; 3) Summarize using just one sentence, and limit it to time, location, who, and event only; 4) Strictly comply with the given output format.''' rewrite_prompt = "Please synthesize the content provided, which will be segmented by XML tags, into a news summary. Output only the summarized abstract without including any additional information." def get_info(article_content: str) -> list[dict]: # logger.debug(f'receive new article_content:\n{article_content}') result = openai_llm([{'role': 'system', 'content': system_prompt}, {'role': 'user', 'content': article_content}], model=get_info_model, logger=logger) # results = pattern.findall(result) texts = result.split('') texts = [_.strip() for _ in texts if '' in _.strip()] if not texts: logger.info(f'can not find info, llm result:\n{result}') return [] cache = [] for text in texts: try: strings = text.split('') tag = strings[0] tag = tag.strip() if tag not in focus_list: logger.info(f'tag not in focus_list: {tag}, aborting') continue info = ''.join(strings[1:]) info = info.strip() except Exception as e: logger.info(f'parse error: {e}') tag = '' info = '' if not info or not tag: logger.info(f'parse failed-{text}') continue if len(info) < 7: logger.info(f'info too short, possible invalid: {info}') continue if info.startswith('无相关信息') or info.startswith('该新闻未提及') or info.startswith('未提及'): logger.info(f'no relevant info: {text}') continue while info.endswith('"'): info = info[:-1] info = info.strip() # 拼接下来源信息 sources = re.findall(r'\[from (.*?)]', article_content) if sources and sources[0]: info = f"[from {sources[0]}] {info}" cache.append({'content': info, 'tag': focus_dict[tag]}) return cache def info_rewrite(contents: list[str]) -> str: context = f"{''.join(contents)}" try: result = openai_llm([{'role': 'system', 'content': rewrite_prompt}, {'role': 'user', 'content': context}], model=rewrite_model, temperature=0.1, logger=logger) return result.strip() except Exception as e: if logger: logger.warning(f'rewrite process llm generate failed: {e}') else: print(f'rewrite process llm generate failed: {e}') return ''