from llms.openai_wrapper import openai_llm
# from llms.siliconflow_wrapper import sfa_llm
import re
from utils.general_utils import get_logger_level
from loguru import logger
from utils.pb_api import PbTalker
import os
import locale
# Model names come from the environment; both fall back to "gpt-3.5-turbo".
get_info_model = os.getenv("GET_INFO_MODEL", "gpt-3.5-turbo")
rewrite_model = os.getenv("REWRITE_MODEL", "gpt-3.5-turbo")

# When PROJECT_DIR is unset the log path below is just the relative 'insights.log'.
project_dir = os.getenv("PROJECT_DIR", "")
if project_dir:
    os.makedirs(project_dir, exist_ok=True)

# Register a rotating file sink on the shared loguru logger.
logger_file = os.path.join(project_dir, 'insights.log')
dsw_log = get_logger_level()
logger.add(logger_file, level=dsw_log, backtrace=True, diagnose=True, rotation="50 MB")

pb = PbTalker(logger)

# Records read from the 'tags' collection with the filter 'activated=True'.
# Records with an empty name are skipped in both lookups below.
focus_data = pb.read(collection_name='tags', filter='activated=True')
focus_list = [record["name"] for record in focus_data if record["name"]]
focus_dict = {record["name"]: record["id"] for record in focus_data if record["name"]}
# NOTE(review): locale.getdefaultlocale() is deprecated since Python 3.11. It is kept here
# because its replacements can report a differently formatted locale name on some platforms.
sys_language, _ = locale.getdefaultlocale()

# The extraction prompt asks the model to answer with "<tag>category</tag>summary" entries.
# get_info() parses exactly these delimiters, so prompt and parser must be changed together.
if sys_language == 'zh_CN':
    system_prompt = f'''请仔细阅读用户输入的新闻内容,并根据所提供的类型列表进行分析。类型列表如下:
{focus_list}
如果新闻中包含上述任何类型的信息,请使用以下格式标记信息的类型,并提供仅包含时间、地点、人物和事件的一句话信息摘要:
<tag>类型名称</tag>仅包含时间、地点、人物和事件的一句话信息摘要
如果新闻中包含多个信息,请逐一分析并按一条一行的格式输出,如果新闻不涉及任何类型的信息,则直接输出:无。
务必注意:1、严格忠于新闻原文,不得提供原文中不包含的信息;2、对于同一事件,仅选择一个最贴合的tag,不要重复输出;3、仅用一句话做信息摘要,且仅包含时间、地点、人物和事件;4、严格遵循给定的格式输出。'''
    rewrite_prompt = '''请综合给到的内容,提炼总结为一个新闻摘要。给到的内容会用XML标签分隔。请仅输出总结出的摘要,不要输出其他的信息。'''
else:
    system_prompt = f'''Please carefully read the user-inputted news content and analyze it based on the provided list of categories:
{focus_list}
If the news contains any information related to the above categories, mark the type of information using the following format and provide a one-sentence summary containing only the time, location, who involved, and the event:
<tag>Category Name</tag>One-sentence summary including only time, location, who, and event.
If the news includes multiple pieces of information, analyze each one separately and output them in a line-by-line format. If the news does not involve any of the listed categories, simply output: N/A.
Important guidelines to follow: 1) Adhere strictly to the original news content, do not provide information not contained in the original text; 2) For the same event, select only the most fitting tag, avoiding duplicate outputs; 3) Summarize using just one sentence, and limit it to time, location, who, and event only; 4) Strictly comply with the given output format.'''
    rewrite_prompt = "Please synthesize the content provided, which will be segmented by XML tags, into a news summary. Output only the summarized abstract without including any additional information."

# Delimiters the extraction prompt asks the model to put around each category name.
_TAG_OPEN = '<tag>'
_TAG_CLOSE = '</tag>'
# Summaries starting with one of these phrases mean "nothing relevant" and are discarded.
_NO_INFO_PREFIXES = ('无相关信息', '该新闻未提及', '未提及')
# Matches a "[from xxx]" source marker inside the article text.
_SOURCE_PATTERN = re.compile(r'\[from (.*?)]')


def get_info(article_content: str) -> list[dict]:
    """Extract focus-tagged one-sentence summaries from an article via the LLM.

    The model is called with ``system_prompt`` and is expected to answer with
    one ``<tag>category</tag>summary`` entry per finding. Entries whose
    category is not in ``focus_list``, whose summary is empty, shorter than
    7 characters, or starts with a "no relevant info" phrase are dropped.

    Args:
        article_content: Article text sent to the model. If it contains a
            ``[from xxx]`` marker, the first such source is prefixed to every
            returned summary.

    Returns:
        A list of ``{'content': summary, 'tag': tag_id}`` dicts, where
        ``tag_id`` is looked up in ``focus_dict``. Empty when the model output
        contains no usable entry.
    """
    result = openai_llm(
        [{'role': 'system', 'content': system_prompt}, {'role': 'user', 'content': article_content}],
        model=get_info_model, logger=logger)

    # Every piece that follows an opening tag and still contains a closing tag is one entry.
    # A falsy result (empty string or None) is treated as "nothing found" instead of crashing.
    entries = [piece.strip() for piece in result.split(_TAG_OPEN) if _TAG_CLOSE in piece] if result else []
    if not entries:
        logger.info(f'can not find info, llm result:\n{result}')
        return []

    # The source marker depends only on the article, so look it up once for all entries.
    source_match = _SOURCE_PATTERN.search(article_content)
    source = source_match.group(1) if source_match else ''

    cache = []
    for text in entries:
        tag, _sep, remainder = text.partition(_TAG_CLOSE)
        tag = tag.strip()
        if tag not in focus_list:
            logger.info(f'tag not in focus_list: {tag}, aborting')
            continue

        # Drop any stray closing tags the model may have repeated inside the summary.
        info = remainder.replace(_TAG_CLOSE, '').strip()
        if not info:
            logger.info(f'parse failed-{text}')
            continue

        if len(info) < 7:
            logger.info(f'info too short, possible invalid: {info}')
            continue

        if info.startswith(_NO_INFO_PREFIXES):
            logger.info(f'no relevant info: {text}')
            continue

        # Remove trailing double quotes the model sometimes appends.
        info = info.rstrip('"').strip()

        # Prepend the article's source marker, if any.
        if source:
            info = f"[from {source}] {info}"

        cache.append({'content': info, 'tag': focus_dict[tag]})

    return cache
def info_rewrite(contents: list[str]) -> str:
    """Condense several pieces of content into one news summary via the LLM.

    Each item is wrapped in ``<content>...</content>`` before being sent, so
    the input matches ``rewrite_prompt``, which tells the model that the
    content is segmented by XML tags.

    Args:
        contents: Text fragments to be summarized together.

    Returns:
        The stripped model output, or ``''`` when ``contents`` is empty or the
        LLM call (or post-processing of its result) raises.
    """
    # Nothing to summarize: skip the LLM call and return the same value as on failure.
    if not contents:
        return ''

    context = ''.join(f'<content>{content}</content>' for content in contents)
    try:
        result = openai_llm([{'role': 'system', 'content': rewrite_prompt}, {'role': 'user', 'content': context}],
                            model=rewrite_model, temperature=0.1, logger=logger)
        return result.strip()
    except Exception as e:
        # Best-effort step: callers get an empty summary instead of an exception.
        logger.warning(f'rewrite process llm generate failed: {e}')
        return ''