wiseflow/core/insights/get_info.py
2024-06-15 15:41:31 +08:00

128 lines
5.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from llms.openai_wrapper import openai_llm
# from llms.siliconflow_wrapper import sfa_llm
import re
from utils.general_utils import get_logger_level
from loguru import logger
from utils.pb_api import PbTalker
import os
import locale
# Model names are configurable through the environment; both fall back to
# "gpt-3.5-turbo" when the variable is unset.
get_info_model = os.getenv("GET_INFO_MODEL", "gpt-3.5-turbo")
rewrite_model = os.getenv("REWRITE_MODEL", "gpt-3.5-turbo")

# Optional directory for the log file; it is created only when configured.
project_dir = os.getenv("PROJECT_DIR", "")
if project_dir:
    os.makedirs(project_dir, exist_ok=True)

# With an empty project_dir the join yields the bare relative name
# 'insights.log'.
logger_file = os.path.join(project_dir, 'insights.log')
dsw_log = get_logger_level()
logger.add(logger_file, level=dsw_log, rotation="50 MB", backtrace=True, diagnose=True)

# Load the tag records once at import time.
# NOTE(review): presumably these are the currently activated focus tags in
# PocketBase — confirm against PbTalker.read.
pb = PbTalker(logger)
focus_data = pb.read(collection_name='tags', filter='activated=True')

# Records with an empty name are ignored; focus_dict maps tag name -> tag id.
focus_list = [tag["name"] for tag in focus_data if tag["name"]]
focus_dict = {tag["name"]: tag["id"] for tag in focus_data if tag["name"]}
# Pick the prompt language from the OS default locale: Chinese prompts when the
# locale is exactly 'zh_CN', English prompts for everything else.
# NOTE(review): locale.getdefaultlocale() is deprecated since Python 3.11 —
# confirm the target Python version before relying on it long term.
sys_language, _ = locale.getdefaultlocale()
if sys_language == 'zh_CN':
    # focus_list is interpolated when this module is imported, so the prompt
    # text is fixed from that point on.
    system_prompt = f'''请仔细阅读用户输入的新闻内容,并根据所提供的类型列表进行分析。类型列表如下:
{focus_list}
如果新闻中包含上述任何类型的信息,请使用以下格式标记信息的类型,并提供仅包含时间、地点、人物和事件的一句话信息摘要:
<tag>类型名称</tag>仅包含时间、地点、人物和事件的一句话信息摘要
如果新闻中包含多个信息,请逐一分析并按一条一行的格式输出,如果新闻不涉及任何类型的信息,则直接输出:无。
务必注意1、严格忠于新闻原文不得提供原文中不包含的信息2、对于同一事件仅选择一个最贴合的tag不要重复输出3、仅用一句话做信息摘要且仅包含时间、地点、人物和事件4、严格遵循给定的格式输出。'''
    # System prompt used by info_rewrite to merge several snippets into one summary.
    rewrite_prompt = '''请综合给到的内容提炼总结为一个新闻摘要。给到的内容会用XML标签分隔。请仅输出总结出的摘要不要输出其他的信息。'''
else:
    # English counterpart of the extraction prompt; same <tag>...</tag> output
    # format, with "N/A" as the no-information answer.
    system_prompt = f'''Please carefully read the user-inputted news content and analyze it based on the provided list of categories:
{focus_list}
If the news contains any information related to the above categories, mark the type of information using the following format and provide a one-sentence summary containing only the time, location, who involved, and the event:
<tag>Category Name</tag> One-sentence summary including only time, location, who, and event.
If the news includes multiple pieces of information, analyze each one separately and output them in a line-by-line format. If the news does not involve any of the listed categories, simply output: N/A.
Important guidelines to follow: 1) Adhere strictly to the original news content, do not provide information not contained in the original text; 2) For the same event, select only the most fitting tag, avoiding duplicate outputs; 3) Summarize using just one sentence, and limit it to time, location, who, and event only; 4) Strictly comply with the given output format.'''
    # English counterpart of the rewrite prompt used by info_rewrite.
    rewrite_prompt = "Please synthesize the content provided, which will be segmented by XML tags, into a news summary. Output only the summarized abstract without including any additional information."
def get_info(article_content: str) -> list[dict]:
    """Extract tagged one-sentence insights from an article via the LLM.

    The article is sent to ``openai_llm`` together with the module-level
    ``system_prompt``. The reply is expected to contain entries shaped like
    ``<tag>name</tag>summary``. Every entry whose tag name is in
    ``focus_list`` and whose summary passes the sanity checks below becomes
    one result item.

    Args:
        article_content: Article text. When it contains a ``[from xxx]``
            marker, the first such marker is prefixed to every summary.

    Returns:
        A list of ``{'content': summary, 'tag': focus_dict[tag_name]}`` dicts.
        The list is empty when the reply is empty or holds no usable entry.
    """
    result = openai_llm([{'role': 'system', 'content': system_prompt}, {'role': 'user', 'content': article_content}],
                        model=get_info_model, logger=logger)

    # An empty or None reply is treated like a reply without any entry instead
    # of failing on ``.split``.
    texts = [seg.strip() for seg in result.split('<tag>') if '</tag>' in seg] if result else []
    if not texts:
        logger.info(f'can not find info, llm result:\n{result}')
        return []

    # The source marker depends only on the article, so resolve it once rather
    # than once per parsed entry.
    source = re.search(r'\[from (.*?)]', article_content)
    source_prefix = f"[from {source.group(1)}] " if source and source.group(1) else ''

    cache = []
    for text in texts:
        # Everything before the first closing tag is the tag name; the rest is
        # the summary (any further closing tags are dropped).
        strings = text.split('</tag>')
        tag = strings[0].strip()
        if tag not in focus_list:
            logger.info(f'tag not in focus_list: {tag}, aborting')
            continue

        info = ''.join(strings[1:]).strip()
        if not info or not tag:
            logger.info(f'parse failed-{text}')
            continue

        if len(info) < 7:
            logger.info(f'info too short, possible invalid: {info}')
            continue

        # Chinese "no relevant information" style answers are not insights.
        if info.startswith(('无相关信息', '该新闻未提及', '未提及')):
            logger.info(f'no relevant info: {text}')
            continue

        # Drop trailing double quotes the model sometimes leaves behind.
        info = info.rstrip('"').strip()

        # Prepend the source information, if the article carried any.
        cache.append({'content': f"{source_prefix}{info}", 'tag': focus_dict[tag]})

    return cache
def info_rewrite(contents: list[str]) -> str:
    """Merge several text snippets into one news summary using the LLM.

    Each snippet is wrapped in ``<content>...</content>`` and the concatenated
    payload is sent to ``openai_llm`` with the module-level ``rewrite_prompt``.

    Args:
        contents: The text snippets to summarise together.

    Returns:
        The stripped LLM reply, or an empty string when the call (or the
        handling of its result) raises.
    """
    wrapped = "<content>" + "</content><content>".join(contents) + "</content>"

    try:
        summary = openai_llm(
            [
                {'role': 'system', 'content': rewrite_prompt},
                {'role': 'user', 'content': wrapped},
            ],
            model=rewrite_model,
            temperature=0.1,
            logger=logger,
        )
        return summary.strip()
    except Exception as e:
        # Report through the logger when one is available, otherwise on stdout.
        report = logger.warning if logger else print
        report(f'rewrite process llm generate failed: {e}')
        return ''