mirror of https://github.com/TeamWiseFlow/wiseflow.git
synced 2025-02-03 02:54:37 +08:00
128 lines
5.5 KiB
Python
from llms.openai_wrapper import openai_llm
# from llms.siliconflow_wrapper import sfa_llm
import re
from utils.general_utils import get_logger_level
from loguru import logger
from utils.pb_api import PbTalker
import os
import locale


get_info_model = os.environ.get("GET_INFO_MODEL", "gpt-3.5-turbo")
rewrite_model = os.environ.get("REWRITE_MODEL", "gpt-3.5-turbo")
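# Typical configuration via environment variables (illustrative values only):
#   export GET_INFO_MODEL=gpt-4
#   export REWRITE_MODEL=gpt-3.5-turbo
#   export PROJECT_DIR=work_dir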
project_dir = os.environ.get("PROJECT_DIR", "")
if project_dir:
    os.makedirs(project_dir, exist_ok=True)
logger_file = os.path.join(project_dir, 'insights.log')
dsw_log = get_logger_level()
logger.add(
    logger_file,
    level=dsw_log,
    backtrace=True,
    diagnose=True,
    rotation="50 MB"
)

pb = PbTalker(logger)

# activated tag records supply both the display name and the PocketBase record id
focus_data = pb.read(collection_name='tags', filter='activated=True')
focus_list = [item["name"] for item in focus_data if item["name"]]
focus_dict = {item["name"]: item["id"] for item in focus_data if item["name"]}

# NOTE: locale.getdefaultlocale() has been deprecated since Python 3.11
sys_language, _ = locale.getdefaultlocale()

if sys_language == 'zh_CN':

    system_prompt = f'''请仔细阅读用户输入的新闻内容,并根据所提供的类型列表进行分析。类型列表如下:
{focus_list}

如果新闻中包含上述任何类型的信息,请使用以下格式标记信息的类型,并提供仅包含时间、地点、人物和事件的一句话信息摘要:
<tag>类型名称</tag>仅包含时间、地点、人物和事件的一句话信息摘要

如果新闻中包含多个信息,请逐一分析并按一条一行的格式输出,如果新闻不涉及任何类型的信息,则直接输出:无。
务必注意:1、严格忠于新闻原文,不得提供原文中不包含的信息;2、对于同一事件,仅选择一个最贴合的tag,不要重复输出;3、仅用一句话做信息摘要,且仅包含时间、地点、人物和事件;4、严格遵循给定的格式输出。'''

    rewrite_prompt = '''请综合给到的内容,提炼总结为一个新闻摘要。给到的内容会用XML标签分隔。请仅输出总结出的摘要,不要输出其他的信息。'''

else:

    system_prompt = f'''Please carefully read the user-provided news content and analyze it against the following list of categories:
{focus_list}

If the news contains information related to any of the above categories, mark the category using the format below and provide a one-sentence summary containing only the time, location, people involved, and the event:
<tag>Category Name</tag> One-sentence summary including only the time, location, people involved, and the event.

If the news includes multiple pieces of information, analyze each one separately and output them one per line. If the news does not involve any of the listed categories, simply output: N/A.
Important guidelines to follow: 1) Adhere strictly to the original news content and do not provide information not contained in the original text; 2) For the same event, select only the single most fitting tag and do not output duplicates; 3) Summarize in exactly one sentence, limited to the time, location, people involved, and the event; 4) Strictly comply with the given output format.'''

    rewrite_prompt = "Please synthesize the content provided, which will be segmented by XML tags, into a news summary. Output only the summarized abstract without including any additional information."
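# The model is expected to answer one line per extracted item, for example
# (illustrative tag and summary, not real output):
#   <tag>Social Trends</tag>On 1 June in Shanghai, the city government opened a new metro line.
# get_info() below recovers the (tag, summary) pairs by splitting on these markers.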
def get_info(article_content: str) -> list[dict]:
    """Extract tagged one-sentence summaries from the article via the LLM."""
    # logger.debug(f'receive new article_content:\n{article_content}')
    result = openai_llm([{'role': 'system', 'content': system_prompt}, {'role': 'user', 'content': article_content}],
                        model=get_info_model, logger=logger)

    # results = pattern.findall(result)
    texts = result.split('<tag>')
    texts = [_.strip() for _ in texts if '</tag>' in _.strip()]
    if not texts:
        logger.info(f'cannot find info, llm result:\n{result}')
        return []

    cache = []
    for text in texts:
        try:
            strings = text.split('</tag>')
            tag = strings[0]
            tag = tag.strip()
            if tag not in focus_list:
                logger.info(f'tag not in focus_list: {tag}, aborting')
                continue
            info = ''.join(strings[1:])
            info = info.strip()
        except Exception as e:
            logger.info(f'parse error: {e}')
            tag = ''
            info = ''

        if not info or not tag:
            logger.info(f'parse failed: {text}')
            continue

        if len(info) < 7:
            logger.info(f'info too short, possibly invalid: {info}')
            continue

        # skip boilerplate "no relevant information" replies (Chinese phrasings)
        if info.startswith('无相关信息') or info.startswith('该新闻未提及') or info.startswith('未提及'):
            logger.info(f'no relevant info: {text}')
            continue

        # strip trailing quotation marks the model sometimes appends
        while info.endswith('"'):
            info = info[:-1]
            info = info.strip()

        # prepend the source marker extracted from the article, e.g. "[from xxx]"
        sources = re.findall(r'\[from (.*?)]', article_content)
        if sources and sources[0]:
            info = f"[from {sources[0]}] {info}"

        cache.append({'content': info, 'tag': focus_dict[tag]})

    return cache


def info_rewrite(contents: list[str]) -> str:
    """Condense a list of related items into a single news summary via the LLM."""
    context = f"<content>{'</content><content>'.join(contents)}</content>"
    try:
        result = openai_llm([{'role': 'system', 'content': rewrite_prompt}, {'role': 'user', 'content': context}],
                            model=rewrite_model, temperature=0.1, logger=logger)
        return result.strip()
    except Exception as e:
        if logger:
            logger.warning(f'rewrite process llm generate failed: {e}')
        else:
            print(f'rewrite process llm generate failed: {e}')
    return ''
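# Minimal usage sketch (assumes a reachable PocketBase 'tags' collection with
# activated records and valid LLM credentials; the sample text and the
# "demo_wire" source marker below are illustrative, not from the repo):
if __name__ == '__main__':
    sample_article = "[from demo_wire] Example news text for a quick smoke test."
    items = get_info(sample_article)
    for item in items:
        print(item['tag'], item['content'])
    if items:
        print(info_rewrite([item['content'] for item in items]))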