# wiseflow/dashboard/__init__.py
import os
import time
import json
import uuid
2024-04-30 12:05:44 +08:00
from get_report import get_report, logger, pb
2024-04-07 09:37:47 +08:00
from get_search import search_insight
from tranlsation_volcengine import text_translate
class BackendService:
    """Backend entry points used by the dashboard.

    Offers three operations: building a docx report for an insight
    (``report``), translating article titles/abstracts (``translate``) and
    attaching extra search results to an insight (``more_search``).

    Every public method returns the envelope produced by ``build_out``.
    Flags used in this class: ``11`` success, ``-2`` record missing or a
    database write failed, ``-6`` translation API failure, ``-11`` report
    generation failure; ``more_search`` also forwards the non-positive flag
    returned by ``search_insight``.

    NOTE(review): record ids are interpolated directly into the ``pb`` filter
    expressions; confirm that callers only pass trusted/validated ids.
    """

    # Number of texts (title + abstract per article) sent to the translation
    # API in one call, i.e. 8 articles per batch.
    _BATCH_TEXTS = 16

    def __init__(self):
        """Create the working directory and the per-insight report memory."""
        self.project_dir = os.environ.get("PROJECT_DIR", "")
        # Generated reports and the fallback article cache are written here.
        self.cache_url = os.path.join(self.project_dir, 'backend_service')
        os.makedirs(self.cache_url, exist_ok=True)
        # insight_id -> memory value returned by the previous get_report() call,
        # fed back into the next report request for the same insight.
        self.memory = {}
        logger.info('backend service init success.')

    def report(self, insight_id: str, topics: list[str], comment: str) -> dict:
        """Generate a docx report for an insight and upload it to its record.

        Args:
            insight_id: id of the record in the ``agents`` collection.
            topics: passed through to ``get_report``.
            comment: passed through to ``get_report``.

        Returns:
            ``build_out(11, <upload result>)`` on success, ``build_out(-2, ...)``
            when the insight/articles cannot be loaded or the upload fails,
            ``build_out(-11, ...)`` when ``get_report`` reports failure.
        """
        logger.debug(f'got new report request insight_id {insight_id}')
        insight = pb.read('agents', filter=f'id="{insight_id}"')
        if not insight:
            logger.error(f'insight {insight_id} not found')
            return self.build_out(-2, 'insight not found')

        article_ids = insight[0]['articles']
        if not article_ids:
            logger.error(f'insight {insight_id} has no articles')
            return self.build_out(-2, 'can not find articles for insight')

        fetched = [
            pb.read('articles', fields=['title', 'abstract', 'content', 'url', 'publish_time'], filter=f'id="{_id}"')
            for _id in article_ids
        ]
        # Drop ids whose lookup returned nothing.
        article_list = [rows[0] for rows in fetched if rows]
        if not article_list:
            logger.debug(f'{insight_id} has no valid articles')
            return self.build_out(-2, f'{insight_id} has no valid articles')

        content = insight[0]['content']
        memory = self.memory.get(insight_id, '')
        # A unique suffix keeps repeated requests for one insight from
        # overwriting each other's output file.
        docx_file = os.path.join(self.cache_url, f'{insight_id}_{uuid.uuid4()}.docx')

        flag, memory = get_report(content, article_list, memory, topics, comment, docx_file)
        # The memory is stored even when generation failed, as before.
        self.memory[insight_id] = memory
        if not flag:
            logger.error(f'{insight_id} failed to generate report, finish.')
            return self.build_out(-11, 'report generate failed.')

        # Context manager so the handle is released even if the upload raises.
        with open(docx_file, 'rb') as file:
            message = pb.upload('agents', insight_id, 'docx', f'{insight_id}.docx', file)
        if not message:
            logger.error(f'{insight_id} report generate successfully, however failed to update to pb.')
            return self.build_out(-2, 'report generate successfully, however failed to update to pb.')

        logger.debug(f'report success finish and update to: {message}')
        return self.build_out(11, message)

    def build_out(self, flag: int, answer: str = "") -> dict:
        """Wrap a status flag and a text answer in the common response envelope."""
        return {"flag": flag, "result": [{"type": "text", "answer": answer}]}

    def _translate_batch(self, key_cache: list[str], en_texts: list[str], batch_no: int) -> bool:
        """Translate one batch and persist the results.

        ``en_texts`` holds ``[title, abstract]`` pairs in the same order as the
        article ids in ``key_cache``. Returns ``False`` when the translation
        call yields nothing or a result of unexpected length; failures to
        write individual records are only logged.
        """
        logger.debug(f'translate process - batch {batch_no}')
        translate_result = text_translate(en_texts, logger=logger)
        if not translate_result or len(translate_result) != 2 * len(key_cache):
            logger.warning(f'translate process - api out of service, can not continue job, aborting batch {key_cache}')
            return False

        for idx, article_id in enumerate(key_cache):
            related_id = pb.add(
                collection_name='article_translation',
                body={'title': translate_result[2 * idx], 'abstract': translate_result[2 * idx + 1], 'raw': article_id},
            )
            if not related_id:
                logger.warning(f'write article_translation {article_id} failed')
                continue
            # Link the article to its freshly created translation record.
            updated = pb.update(collection_name='articles', id=article_id, body={'translation_result': related_id})
            if not updated:
                logger.warning(f'update article {article_id} failed')
        logger.debug('done')
        return True

    def translate(self, article_ids: list[str]) -> dict:
        """Translate the title and abstract of the given articles.

        Intended for Chinese users only. Articles that already have a
        ``translation_result`` are skipped; the rest are sent to the
        translation API in batches.

        Returns:
            ``build_out(flag, msg)`` where ``flag`` is ``11`` when nothing went
            wrong, ``-2`` after an article could not be read and ``-6`` after a
            batch failed (the most recent problem wins); ``msg`` accumulates
            the failure descriptions.
        """
        logger.debug(f'got new translate task {article_ids}')
        flag = 11
        msg = ''
        key_cache = []
        en_texts = []
        k = 1
        for article_id in article_ids:
            raw_article = pb.read(collection_name='articles', fields=['abstract', 'title', 'translation_result'], filter=f'id="{article_id}"')
            if not raw_article or not raw_article[0]:
                logger.warning(f'get article {article_id} failed, skipping')
                flag = -2
                msg += f'get article {article_id} failed, skipping\n'
                continue

            if raw_article[0]['translation_result']:
                logger.debug(f'{article_id} translation_result already exist, skipping')
                continue

            key_cache.append(article_id)
            en_texts.append(raw_article[0]['title'])
            en_texts.append(raw_article[0]['abstract'])

            if len(en_texts) < self._BATCH_TEXTS:
                continue

            if not self._translate_batch(key_cache, en_texts, k):
                flag = -6
                msg += f'failed to batch {key_cache}'

            en_texts = []
            key_cache = []

            # Pause for 1s every 10th batch count to avoid exceeding the API's QPS limit.
            k += 1
            if k % 10 == 0:
                logger.debug('max token limited - sleep 1s')
                time.sleep(1)

        # Flush the final, partially filled batch.
        if en_texts and not self._translate_batch(key_cache, en_texts, k):
            msg += f'failed to batch {key_cache}'
            flag = -6

        logger.debug('translation job done.')
        return self.build_out(flag, msg)

    def more_search(self, insight_id: str) -> dict:
        """Search for more articles on an insight and attach them to it.

        URLs of the articles already linked to the insight are handed to
        ``search_insight``. Each result is stored as a new article; results
        that cannot be stored are appended to ``cache_articles.json`` in the
        cache directory.

        Returns:
            ``build_out(11, insight_id)`` on success, ``build_out(-2, ...)`` when
            the insight is missing or cannot be updated, and
            ``build_out(flag, ...)`` with the non-positive flag returned by
            ``search_insight`` otherwise.
        """
        logger.debug(f'got search request for insight {insight_id}')
        insight = pb.read('agents', filter=f'id="{insight_id}"')
        if not insight:
            logger.error(f'insight {insight_id} not found')
            return self.build_out(-2, 'insight not found')

        # Normalise a missing/None relation to a list so new ids can be appended.
        article_ids = insight[0]['articles'] or []
        if article_ids:
            article_list = [pb.read('articles', fields=['url'], filter=f'id="{_id}"') for _id in article_ids]
            url_list = [_article[0]['url'] for _article in article_list if _article]
        else:
            url_list = []

        flag, search_result = search_insight(insight[0]['content'], logger, url_list)
        if flag <= 0:
            logger.debug('no search result, nothing happen')
            return self.build_out(flag, 'search engine error or no result')

        for item in search_result:
            new_article_id = pb.add(collection_name='articles', body=item)
            if new_article_id:
                article_ids.append(new_article_id)
                continue
            logger.warning(f'add article {item} failed, writing to cache_file')
            with open(os.path.join(self.cache_url, 'cache_articles.json'), 'a', encoding='utf-8') as f:
                json.dump(item, f, ensure_ascii=False, indent=4)
                # Separate appended objects so consecutive dumps do not fuse.
                f.write('\n')

        message = pb.update(collection_name='agents', id=insight_id, body={'articles': article_ids})
        if not message:
            logger.error(f'{insight_id} search success, however failed to update to pb.')
            return self.build_out(-2, 'search success, however failed to update to pb.')

        logger.debug(f'insight search success finish and update to: {message}')
        return self.build_out(11, insight_id)