mirror of
https://github.com/TeamWiseFlow/wiseflow.git
synced 2025-02-02 18:28:46 +08:00
bug fix
This commit is contained in:
parent
148e145365
commit
cb32415d28
@ -3,25 +3,9 @@ import time
|
|||||||
import json
|
import json
|
||||||
import uuid
|
import uuid
|
||||||
from pb_api import PbTalker
|
from pb_api import PbTalker
|
||||||
from get_report import get_report
|
from get_report import get_report, logger, pb
|
||||||
from get_search import search_insight
|
from get_search import search_insight
|
||||||
from tranlsation_volcengine import text_translate
|
from tranlsation_volcengine import text_translate
|
||||||
from general_utils import get_logger_level
|
|
||||||
from loguru import logger
|
|
||||||
|
|
||||||
project_dir = os.environ.get("PROJECT_DIR", "")
|
|
||||||
os.makedirs(project_dir, exist_ok=True)
|
|
||||||
logger_file = os.path.join(project_dir, 'backend_service.log')
|
|
||||||
dsw_log = get_logger_level()
|
|
||||||
|
|
||||||
logger.add(
|
|
||||||
logger_file,
|
|
||||||
level=dsw_log,
|
|
||||||
backtrace=True,
|
|
||||||
diagnose=True,
|
|
||||||
rotation="50 MB"
|
|
||||||
)
|
|
||||||
pb = PbTalker(logger)
|
|
||||||
|
|
||||||
|
|
||||||
class BackendService:
|
class BackendService:
|
||||||
@ -72,7 +56,7 @@ class BackendService:
|
|||||||
memory = ''
|
memory = ''
|
||||||
|
|
||||||
docx_file = os.path.join(self.cache_url, f'{insight_id}_{uuid.uuid4()}.docx')
|
docx_file = os.path.join(self.cache_url, f'{insight_id}_{uuid.uuid4()}.docx')
|
||||||
flag, memory = get_report(content, article_list, memory, topics, comment, docx_file, logger=logger)
|
flag, memory = get_report(content, article_list, memory, topics, comment, docx_file)
|
||||||
self.memory[insight_id] = memory
|
self.memory[insight_id] = memory
|
||||||
|
|
||||||
if flag:
|
if flag:
|
||||||
@ -80,7 +64,7 @@ class BackendService:
|
|||||||
message = pb.upload('insights', insight_id, 'docx', f'{insight_id}.docx', file)
|
message = pb.upload('insights', insight_id, 'docx', f'{insight_id}.docx', file)
|
||||||
file.close()
|
file.close()
|
||||||
if message:
|
if message:
|
||||||
logger.debug(f'report success finish and update to pb-{message}')
|
logger.debug(f'report success finish and update to: {message}')
|
||||||
return self.build_out(11, message)
|
return self.build_out(11, message)
|
||||||
else:
|
else:
|
||||||
logger.error(f'{insight_id} report generate successfully, however failed to update to pb.')
|
logger.error(f'{insight_id} report generate successfully, however failed to update to pb.')
|
||||||
@ -204,7 +188,7 @@ class BackendService:
|
|||||||
|
|
||||||
message = pb.update(collection_name='insights', id=insight_id, body={'articles': article_ids})
|
message = pb.update(collection_name='insights', id=insight_id, body={'articles': article_ids})
|
||||||
if message:
|
if message:
|
||||||
logger.debug(f'insight search success finish and update to pb-{message}')
|
logger.debug(f'insight search success finish and update to: {message}')
|
||||||
return self.build_out(11, insight_id)
|
return self.build_out(11, insight_id)
|
||||||
else:
|
else:
|
||||||
logger.error(f'{insight_id} search success, however failed to update to pb.')
|
logger.error(f'{insight_id} search success, however failed to update to pb.')
|
||||||
|
@ -3,27 +3,11 @@
|
|||||||
"""
|
"""
|
||||||
import schedule
|
import schedule
|
||||||
import time
|
import time
|
||||||
import os
|
from get_insight import pb, logger
|
||||||
from work_process import ServiceProcesser
|
from work_process import ServiceProcesser
|
||||||
from general_utils import get_logger_level
|
|
||||||
from loguru import logger
|
|
||||||
from pb_api import PbTalker
|
|
||||||
|
|
||||||
project_dir = os.environ.get("PROJECT_DIR", "")
|
|
||||||
os.makedirs(project_dir, exist_ok=True)
|
|
||||||
logger_file = os.path.join(project_dir, 'scanning_task.log')
|
|
||||||
dsw_log = get_logger_level()
|
|
||||||
|
|
||||||
logger.add(
|
sp = ServiceProcesser()
|
||||||
logger_file,
|
|
||||||
level=dsw_log,
|
|
||||||
backtrace=True,
|
|
||||||
diagnose=True,
|
|
||||||
rotation="50 MB"
|
|
||||||
)
|
|
||||||
pb = PbTalker(logger)
|
|
||||||
|
|
||||||
sp = ServiceProcesser(pb=pb, logger=logger)
|
|
||||||
counter = 0
|
counter = 0
|
||||||
|
|
||||||
|
|
||||||
@ -37,8 +21,8 @@ def task():
|
|||||||
continue
|
continue
|
||||||
if counter % site['per_hours'] == 0:
|
if counter % site['per_hours'] == 0:
|
||||||
urls.append(site['url'])
|
urls.append(site['url'])
|
||||||
print(f'\033[0;32m task execute loop {counter}\033[0m')
|
logger.info(f'\033[0;32m task execute loop {counter}\033[0m')
|
||||||
print(urls)
|
logger.info(urls)
|
||||||
if urls:
|
if urls:
|
||||||
sp(sites=urls)
|
sp(sites=urls)
|
||||||
else:
|
else:
|
||||||
|
@ -8,7 +8,24 @@ from general_utils import isChinesePunctuation, is_chinese
|
|||||||
from tranlsation_volcengine import text_translate
|
from tranlsation_volcengine import text_translate
|
||||||
import time
|
import time
|
||||||
import re
|
import re
|
||||||
from pb_api import pb
|
import os
|
||||||
|
from general_utils import get_logger_level
|
||||||
|
from loguru import logger
|
||||||
|
from pb_api import PbTalker
|
||||||
|
|
||||||
|
project_dir = os.environ.get("PROJECT_DIR", "")
|
||||||
|
os.makedirs(project_dir, exist_ok=True)
|
||||||
|
logger_file = os.path.join(project_dir, 'scanning_task.log')
|
||||||
|
dsw_log = get_logger_level()
|
||||||
|
|
||||||
|
logger.add(
|
||||||
|
logger_file,
|
||||||
|
level=dsw_log,
|
||||||
|
backtrace=True,
|
||||||
|
diagnose=True,
|
||||||
|
rotation="50 MB"
|
||||||
|
)
|
||||||
|
pb = PbTalker(logger)
|
||||||
|
|
||||||
|
|
||||||
max_tokens = 4000
|
max_tokens = 4000
|
||||||
@ -69,7 +86,7 @@ _rewrite_insight_prompt = f'''你是一名{character},你将被给到一个新
|
|||||||
不管新闻列表是何种语言,请仅用中文输出分析结果。'''
|
不管新闻列表是何种语言,请仅用中文输出分析结果。'''
|
||||||
|
|
||||||
|
|
||||||
def _parse_insight(article_text: str, cache: dict, logger=None) -> (bool, dict):
|
def _parse_insight(article_text: str, cache: dict) -> (bool, dict):
|
||||||
input_length = len(cache)
|
input_length = len(cache)
|
||||||
result = dashscope_llm([{'role': 'system', 'content': _first_stage_prompt}, {'role': 'user', 'content': article_text}],
|
result = dashscope_llm([{'role': 'system', 'content': _first_stage_prompt}, {'role': 'user', 'content': article_text}],
|
||||||
'qwen1.5-72b-chat', logger=logger)
|
'qwen1.5-72b-chat', logger=logger)
|
||||||
@ -116,7 +133,7 @@ def _parse_insight(article_text: str, cache: dict, logger=None) -> (bool, dict):
|
|||||||
return False, cache
|
return False, cache
|
||||||
|
|
||||||
|
|
||||||
def _rewrite_insight(context: str, logger=None) -> (bool, str):
|
def _rewrite_insight(context: str) -> (bool, str):
|
||||||
result = dashscope_llm([{'role': 'system', 'content': _rewrite_insight_prompt}, {'role': 'user', 'content': context}],
|
result = dashscope_llm([{'role': 'system', 'content': _rewrite_insight_prompt}, {'role': 'user', 'content': context}],
|
||||||
'qwen1.5-72b-chat', logger=logger)
|
'qwen1.5-72b-chat', logger=logger)
|
||||||
if result:
|
if result:
|
||||||
@ -142,7 +159,7 @@ def _rewrite_insight(context: str, logger=None) -> (bool, str):
|
|||||||
return False, text
|
return False, text
|
||||||
|
|
||||||
|
|
||||||
def get_insight(articles: dict, titles: dict, logger=None) -> list:
|
def get_insight(articles: dict, titles: dict) -> list:
|
||||||
context = ''
|
context = ''
|
||||||
cache = {}
|
cache = {}
|
||||||
for value in articles.values():
|
for value in articles.values():
|
||||||
@ -162,7 +179,7 @@ def get_insight(articles: dict, titles: dict, logger=None) -> list:
|
|||||||
if len(context) < max_tokens:
|
if len(context) < max_tokens:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
flag, cache = _parse_insight(context, cache, logger)
|
flag, cache = _parse_insight(context, cache)
|
||||||
if flag:
|
if flag:
|
||||||
logger.warning(f'following articles may not be completely analyzed: \n{context}')
|
logger.warning(f'following articles may not be completely analyzed: \n{context}')
|
||||||
|
|
||||||
@ -170,7 +187,7 @@ def get_insight(articles: dict, titles: dict, logger=None) -> list:
|
|||||||
# 据说频繁调用会引发性能下降,每次调用后休息1s。现在轮替调用qwen-72b和max,所以不必了。
|
# 据说频繁调用会引发性能下降,每次调用后休息1s。现在轮替调用qwen-72b和max,所以不必了。
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
if context:
|
if context:
|
||||||
flag, cache = _parse_insight(context, cache, logger)
|
flag, cache = _parse_insight(context, cache)
|
||||||
if flag:
|
if flag:
|
||||||
logger.warning(f'following articles may not be completely analyzed: \n{context}')
|
logger.warning(f'following articles may not be completely analyzed: \n{context}')
|
||||||
|
|
||||||
@ -230,7 +247,7 @@ def get_insight(articles: dict, titles: dict, logger=None) -> list:
|
|||||||
if not context:
|
if not context:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
flag, new_insight = _rewrite_insight(context, logger)
|
flag, new_insight = _rewrite_insight(context)
|
||||||
if flag:
|
if flag:
|
||||||
logger.warning(f'insight {key} may contain wrong')
|
logger.warning(f'insight {key} may contain wrong')
|
||||||
cache[key] = value
|
cache[key] = value
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
import random
|
import random
|
||||||
import re
|
import re
|
||||||
|
import os
|
||||||
from llms.dashscope_wrapper import dashscope_llm
|
from llms.dashscope_wrapper import dashscope_llm
|
||||||
from docx import Document
|
from docx import Document
|
||||||
from docx.oxml.ns import qn
|
from docx.oxml.ns import qn
|
||||||
@ -7,7 +8,23 @@ from docx.shared import Pt, RGBColor
|
|||||||
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
|
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from general_utils import isChinesePunctuation
|
from general_utils import isChinesePunctuation
|
||||||
from pb_api import pb
|
from general_utils import get_logger_level
|
||||||
|
from loguru import logger
|
||||||
|
from pb_api import PbTalker
|
||||||
|
|
||||||
|
project_dir = os.environ.get("PROJECT_DIR", "")
|
||||||
|
os.makedirs(project_dir, exist_ok=True)
|
||||||
|
logger_file = os.path.join(project_dir, 'backend_service.log')
|
||||||
|
dsw_log = get_logger_level()
|
||||||
|
|
||||||
|
logger.add(
|
||||||
|
logger_file,
|
||||||
|
level=dsw_log,
|
||||||
|
backtrace=True,
|
||||||
|
diagnose=True,
|
||||||
|
rotation="50 MB"
|
||||||
|
)
|
||||||
|
pb = PbTalker(logger)
|
||||||
|
|
||||||
# qwen-72b-chat支持最大30k输入,考虑prompt其他部分,content不应超过30000字符长度
|
# qwen-72b-chat支持最大30k输入,考虑prompt其他部分,content不应超过30000字符长度
|
||||||
# 如果换qwen-max(最大输入6k),这里就要换成6000,但这样很多文章不能分析了
|
# 如果换qwen-max(最大输入6k),这里就要换成6000,但这样很多文章不能分析了
|
||||||
@ -34,7 +51,7 @@ if not report_type:
|
|||||||
_ = pb.update(collection_name='roleplays', id=_role_config_id, body={'report_type': report_type})
|
_ = pb.update(collection_name='roleplays', id=_role_config_id, body={'report_type': report_type})
|
||||||
|
|
||||||
|
|
||||||
def get_report(insigt: str, articles: list[dict], memory: str, topics: list[str], comment: str, docx_file: str, logger=None) -> (bool, str):
|
def get_report(insigt: str, articles: list[dict], memory: str, topics: list[str], comment: str, docx_file: str) -> (bool, str):
|
||||||
zh_index = ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十', '十一', '十二']
|
zh_index = ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十', '十一', '十二']
|
||||||
|
|
||||||
if isChinesePunctuation(insigt[-1]):
|
if isChinesePunctuation(insigt[-1]):
|
||||||
|
@ -5,7 +5,7 @@ from datetime import datetime, timedelta, date
|
|||||||
from scrapers import scraper_map
|
from scrapers import scraper_map
|
||||||
from scrapers.general_scraper import general_scraper
|
from scrapers.general_scraper import general_scraper
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
from get_insight import get_insight
|
from get_insight import get_insight, pb, logger
|
||||||
from general_utils import is_chinese
|
from general_utils import is_chinese
|
||||||
from tranlsation_volcengine import text_translate
|
from tranlsation_volcengine import text_translate
|
||||||
import concurrent.futures
|
import concurrent.futures
|
||||||
@ -18,13 +18,11 @@ expiration_str = expiration_date.strftime("%Y%m%d")
|
|||||||
|
|
||||||
|
|
||||||
class ServiceProcesser:
|
class ServiceProcesser:
|
||||||
def __init__(self, pb, logger, record_snapshot: bool = False):
|
def __init__(self, record_snapshot: bool = False):
|
||||||
self.project_dir = os.environ.get("PROJECT_DIR", "")
|
self.project_dir = os.environ.get("PROJECT_DIR", "")
|
||||||
# 1. base initialization
|
# 1. base initialization
|
||||||
self.cache_url = os.path.join(self.project_dir, 'scanning_task')
|
self.cache_url = os.path.join(self.project_dir, 'scanning_task')
|
||||||
os.makedirs(self.cache_url, exist_ok=True)
|
os.makedirs(self.cache_url, exist_ok=True)
|
||||||
self.pb = pb
|
|
||||||
self.logger = logger
|
|
||||||
# 2. load the llm
|
# 2. load the llm
|
||||||
# self.llm = LocalLlmWrapper() # if you use the local-llm
|
# self.llm = LocalLlmWrapper() # if you use the local-llm
|
||||||
|
|
||||||
@ -36,16 +34,16 @@ class ServiceProcesser:
|
|||||||
else:
|
else:
|
||||||
self.snap_short_server = None
|
self.snap_short_server = None
|
||||||
|
|
||||||
self.logger.info('scanning task init success.')
|
logger.info('scanning task init success.')
|
||||||
|
|
||||||
def __call__(self, expiration: date = expiration_date, sites: list[str] = None):
|
def __call__(self, expiration: date = expiration_date, sites: list[str] = None):
|
||||||
# 先清空一下cache
|
# 先清空一下cache
|
||||||
self.logger.info(f'wake, prepare to work, now is {datetime.now()}')
|
logger.info(f'wake, prepare to work, now is {datetime.now()}')
|
||||||
cache = {}
|
cache = {}
|
||||||
self.logger.debug(f'clear cache -- {cache}')
|
logger.debug(f'clear cache -- {cache}')
|
||||||
# 从pb数据库中读取所有文章url
|
# 从pb数据库中读取所有文章url
|
||||||
# 这里publish_time用int格式,综合考虑下这个是最容易操作的模式,虽然糙了点
|
# 这里publish_time用int格式,综合考虑下这个是最容易操作的模式,虽然糙了点
|
||||||
existing_articles = self.pb.read(collection_name='articles', fields=['id', 'title', 'url'], filter=f'publish_time>{expiration_str}')
|
existing_articles = pb.read(collection_name='articles', fields=['id', 'title', 'url'], filter=f'publish_time>{expiration_str}')
|
||||||
all_title = {}
|
all_title = {}
|
||||||
existings = []
|
existings = []
|
||||||
for article in existing_articles:
|
for article in existing_articles:
|
||||||
@ -61,13 +59,13 @@ class ServiceProcesser:
|
|||||||
if site in scraper_map:
|
if site in scraper_map:
|
||||||
futures.append(executor.submit(scraper_map[site], expiration, existings))
|
futures.append(executor.submit(scraper_map[site], expiration, existings))
|
||||||
else:
|
else:
|
||||||
futures.append(executor.submit(general_scraper, site, expiration, existings, self.logger))
|
futures.append(executor.submit(general_scraper, site, expiration, existings, logger))
|
||||||
concurrent.futures.wait(futures)
|
concurrent.futures.wait(futures)
|
||||||
for future in futures:
|
for future in futures:
|
||||||
try:
|
try:
|
||||||
new_articles.extend(future.result())
|
new_articles.extend(future.result())
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.error(f'error when scraping-- {e}')
|
logger.error(f'error when scraping-- {e}')
|
||||||
|
|
||||||
for value in new_articles:
|
for value in new_articles:
|
||||||
if not value:
|
if not value:
|
||||||
@ -80,81 +78,81 @@ class ServiceProcesser:
|
|||||||
value['content'] = f"({from_site} 报道){value['content']}"
|
value['content'] = f"({from_site} 报道){value['content']}"
|
||||||
value['images'] = json.dumps(value['images'])
|
value['images'] = json.dumps(value['images'])
|
||||||
|
|
||||||
article_id = self.pb.add(collection_name='articles', body=value)
|
article_id = pb.add(collection_name='articles', body=value)
|
||||||
|
|
||||||
if article_id:
|
if article_id:
|
||||||
cache[article_id] = value
|
cache[article_id] = value
|
||||||
all_title[value['title']] = article_id
|
all_title[value['title']] = article_id
|
||||||
else:
|
else:
|
||||||
self.logger.warning(f'add article {value} failed, writing to cache_file')
|
logger.warning(f'add article {value} failed, writing to cache_file')
|
||||||
with open(os.path.join(self.cache_url, 'cache_articles.json'), 'a', encoding='utf-8') as f:
|
with open(os.path.join(self.cache_url, 'cache_articles.json'), 'a', encoding='utf-8') as f:
|
||||||
json.dump(value, f, ensure_ascii=False, indent=4)
|
json.dump(value, f, ensure_ascii=False, indent=4)
|
||||||
|
|
||||||
if not cache:
|
if not cache:
|
||||||
self.logger.warning(f'no new articles. now is {datetime.now()}')
|
logger.warning(f'no new articles. now is {datetime.now()}')
|
||||||
return
|
return
|
||||||
|
|
||||||
# insight 流程
|
# insight 流程
|
||||||
new_insights = get_insight(cache, all_title, logger=self.logger)
|
new_insights = get_insight(cache, all_title)
|
||||||
if new_insights:
|
if new_insights:
|
||||||
for insight in new_insights:
|
for insight in new_insights:
|
||||||
if not insight['content']:
|
if not insight['content']:
|
||||||
continue
|
continue
|
||||||
insight_id = self.pb.add(collection_name='insights', body=insight)
|
insight_id = pb.add(collection_name='insights', body=insight)
|
||||||
if not insight_id:
|
if not insight_id:
|
||||||
self.logger.warning(f'write insight {insight} to pb failed, writing to cache_file')
|
logger.warning(f'write insight {insight} to pb failed, writing to cache_file')
|
||||||
with open(os.path.join(self.cache_url, 'cache_insights.json'), 'a', encoding='utf-8') as f:
|
with open(os.path.join(self.cache_url, 'cache_insights.json'), 'a', encoding='utf-8') as f:
|
||||||
json.dump(insight, f, ensure_ascii=False, indent=4)
|
json.dump(insight, f, ensure_ascii=False, indent=4)
|
||||||
for article_id in insight['articles']:
|
for article_id in insight['articles']:
|
||||||
raw_article = self.pb.read(collection_name='articles', fields=['abstract', 'title', 'translation_result'], filter=f'id="{article_id}"')
|
raw_article = pb.read(collection_name='articles', fields=['abstract', 'title', 'translation_result'], filter=f'id="{article_id}"')
|
||||||
if not raw_article or not raw_article[0]:
|
if not raw_article or not raw_article[0]:
|
||||||
self.logger.warning(f'get article {article_id} failed, skipping')
|
logger.warning(f'get article {article_id} failed, skipping')
|
||||||
continue
|
continue
|
||||||
if raw_article[0]['translation_result']:
|
if raw_article[0]['translation_result']:
|
||||||
continue
|
continue
|
||||||
if is_chinese(raw_article[0]['title']):
|
if is_chinese(raw_article[0]['title']):
|
||||||
continue
|
continue
|
||||||
translate_text = text_translate([raw_article[0]['title'], raw_article[0]['abstract']], target_language='zh', logger=self.logger)
|
translate_text = text_translate([raw_article[0]['title'], raw_article[0]['abstract']], target_language='zh', logger=logger)
|
||||||
if translate_text:
|
if translate_text:
|
||||||
related_id = self.pb.add(collection_name='article_translation', body={'title': translate_text[0], 'abstract': translate_text[1], 'raw': article_id})
|
related_id = pb.add(collection_name='article_translation', body={'title': translate_text[0], 'abstract': translate_text[1], 'raw': article_id})
|
||||||
if not related_id:
|
if not related_id:
|
||||||
self.logger.warning(f'write article_translation {article_id} failed')
|
logger.warning(f'write article_translation {article_id} failed')
|
||||||
else:
|
else:
|
||||||
_ = self.pb.update(collection_name='articles', id=article_id, body={'translation_result': related_id})
|
_ = pb.update(collection_name='articles', id=article_id, body={'translation_result': related_id})
|
||||||
if not _:
|
if not _:
|
||||||
self.logger.warning(f'update article {article_id} failed')
|
logger.warning(f'update article {article_id} failed')
|
||||||
else:
|
else:
|
||||||
self.logger.warning(f'translate article {article_id} failed')
|
logger.warning(f'translate article {article_id} failed')
|
||||||
else:
|
else:
|
||||||
# 尝试把所有文章的title作为insigts,这是备选方案
|
# 尝试把所有文章的title作为insigts,这是备选方案
|
||||||
if len(cache) < 25:
|
if len(cache) < 25:
|
||||||
self.logger.info('generate_insights-warning: no insights and no more than 25 articles so use article title as insights')
|
logger.info('generate_insights-warning: no insights and no more than 25 articles so use article title as insights')
|
||||||
for key, value in cache.items():
|
for key, value in cache.items():
|
||||||
if value['title']:
|
if value['title']:
|
||||||
if is_chinese(value['title']):
|
if is_chinese(value['title']):
|
||||||
text_for_insight = value['title']
|
text_for_insight = value['title']
|
||||||
else:
|
else:
|
||||||
text_for_insight = text_translate([value['title']], logger=self.logger)
|
text_for_insight = text_translate([value['title']], logger=logger)
|
||||||
if text_for_insight:
|
if text_for_insight:
|
||||||
insight_id = self.pb.add(collection_name='insights', body={'content': text_for_insight[0], 'articles': [key]})
|
insight_id = pb.add(collection_name='insights', body={'content': text_for_insight[0], 'articles': [key]})
|
||||||
if not insight_id:
|
if not insight_id:
|
||||||
self.logger.warning(f'write insight {text_for_insight[0]} to pb failed, writing to cache_file')
|
logger.warning(f'write insight {text_for_insight[0]} to pb failed, writing to cache_file')
|
||||||
with open(os.path.join(self.cache_url, 'cache_insights.json'), 'a',
|
with open(os.path.join(self.cache_url, 'cache_insights.json'), 'a',
|
||||||
encoding='utf-8') as f:
|
encoding='utf-8') as f:
|
||||||
json.dump({'content': text_for_insight[0], 'articles': [key]}, f, ensure_ascii=False, indent=4)
|
json.dump({'content': text_for_insight[0], 'articles': [key]}, f, ensure_ascii=False, indent=4)
|
||||||
else:
|
else:
|
||||||
self.logger.warning('generate_insights-error: can not generate insights, pls re-try')
|
logger.warning('generate_insights-error: can not generate insights, pls re-try')
|
||||||
self.logger.info(f'work done, now is {datetime.now()}')
|
logger.info(f'work done, now is {datetime.now()}')
|
||||||
|
|
||||||
if self.snap_short_server:
|
if self.snap_short_server:
|
||||||
self.logger.info(f'now starting article snapshot with {self.snap_short_server}')
|
logger.info(f'now starting article snapshot with {self.snap_short_server}')
|
||||||
for key, value in cache.items():
|
for key, value in cache.items():
|
||||||
if value['url']:
|
if value['url']:
|
||||||
try:
|
try:
|
||||||
snapshot = requests.get(f"{self.snap_short_server}/zip", {'url': value['url']}, timeout=60)
|
snapshot = requests.get(f"{self.snap_short_server}/zip", {'url': value['url']}, timeout=60)
|
||||||
file = open(snapshot.text, 'rb')
|
file = open(snapshot.text, 'rb')
|
||||||
_ = self.pb.upload('articles', key, 'snapshot', key, file)
|
_ = pb.upload('articles', key, 'snapshot', key, file)
|
||||||
file.close()
|
file.close()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.warning(f'error when snapshot {value["url"]}, {e}')
|
logger.warning(f'error when snapshot {value["url"]}, {e}')
|
||||||
self.logger.info(f'now snapshot done, now is {datetime.now()}')
|
logger.info(f'now snapshot done, now is {datetime.now()}')
|
||||||
|
Loading…
Reference in New Issue
Block a user