import os
import json
import requests
from get_logger import get_logger
from datetime import datetime, timedelta, date
from scrapers import scraper_map
from scrapers.general_scraper import general_scraper
from pb_api import PbTalker
from urllib.parse import urlparse
from get_insight import get_insight
from general_utils import is_chinese
from tranlsation_volcengine import text_translate  # module filename is spelled this way upstream
import concurrent.futures
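
# PbTalker operations used below (signatures inferred from the call sites in this
# file, not from pb_api itself):
#   pb.read(collection_name, fields, filter)        -> list[dict]
#   pb.add(collection_name, body)                   -> record id, falsy on failure
#   pb.update(collection_name, id, body)            -> record id, falsy on failure
#   pb.upload(collection, id, key, file_name, file) -> record id, falsy on failure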


# Typically used on the first crawl, to avoid fetching too many old articles;
# database articles older than this many days are also no longer used to match insights.
expiration_date = datetime.now() - timedelta(days=90)
expiration_date = expiration_date.date()
expiration_str = expiration_date.strftime("%Y%m%d")
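# Worked example (illustrative): if today is 2024-06-01, expiration_str is
# "20240303", and the pb filter 'publish_time>20240303' keeps only articles
# published in the last 90 days (publish_time is stored as an int, YYYYMMDD).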
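
# Scraper contract assumed by ServiceProcesser.__call__ (inferred from how the
# callables are invoked there, not an interface defined in this file):
#
#     def some_site_scraper(expiration: date, existings: list[str]) -> list[dict]:
#         """Return article dicts carrying at least 'url', 'title', 'abstract',
#         'content', 'images' and an int-style 'publish_time' (YYYYMMDD),
#         skipping anything older than `expiration` or already in `existings`."""
#
# general_scraper takes the site url as an extra leading argument.
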
class ServiceProcesser:
    """Periodic worker: scrape configured sources, write new articles to pb,
    generate insights, and optionally snapshot article pages."""

    def __init__(self, name: str = 'Work_Processor', record_snapshot: bool = False):
        self.name = name
        self.project_dir = os.environ.get("PROJECT_DIR", "")
        # 1. base initialization
        self.cache_url = os.path.join(self.project_dir, name)
        os.makedirs(self.cache_url, exist_ok=True)
        self.logger = get_logger(name=self.name, file=os.path.join(self.project_dir, f'{self.name}.log'))
        self.pb = PbTalker(self.logger)

        # 2. load the llm
        # self.llm = LocalLlmWrapper() # if you use the local-llm

        if record_snapshot:
            snap_short_server = os.environ.get('SNAPSHOTS_SERVER', '')
            if not snap_short_server:
                raise Exception('SNAPSHOTS_SERVER is not set.')
            self.snap_short_server = f"http://{snap_short_server}"
        else:
            self.snap_short_server = None

        self.logger.info(f'{self.name} init success.')

    def __call__(self, expiration: date = expiration_date, sites: list[str] | None = None):
        # clear the cache first
        self.logger.info(f'wake, prepare to work, now is {datetime.now()}')
        cache = {}
        self.logger.debug(f'clear cache -- {cache}')
        # read the urls of all existing articles from the pb database;
        # publish_time is stored as an int (YYYYMMDD) -- crude, but all things
        # considered the easiest format to work with
        existing_articles = self.pb.read(collection_name='articles', fields=['id', 'title', 'url'], filter=f'publish_time>{expiration_str}')
        all_title = {}
        existings = []
        for article in existing_articles:
            all_title[article['title']] = article['id']
            existings.append(article['url'])

        # Build the list of sources to scan: default to every site in scraper_map;
        # any specified source that is not in scraper_map goes to the general scraper.
        sources = sites if sites else list(scraper_map.keys())
        new_articles = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            futures = []
            for site in sources:
                if site in scraper_map:
                    futures.append(executor.submit(scraper_map[site], expiration, existings))
                else:
                    futures.append(executor.submit(general_scraper, site, expiration, existings))
            concurrent.futures.wait(futures)
            for future in futures:
                try:
                    new_articles.extend(future.result())
                except Exception as e:
                    self.logger.error(f'error when scraping-- {e}')

        for value in new_articles:
            if not value:
                continue
            # derive a short site name for attribution, e.g. 'https://www.example.com/a' -> 'example'
            from_site = urlparse(value['url']).netloc
            from_site = from_site.replace('www.', '')
            from_site = from_site.split('.')[0]
            if value['abstract']:
                value['abstract'] = f"({from_site} 报道){value['abstract']}"  # "(<site> reports)"
            value['content'] = f"({from_site} 报道){value['content']}"
            value['images'] = json.dumps(value['images'])

            article_id = self.pb.add(collection_name='articles', body=value)

            if article_id:
                cache[article_id] = value
                all_title[value['title']] = article_id
            else:
                self.logger.warning(f'add article {value} failed, writing to cache_file')
                with open(os.path.join(self.cache_url, 'cache_articles.json'), 'a', encoding='utf-8') as f:
                    json.dump(value, f, ensure_ascii=False, indent=4)

        if not cache:
            self.logger.warning(f'no new articles. now is {datetime.now()}')
            return
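
        # get_insight is assumed to return a list of dicts shaped like
        # {'content': <insight text>, 'articles': [<article ids>]}; this is
        # inferred from how the results are handled below, not from get_insight itself.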

        # insight pipeline
        new_insights = get_insight(cache, all_title, logger=self.logger)
        if new_insights:
            for insight in new_insights:
                if not insight['content']:
                    continue
                insight_id = self.pb.add(collection_name='insights', body=insight)
                if not insight_id:
                    self.logger.warning(f'write insight {insight} to pb failed, writing to cache_file')
                    with open(os.path.join(self.cache_url, 'cache_insights.json'), 'a', encoding='utf-8') as f:
                        json.dump(insight, f, ensure_ascii=False, indent=4)
                for article_id in insight['articles']:
                    raw_article = self.pb.read(collection_name='articles', fields=['abstract', 'title', 'translation_result'], filter=f'id="{article_id}"')
                    if not raw_article or not raw_article[0]:
                        self.logger.warning(f'get article {article_id} failed, skipping')
                        continue
                    if raw_article[0]['translation_result']:
                        continue
                    if is_chinese(raw_article[0]['title']):
                        continue
                    translate_text = text_translate([raw_article[0]['title'], raw_article[0]['abstract']], target_language='zh', logger=self.logger)
                    if translate_text:
                        related_id = self.pb.add(collection_name='article_translation', body={'title': translate_text[0], 'abstract': translate_text[1], 'raw': article_id})
                        if not related_id:
                            self.logger.warning(f'write article_translation {article_id} failed')
                        else:
                            _ = self.pb.update(collection_name='articles', id=article_id, body={'translation_result': related_id})
                            if not _:
                                self.logger.warning(f'update article {article_id} failed')
                    else:
                        self.logger.warning(f'translate article {article_id} failed')
        else:
            # fallback: use the article titles themselves as insights
            if len(cache) < 25:
                self.logger.info('generate_insights-warning: no insights and fewer than 25 articles, so using article titles as insights')
                for key, value in cache.items():
                    if value['title']:
                        if is_chinese(value['title']):
                            # wrap in a list so text_for_insight[0] below yields the whole
                            # title rather than its first character
                            text_for_insight = [value['title']]
                        else:
                            text_for_insight = text_translate([value['title']], target_language='zh', logger=self.logger)
                        if text_for_insight:
                            insight_id = self.pb.add(collection_name='insights',
                                                     body={'content': text_for_insight[0], 'articles': [key]})
                            if not insight_id:
                                self.logger.warning(f'write insight {text_for_insight[0]} to pb failed, writing to cache_file')
                                with open(os.path.join(self.cache_url, 'cache_insights.json'), 'a',
                                          encoding='utf-8') as f:
                                    json.dump({'content': text_for_insight[0], 'articles': [key]}, f, ensure_ascii=False, indent=4)
            else:
                self.logger.warning('generate_insights-error: cannot generate insights, pls re-try')
        self.logger.info(f'work done, now is {datetime.now()}')

        if self.snap_short_server:
            self.logger.info(f'now starting article snapshot with {self.snap_short_server}')
            for key, value in cache.items():
                if value['url']:
                    try:
                        # the snapshot service is expected to return a local file path in the response body
                        snapshot = requests.get(f"{self.snap_short_server}/zip", {'url': value['url']}, timeout=60)
                        with open(snapshot.text, 'rb') as file:
                            _ = self.pb.upload('articles', key, 'snapshot', key, file)
                    except Exception as e:
                        self.logger.warning(f'error when snapshot {value["url"]}, {e}')
            self.logger.info(f'now snapshot done, now is {datetime.now()}')
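

if __name__ == '__main__':
    # Minimal usage sketch, not part of the original module: assumes PROJECT_DIR
    # (and optionally SNAPSHOTS_SERVER) are set and the pb backend is reachable.
    # Any site not registered in scraper_map is crawled with general_scraper.
    processor = ServiceProcesser()
    processor(sites=['https://example.com/news'])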