scrapers updated

This commit is contained in:
bigbrother666 2024-06-15 15:41:31 +08:00
parent b1dad1533f
commit 31411cd8f4
22 changed files with 563 additions and 691 deletions

45
core/backend.py Normal file
View File

@ -0,0 +1,45 @@
from fastapi import FastAPI, BackgroundTasks
from pydantic import BaseModel
from typing import Literal, Optional
from fastapi.middleware.cors import CORSMiddleware
from insights import pipeline
class Request(BaseModel):
"""
Input model
input = {'user_id': str, 'type': str, 'content': str, 'addition': Optional[str]}
Type is one of "text", "publicMsg", "site" and "url"
"""
user_id: str
type: Literal["text", "publicMsg", "file", "image", "video", "location", "chathistory", "site", "attachment", "url"]
content: str
addition: Optional[str] = None
app = FastAPI(
title="WiseFlow Union Backend",
description="From Wiseflow Team.",
version="0.1.1",
openapi_url="/openapi.json"
)
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
@app.get("/")
def read_root():
msg = "Hello, this is Wise Union Backend, version 0.1.1"
return {"msg": msg}
@app.post("/feed")
async def call_to_feed(background_tasks: BackgroundTasks, request: Request):
background_tasks.add_task(pipeline, _input=request.model_dump())
return {"msg": "received well"}

View File

@ -1,47 +0,0 @@
import asyncio
import websockets
import concurrent.futures
import json
from insights import pipeline
async def get_public_msg():
uri = "ws://127.0.0.1:8066/ws/publicMsg"
reconnect_attempts = 0
max_reconnect_attempts = 3  # adjust the maximum number of reconnect attempts as needed
while True:
try:
async with websockets.connect(uri, max_size=10 * 1024 * 1024) as websocket:
loop = asyncio.get_running_loop()
with concurrent.futures.ThreadPoolExecutor() as pool:
while True:
response = await websocket.recv()
datas = json.loads(response)
for data in datas["data"]:
if data["IsSender"] != "0":
print('self-send message, pass')
print(data)
continue
input_data = {
"user_id": data["StrTalker"],
"type": "publicMsg",
"content": data["Content"],
"addition": data["MsgSvrID"]
}
await loop.run_in_executor(pool, pipeline, input_data)
except websockets.exceptions.ConnectionClosedError as e:
print(f"Connection closed with exception: {e}")
reconnect_attempts += 1
if reconnect_attempts <= max_reconnect_attempts:
print(f"Reconnecting attempt {reconnect_attempts}...")
await asyncio.sleep(5)  # wait a while before retrying
else:
print("Max reconnect attempts reached. Exiting.")
break
except Exception as e:
print(f"An unexpected error occurred: {e}")
break
# run the get_public_msg coroutine with the asyncio event loop
asyncio.run(get_public_msg())

7
core/docker_entrypoint.sh Executable file
View File

@ -0,0 +1,7 @@
#!/bin/bash
set -o allexport
source ../.env
set +o allexport
uvicorn backend:app --reload --host localhost --port 8077
#exec uvicorn backend:app --reload --host localhost --port 8077 &
#exec python background_task.py

View File

@ -1,12 +1,13 @@
from ..scrapers import *
from ..utils.general_utils import extract_urls, compare_phrase_with_list
# -*- coding: utf-8 -*-
from scrapers import *
from utils.general_utils import extract_urls, compare_phrase_with_list
from .get_info import get_info, pb, project_dir, logger, info_rewrite
import os
import json
from datetime import datetime, timedelta
from urllib.parse import urlparse
import re
import time
# The XML parsing scheme is not used because there are abnormal characters in the XML code extracted from the weixin public_msg
@ -18,11 +19,49 @@ expiration_days = 3
existing_urls = [url['url'] for url in pb.read(collection_name='articles', fields=['url']) if url['url']]
def pipeline(_input: dict):
async def get_articles(urls: list[str], expiration: datetime, cache: dict = {}) -> list[dict]:
articles = []
for url in urls:
logger.debug(f"fetching {url}")
if url.startswith('https://mp.weixin.qq.com') or url.startswith('http://mp.weixin.qq.com'):
flag, result = await mp_crawler(url, logger)
else:
flag, result = await simple_crawler(url, logger)
if flag == -7:
# -7 means cannot fetch the html, and other crawlers have no effect.
continue
if flag != 11:
flag, result = await llm_crawler(url, logger)
if flag != 11:
continue
expiration_date = expiration.strftime('%Y-%m-%d')
article_date = int(result['publish_time'])
if article_date < int(expiration_date.replace('-', '')):
logger.info(f"publish date is {article_date}, too old, skip")
continue
if url in cache:
for k, v in cache[url].items():
if v:
result[k] = v
articles.append(result)
return articles
async def pipeline(_input: dict):
cache = {}
source = _input['user_id'].split('@')[-1]
logger.debug(f"received new task, user: {source}, MsgSvrID: {_input['addition']}")
global existing_urls
expiration_date = datetime.now() - timedelta(days=expiration_days)
if _input['type'] == 'publicMsg':
items = item_pattern.findall(_input["content"])
# Iterate through all < item > content, extracting < url > and < summary >
@ -37,73 +76,57 @@ def pipeline(_input: dict):
cut_off_point = url.find('chksm=')
if cut_off_point != -1:
url = url[:cut_off_point-1]
if url in existing_urls:
logger.debug(f"{url} has been crawled, skip")
continue
if url in cache:
logger.debug(f"{url} already find in item")
continue
summary_match = summary_pattern.search(item)
summary = summary_match.group(1) if summary_match else None
cache[url] = summary
urls = list(cache.keys())
cache[url] = {'source': source, 'abstract': summary}
articles = await get_articles(list(cache.keys()), expiration_date, cache)
elif _input['type'] == 'site':
# for the site url, usually an article list page or a website homepage
# need to get the article list page
# You can use a general scraper, or you can customize a site-specific crawler, see scrapers/README_CN.md
urls = extract_urls(_input['content'])
if not urls:
logger.debug(f"can not find any url in\n{_input['content']}")
return
articles = []
for url in urls:
parsed_url = urlparse(url)
domain = parsed_url.netloc
if domain in scraper_map:
result = scraper_map[domain](url, logger)
else:
result = await general_scraper(url, expiration_date.date(), existing_urls, logger)
articles.extend(result)
elif _input['type'] == 'text':
urls = extract_urls(_input['content'])
if not urls:
logger.debug(f"can not find any url in\n{_input['content']}\npass...")
return
articles = await get_articles(urls, expiration_date)
elif _input['type'] == 'url':
urls = []
pass
# this is remained for wechat shared mp_article_card
# todo will do it in project awada (need finish the generalMsg api first)
articles = []
else:
return
global existing_urls
for url in urls:
if url in existing_urls:
logger.debug(f"{url} has been crawled, skip")
for article in articles:
if article['url'] in existing_urls:
# For the case of entering multiple sites at the same time,
# there is indeed a situation where duplicate articles are mixed into the same batch
logger.debug(f"{article['url']} duplicated, skip")
continue
logger.debug(f"fetching {url}")
if url.startswith('https://mp.weixin.qq.com') or url.startswith('http://mp.weixin.qq.com'):
flag, article = mp_crawler(url, logger)
if flag == -7:
# For the mp crawler, -7 most likely means the request was rate-limited by WeChat; just wait 1 min and retry.
logger.info(f"fetch {url} failed, try to wait 1min and try again")
time.sleep(60)
flag, article = mp_crawler(url, logger)
else:
parsed_url = urlparse(url)
domain = parsed_url.netloc
if domain in scraper_map:
flag, article = scraper_map[domain](url, logger)
else:
flag, article = simple_crawler(url, logger)
if flag == -7:
# -7 means the page cannot be fetched, and the other crawlers will not help either.
logger.info(f"cannot fetch {url}")
continue
if flag != 11:
logger.info(f"{url} failed with mp_crawler and simple_crawler")
flag, article = llm_crawler(url, logger)
if flag != 11:
logger.info(f"{url} failed with llm_crawler")
continue
expiration_date = datetime.now() - timedelta(days=expiration_days)
expiration_date = expiration_date.strftime('%Y-%m-%d')
article_date = int(article['publish_time'])
if article_date < int(expiration_date.replace('-', '')):
logger.info(f"publish date is {article_date}, too old, skip")
continue
article['source'] = source
if cache[url]:
article['abstract'] = cache[url]
insights = get_info(f"title: {article['title']}\n\ncontent: {article['content']}")
try:
article_id = pb.add(collection_name='articles', body=article)
except Exception as e:
@ -112,7 +135,7 @@ def pipeline(_input: dict):
json.dump(article, f, ensure_ascii=False, indent=4)
continue
existing_urls.append(url)
existing_urls.append(article['url'])
if not insights:
continue
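For orientation, a hedged sketch of driving this async pipeline directly with the same _input dict that the backend builds from Request.model_dump() (the user_id and URL below are placeholders):

```python
# sketch only: run the pipeline on a 'text' task containing a URL (placeholder values)
import asyncio
from insights import pipeline

task = {
    "user_id": "someone@example.com",   # the part after '@' is logged as the source
    "type": "text",                      # 'text' content is scanned for URLs via extract_urls
    "content": "please check https://www.example.com/news/article-1",
    "addition": None,
}
asyncio.run(pipeline(_input=task))
```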

View File

@ -1,9 +1,9 @@
from ..llms.openai_wrapper import openai_llm
# from ..llms.siliconflow_wrapper import sfa_llm
from llms.openai_wrapper import openai_llm
# from llms.siliconflow_wrapper import sfa_llm
import re
from ..utils.general_utils import get_logger_level
from utils.general_utils import get_logger_level
from loguru import logger
from ..utils.pb_api import PbTalker
from utils.pb_api import PbTalker
import os
import locale

View File

@ -1,13 +1,11 @@
# for developers
If you are just a user, you don't need to care about this folder.
Python developers should use the backend/pb_api.py module for database operations.
JS developers can simply start the database and check the API details page of each collection for the interface documentation.
download https://pocketbase.io/docs/
```bash
cd pb
./pocketbase --dev admin create test@example.com 123467890 #如果没有初始账号,请用这个命令创建
xattr -d com.apple.quarantine pocketbase # for Macos
./pocketbase migrate up # for first run
./pocketbase --dev admin create test@example.com 123467890 # If you don't have an initial account, please use this command to create it
./pocketbase serve
```
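For Python developers, a minimal, hedged sketch of using the PbTalker wrapper (the import path follows the new utils/pb_api.py module; assumes PB_API_AUTH, and optionally PB_API_BASE, are set as in env_sample, and that an `articles` collection exists):

```python
# sketch only: read and write the 'articles' collection through PbTalker
from loguru import logger
from utils.pb_api import PbTalker

pb = PbTalker(logger)
existing_urls = [item['url'] for item in pb.read(collection_name='articles', fields=['url'])]
new_id = pb.add(collection_name='articles', body={'url': 'https://www.example.com/a', 'title': 'demo'})
print(new_id, len(existing_urls))
```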

View File

@ -1,135 +0,0 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const collection = new Collection({
"id": "4rpge043645sp4j",
"created": "2024-04-17 02:46:25.373Z",
"updated": "2024-04-17 02:46:25.373Z",
"name": "roleplays",
"type": "base",
"system": false,
"schema": [
{
"system": false,
"id": "ixk4pwsb",
"name": "activated",
"type": "bool",
"required": false,
"presentable": false,
"unique": false,
"options": {}
},
{
"system": false,
"id": "tmak73c7",
"name": "character",
"type": "text",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
},
{
"system": false,
"id": "6iuxuwhb",
"name": "focus",
"type": "text",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
},
{
"system": false,
"id": "axmc2huy",
"name": "focus_type",
"type": "text",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
},
{
"system": false,
"id": "gop61pjt",
"name": "good_sample1",
"type": "text",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
},
{
"system": false,
"id": "qmy5cofa",
"name": "good_sample2",
"type": "text",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
},
{
"system": false,
"id": "h8gafaci",
"name": "bad_sample",
"type": "text",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
},
{
"system": false,
"id": "m2ug5sfd",
"name": "report_type",
"type": "text",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
}
],
"indexes": [],
"listRule": null,
"viewRule": null,
"createRule": null,
"updateRule": null,
"deleteRule": null,
"options": {}
});
return Dao(db).saveCollection(collection);
}, (db) => {
const dao = new Dao(db);
const collection = dao.findCollectionByNameOrId("4rpge043645sp4j");
return dao.deleteCollection(collection);
})

View File

@ -5,4 +5,6 @@ gne
jieba
httpx
chardet
websockets
pocketbase
pydantic
uvicorn

View File

@ -1,33 +1,33 @@
**这个文件夹下可以放置对应特定信源的爬虫注意这里的爬虫应该是可以解析信源文章列表url并返回文章详情dict的**
# 专有爬虫配置
写好爬虫后将爬虫程序放在这个文件夹并在__init__.py下的scraper_map中注册爬虫类似
```python
{'www.securityaffairs.com': securityaffairs_scraper}
```
其中key就是信源地址value是函数名
爬虫应该写为函数形式,出入参约定为:
输入:
- expiration datetime的date.date()对象,爬虫应该只抓取这之后(含这一天)的文章
- existings[str], 数据库已有文章的url列表爬虫应该忽略这个列表里面的url
输出:
- [dict]返回结果列表每个dict代表一个文章格式如下
`[{'url': str, 'title': str, 'author': str, 'publish_time': str, 'content': str, 'abstract': str, 'images': [Path]}, {...}, ...]`
注意publish_time格式为`"%Y%m%d"` 如果爬虫抓不到可以用当天日期
另外title和content是必须要有的
# 通用页面解析器
我们这里提供了一个通用页面解析器该解析器可以智能获取信源文章列表接下来对于每一个文章url会先尝试使用 gne 进行解析如果失败的话再尝试使用llm进行解析。
通过这个方案,可以实现对大多数普通新闻类、门户类信源的扫描和信息提取。
**然而我们依然强烈建议用户自行写专有爬虫或者直接订阅我们的数据服务,以实现更加理想且更加高效的扫描。**
> **This folder is intended for placing crawlers specific to particular sources. Note that the crawlers here should be able to parse the article list URL of the source and return a dictionary of article details.**
>
> # Custom Crawler Configuration
>
> After writing the crawler, place the crawler program in this folder and register it in the scraper_map in `__init__.py`, similar to:
>
> ```python
> {'www.securityaffairs.com': securityaffairs_scraper}
> ```
>
> Here, the key is the source URL, and the value is the function name.
>
> The crawler should be written in the form of a function with the following input and output specifications:
>
> Input:
> - expiration: A `datetime.date` object, the crawler should only fetch articles on or after this date.
> - existings: [str], a list of URLs of articles already in the database. The crawler should ignore the URLs in this list.
>
> Output:
> - [dict], a list of result dictionaries, each representing an article, formatted as follows:
> `[{'url': str, 'title': str, 'author': str, 'publish_time': str, 'content': str, 'abstract': str, 'images': [Path]}, {...}, ...]`
>
> Note: The format of `publish_time` should be `"%Y%m%d"`. If the crawler cannot fetch it, the current date can be used.
>
> Additionally, `title` and `content` are mandatory fields.
>
> # Generic Page Parser
>
> We provide a generic page parser here, which can intelligently fetch article lists from the source. For each article URL, it will first attempt to parse using gne. If it fails, it will then attempt to parse using llm.
>
> Through this solution, it is possible to scan and extract information from most general news and portal sources.
>
> **However, we still strongly recommend that users write custom crawlers themselves or directly subscribe to our data service for more ideal and efficient scanning.**
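> As a hedged illustration of this contract, a minimal source-specific scraper might look like the sketch below (the domain follows the example key above; the parsing logic is a placeholder, not a tested implementation):
>
> ```python
> # sketch only: a site-specific scraper following the documented input/output contract
> from datetime import datetime, date
> import httpx
> from bs4 import BeautifulSoup
>
> def securityaffairs_scraper(expiration: date, existings: list[str]) -> list[dict]:
>     resp = httpx.get('https://www.securityaffairs.com/', timeout=30)
>     soup = BeautifulSoup(resp.text, 'html.parser')
>     articles = []
>     for link in soup.find_all('a', href=True):
>         url = link['href']
>         if not url.startswith('https://www.securityaffairs.com/') or url in existings:
>             continue
>         publish_time = datetime.now().strftime('%Y%m%d')   # placeholder: parse the real date if available
>         if datetime.strptime(publish_time, '%Y%m%d').date() < expiration:
>             continue
>         articles.append({'url': url, 'title': link.get_text(strip=True), 'author': '',
>                          'publish_time': publish_time, 'content': link.get_text(strip=True),
>                          'abstract': '', 'images': []})
>     return articles
>
> # then register it in scrapers/__init__.py:
> # scraper_map = {'www.securityaffairs.com': securityaffairs_scraper}
> ```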

View File

@ -0,0 +1,33 @@
**这个文件夹下可以放置对应特定信源的爬虫注意这里的爬虫应该是可以解析信源文章列表url并返回文章详情dict的**
# 专有爬虫配置
写好爬虫后将爬虫程序放在这个文件夹并在__init__.py下的scraper_map中注册爬虫类似
```python
{'www.securityaffairs.com': securityaffairs_scraper}
```
其中key就是信源地址value是函数名
爬虫应该写为函数形式,出入参约定为:
输入:
- expiration datetime的date.date()对象,爬虫应该只抓取这之后(含这一天)的文章
- existings[str], 数据库已有文章的url列表爬虫应该忽略这个列表里面的url
输出:
- [dict]返回结果列表每个dict代表一个文章格式如下
`[{'url': str, 'title': str, 'author': str, 'publish_time': str, 'content': str, 'abstract': str, 'images': [Path]}, {...}, ...]`
注意publish_time格式为`"%Y%m%d"` 如果爬虫抓不到可以用当天日期
另外title和content是必须要有的
# 通用页面解析器
我们这里提供了一个通用页面解析器该解析器可以智能获取信源文章列表接下来对于每一个文章url会先尝试使用 gne 进行解析如果失败的话再尝试使用llm进行解析。
通过这个方案,可以实现对大多数普通新闻类、门户类信源的扫描和信息提取。
**然而我们依然强烈建议用户自行写专有爬虫或者直接订阅我们的数据服务,以实现更加理想且更加高效的扫描。**

View File

@ -0,0 +1,34 @@
> **In diesem Ordner können Crawlers für spezifische Quellen abgelegt werden. Beachten Sie, dass die Crawlers hier in der Lage sein sollten, die URL der Artikelliste der Quelle zu analysieren und ein Wörterbuch mit Artikeldetails zurückzugeben.**
>
> # Konfiguration des benutzerdefinierten Crawlers
>
> Nachdem Sie den Crawler geschrieben haben, platzieren Sie das Crawler-Programm in diesem Ordner und registrieren Sie es in scraper_map in `__init__.py`, ähnlich wie:
>
> ```python
> {'www.securityaffairs.com': securityaffairs_scraper}
> ```
>
> Hier ist der Schlüssel die URL der Quelle und der Wert der Funktionsname.
>
> Der Crawler sollte in Form einer Funktion geschrieben werden, mit den folgenden Eingabe- und Ausgabeparametern:
>
> Eingabe:
> - expiration: Ein `datetime.date` Objekt, der Crawler sollte nur Artikel ab diesem Datum (einschließlich) abrufen.
> - existings: [str], eine Liste von URLs von Artikeln, die bereits in der Datenbank vorhanden sind. Der Crawler sollte die URLs in dieser Liste ignorieren.
>
> Ausgabe:
> - [dict], eine Liste von Ergebnis-Wörterbüchern, wobei jedes Wörterbuch einen Artikel darstellt, formatiert wie folgt:
> `[{'url': str, 'title': str, 'author': str, 'publish_time': str, 'content': str, 'abstract': str, 'images': [Path]}, {...}, ...]`
>
> Hinweis: Das Format von `publish_time` sollte `"%Y%m%d"` sein. Wenn der Crawler es nicht abrufen kann, kann das aktuelle Datum verwendet werden.
>
> Darüber hinaus sind `title` und `content` Pflichtfelder.
>
> # Generischer Seitenparser
>
> Wir bieten hier einen generischen Seitenparser an, der intelligent Artikellisten von der Quelle abrufen kann. Für jede Artikel-URL wird zunächst versucht, mit gne zu parsen. Scheitert dies, wird versucht, mit llm zu parsen.
>
> Durch diese Lösung ist es möglich, die meisten allgemeinen Nachrichtenquellen und Portale zu scannen und Informationen zu extrahieren.
>
> **Wir empfehlen jedoch dringend, dass Benutzer eigene benutzerdefinierte Crawlers schreiben oder direkt unseren Datenservice abonnieren, um eine idealere und effizientere Erfassung zu erreichen.**

View File

@ -0,0 +1,34 @@
> **Ce dossier est destiné à accueillir des crawlers spécifiques à des sources particulières. Notez que les crawlers ici doivent être capables de parser l'URL de la liste des articles de la source et de retourner un dictionnaire de détails des articles.**
>
> # Configuration du Crawler Personnalisé
>
> Après avoir écrit le crawler, placez le programme du crawler dans ce dossier et enregistrez-le dans scraper_map dans `__init__.py`, comme suit :
>
> ```python
> {'www.securityaffairs.com': securityaffairs_scraper}
> ```
>
> Ici, la clé est l'URL de la source, et la valeur est le nom de la fonction.
>
> Le crawler doit être écrit sous forme de fonction avec les spécifications suivantes pour les entrées et sorties :
>
> Entrée :
> - expiration : Un objet `datetime.date`, le crawler ne doit récupérer que les articles à partir de cette date (incluse).
> - existings : [str], une liste d'URLs d'articles déjà présents dans la base de données. Le crawler doit ignorer les URLs de cette liste.
>
> Sortie :
> - [dict], une liste de dictionnaires de résultats, chaque dictionnaire représentant un article, formaté comme suit :
> `[{'url': str, 'title': str, 'author': str, 'publish_time': str, 'content': str, 'abstract': str, 'images': [Path]}, {...}, ...]`
>
> Remarque : Le format de `publish_time` doit être `"%Y%m%d"`. Si le crawler ne peut pas le récupérer, la date du jour peut être utilisée.
>
> De plus, `title` et `content` sont des champs obligatoires.
>
> # Analyseur de Page Générique
>
> Nous fournissons ici un analyseur de page générique, qui peut récupérer intelligemment les listes d'articles de la source. Pour chaque URL d'article, il tentera d'abord de parser avec gne. En cas d'échec, il tentera de parser avec llm.
>
> Grâce à cette solution, il est possible de scanner et d'extraire des informations à partir de la plupart des sources de type actualités générales et portails.
>
> **Cependant, nous recommandons vivement aux utilisateurs de rédiger eux-mêmes des crawlers personnalisés ou de s'abonner directement à notre service de données pour un scan plus idéal et plus efficace.**

View File

@ -0,0 +1,33 @@
**このフォルダには特定のソースに対応したクローラーを配置できます。ここでのクローラーはソースの記事リストURLを解析し、記事の詳細情報を辞書形式で返す必要があります。**
>
> # カスタムクローラーの設定
>
> クローラーを作成した後、そのプログラムをこのフォルダに配置し、`__init__.py` の scraper_map に次のように登録します:
>
> ```python
> {'www.securityaffairs.com': securityaffairs_scraper}
> ```
>
> ここで、キーはソースのURLで、値は関数名です。
>
> クローラーは関数形式で記述し、以下の入力および出力仕様を満たす必要があります:
>
> 入力:
> - expiration `datetime.date` オブジェクト、クローラーはこの日付以降(この日を含む)の記事のみを取得する必要があります。
> - existings[str]、データベースに既存する記事のURLリスト、クローラーはこのリスト内のURLを無視する必要があります。
>
> 出力:
> - [dict]、結果の辞書リスト、各辞書は以下の形式で1つの記事を表します
> `[{'url': str, 'title': str, 'author': str, 'publish_time': str, 'content': str, 'abstract': str, 'images': [Path]}, {...}, ...]`
>
> 注意:`publish_time`の形式は`"%Y%m%d"`である必要があります。クローラーで取得できない場合は、当日の日付を使用できます。
>
> さらに、`title`と`content`は必須フィールドです。
>
> # 一般ページパーサー
>
> ここでは一般的なページパーサーを提供しており、ソースから記事リストをインテリジェントに取得できます。各記事URLに対して、最初に gne を使用して解析を試みます。失敗した場合は、llm を使用して解析を試みます。
>
> このソリューションにより、ほとんどの一般的なニュースおよびポータルソースのスキャンと情報抽出が可能になります。
>
> **しかし、より理想的かつ効率的なスキャンを実現するために、ユーザー自身でカスタムクローラーを作成するか、直接弊社のデータサービスを購読することを強くお勧めします。**

View File

@ -1,6 +1,6 @@
from .mp_crawler import mp_crawler
from .simple_crawler import simple_crawler
from .general_scraper import llm_crawler
from .general_scraper import general_scraper, llm_crawler
scraper_map = {}

View File

@ -1,16 +1,20 @@
# -*- coding: utf-8 -*-
import os
from urllib.parse import urlparse
import re
from .simple_crawler import simple_crawler
from .mp_crawler import mp_crawler
import httpx
from bs4 import BeautifulSoup
from bs4.element import Comment
from ..llms.openai_wrapper import openai_llm
# from ..llms.siliconflow_wrapper import sfa_llm
from llms.openai_wrapper import openai_llm
# from llms.siliconflow_wrapper import sfa_llm
from datetime import datetime, date
from requests.compat import urljoin
import chardet
from ..utils.general_utils import extract_and_convert_dates
from utils.general_utils import extract_and_convert_dates
import asyncio
model = os.environ.get('HTML_PARSE_MODEL', 'gpt-3.5-turbo')
@ -63,7 +67,6 @@ def parse_html_content(out: str) -> dict:
return dct
sys_info = '''As an HTML parser, you'll receive a block of HTML code. Your task is to extract its title, summary, content, and publication date, with the date formatted as YYYY-MM-DD. Return the results in the following format (enclosed within triple quotes):
"""
Title||Summary||Content||Release Date YYYY-MM-DD
@ -71,30 +74,38 @@ Title||Summary||Content||Release Date YYYY-MM-DD
'''
def llm_crawler(url: str, logger) -> (int, dict):
try:
with httpx.Client() as client:
response = client.get(url, headers=header, timeout=30)
rawdata = response.content
encoding = chardet.detect(rawdata)['encoding']
text = rawdata.decode(encoding)
except Exception as e:
logger.error(e)
return -7, {}
async def llm_crawler(url: str, logger) -> (int, dict):
async with httpx.AsyncClient() as client:
for retry in range(2):
try:
response = await client.get(url, headers=header, timeout=30)
response.raise_for_status()
break
except Exception as e:
if retry < 1:
logger.info(f"request {url} got error {e}\nwaiting 1min")
await asyncio.sleep(60)
else:
logger.warning(f"request {url} got error {e}")
return -7, {}
soup = BeautifulSoup(text, "html.parser")
html_text = text_from_soup(soup)
html_lines = html_text.split('\n')
html_lines = [line.strip() for line in html_lines if line.strip()]
html_text = "\n".join(html_lines)
if len(html_text) > 29999:
logger.warning(f"{url} content too long for llm parsing")
return 0, {}
rawdata = response.content
encoding = chardet.detect(rawdata)['encoding']
text = rawdata.decode(encoding, errors='replace')
soup = BeautifulSoup(text, "html.parser")
html_text = text_from_soup(soup)
html_lines = html_text.split('\n')
html_lines = [line.strip() for line in html_lines if line.strip()]
html_text = "\n".join(html_lines)
if len(html_text) > 29999:
logger.warning(f"{url} content too long for llm parsing")
return 0, {}
if not html_text or html_text.startswith('服务器错误') or html_text.startswith('您访问的页面') or html_text.startswith('403')\
or html_text.startswith('出错了'):
logger.warning(f"can not get {url} from the Internet")
return -7, {}
if not html_text or html_text.startswith('服务器错误') or html_text.startswith(
'您访问的页面') or html_text.startswith('403') \
or html_text.startswith('出错了'):
logger.warning(f"can not get {url} from the Internet")
return -7, {}
messages = [
{"role": "system", "content": sys_info},
@ -103,7 +114,7 @@ def llm_crawler(url: str, logger) -> (int, dict):
llm_output = openai_llm(messages, model=model, logger=logger)
try:
info = parse_html_content(llm_output)
except Exception:
except:
msg = f"can not parse {llm_output}"
logger.debug(msg)
return 0, {}
@ -146,31 +157,49 @@ def llm_crawler(url: str, logger) -> (int, dict):
return 11, info
def general_scraper(site: str, expiration: date, existing: list[str], logger) -> list[dict]:
try:
with httpx.Client() as client:
response = client.get(site, headers=header, timeout=30)
except Exception as e:
logger.error(e)
return []
async def general_scraper(site: str, expiration: date, existing: list[str], logger) -> list[dict]:
async with httpx.AsyncClient() as client:
for retry in range(2):
try:
response = await client.get(site, headers=header, timeout=30)
response.raise_for_status()
break
except Exception as e:
if retry < 1:
logger.info(f"request {site} got error {e}\nwaiting 1min")
await asyncio.sleep(60)
else:
logger.warning(f"request {site} got error {e}")
return []
page_source = response.text
soup = BeautifulSoup(page_source, "html.parser")
# Parse all URLs
parsed_url = urlparse(site)
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
urls = [urljoin(base_url, link["href"]) for link in soup.find_all("a", href=True)]
page_source = response.text
soup = BeautifulSoup(page_source, "html.parser")
# Parse all URLs
parsed_url = urlparse(site)
base_url = parsed_url.scheme + '://' + parsed_url.netloc
urls = [urljoin(base_url, link["href"]) for link in soup.find_all("a", href=True)]
if not urls:
# maybe it's an article site
logger.warning(f"can not find any link from {site}, maybe it's an article site...")
if site in existing:
logger.debug(f"{site} has been crawled before, skip it")
return []
flag, result = simple_crawler(site, logger)
if site.startswith('https://mp.weixin.qq.com') or site.startswith('http://mp.weixin.qq.com'):
flag, result = await mp_crawler(site, logger)
else:
flag, result = await simple_crawler(site, logger)
if flag == -7:
# -7 means cannot fetch the html, and other crawlers have no effect.
return []
if flag != 11:
flag, result = llm_crawler(site, logger)
flag, result = await llm_crawler(site, logger)
if flag != 11:
return []
publish_date = datetime.strptime(result['publish_time'], '%Y%m%d')
if publish_date.date() < expiration:
logger.debug(f"{site} is too old, skip it")
@ -183,12 +212,23 @@ def general_scraper(site: str, expiration: date, existing: list[str], logger) ->
if url in existing:
logger.debug(f"{url} has been crawled before, skip it")
continue
existing.append(url)
flag, result = simple_crawler(url, logger)
if url.startswith('https://mp.weixin.qq.com') or url.startswith('http://mp.weixin.qq.com'):
flag, result = await mp_crawler(url, logger)
else:
flag, result = await simple_crawler(url, logger)
if flag == -7:
# -7 means cannot fetch the html, and other crawlers have no effect.
continue
if flag != 11:
flag, result = llm_crawler(url, logger)
flag, result = await llm_crawler(url, logger)
if flag != 11:
continue
publish_date = datetime.strptime(result['publish_time'], '%Y%m%d')
if publish_date.date() < expiration:
logger.debug(f"{url} is too old, skip it")

View File

@ -1,102 +1,112 @@
# -*- coding: utf-8 -*-
import httpx
from bs4 import BeautifulSoup
from datetime import datetime
import re
import asyncio
header = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}
def mp_crawler(url: str, logger) -> (int, dict):
async def mp_crawler(url: str, logger) -> (int, dict):
if not url.startswith('https://mp.weixin.qq.com') and not url.startswith('http://mp.weixin.qq.com'):
logger.warning(f'{url} is not a mp url, you should not use this function')
return -5, {}
url = url.replace("http://", "https://", 1)
try:
with httpx.Client() as client:
response = client.get(url, headers=header, timeout=30)
except Exception as e:
logger.warning(f"cannot get content from {url}\n{e}")
return -7, {}
async with httpx.AsyncClient() as client:
for retry in range(2):
try:
response = await client.get(url, headers=header, timeout=30)
response.raise_for_status()
break
except Exception as e:
if retry < 1:
logger.info(f"request {url} got error {e}\nwaiting 1min")
await asyncio.sleep(60)
else:
logger.warning(f"request {url} got error {e}")
return -7, {}
soup = BeautifulSoup(response.text, 'html.parser')
soup = BeautifulSoup(response.text, 'html.parser')
# Get the original release date first
pattern = r"var createTime = '(\d{4}-\d{2}-\d{2}) \d{2}:\d{2}'"
match = re.search(pattern, response.text)
# Get the original release date first
pattern = r"var createTime = '(\d{4}-\d{2}-\d{2}) \d{2}:\d{2}'"
match = re.search(pattern, response.text)
if match:
date_only = match.group(1)
publish_time = date_only.replace('-', '')
else:
publish_time = datetime.strftime(datetime.today(), "%Y%m%d")
if match:
date_only = match.group(1)
publish_time = date_only.replace('-', '')
else:
publish_time = datetime.strftime(datetime.today(), "%Y%m%d")
# Get description content from < meta > tag
try:
meta_description = soup.find('meta', attrs={'name': 'description'})
summary = meta_description['content'].strip() if meta_description else ''
card_info = soup.find('div', id='img-content')
# Parse the required content from the < div > tag
rich_media_title = soup.find('h1', id='activity-name').text.strip() \
if soup.find('h1', id='activity-name') \
else soup.find('h1', class_='rich_media_title').text.strip()
profile_nickname = card_info.find('strong', class_='profile_nickname').text.strip() \
if card_info \
else soup.find('div', class_='wx_follow_nickname').text.strip()
except Exception as e:
logger.warning(f"not mp format: {url}\n{e}")
return -7, {}
# Get description content from < meta > tag
try:
meta_description = soup.find('meta', attrs={'name': 'description'})
summary = meta_description['content'].strip() if meta_description else ''
card_info = soup.find('div', id='img-content')
# Parse the required content from the < div > tag
rich_media_title = soup.find('h1', id='activity-name').text.strip() \
if soup.find('h1', id='activity-name') \
else soup.find('h1', class_='rich_media_title').text.strip()
profile_nickname = card_info.find('strong', class_='profile_nickname').text.strip() \
if card_info \
else soup.find('div', class_='wx_follow_nickname').text.strip()
except Exception as e:
logger.warning(f"not mp format: {url}\n{e}")
# For mp.weixin.qq.com types, mp_crawler won't work, and most likely neither will the other two
return -7, {}
if not rich_media_title or not profile_nickname:
logger.warning(f"failed to analysis {url}, no title or profile_nickname")
# For mp.weixin.qq.com types, mp_crawler won't work, and most likely neither will the other two
return -7, {}
if not rich_media_title or not profile_nickname:
logger.warning(f"failed to analysis {url}, no title or profile_nickname")
return -7, {}
# Parse text and image links within the content interval
# TODO: this scheme is compatible with picture-sharing MP articles, but their in-content images cannot be extracted,
# because that part of the page has a completely different structure and needs a separate parsing scheme
# (the proportion of such articles is not high, though).
texts = []
images = set()
content_area = soup.find('div', id='js_content')
if content_area:
# extract text
for section in content_area.find_all(['section', 'p'], recursive=False):  # iterate over top-level sections
text = section.get_text(separator=' ', strip=True)
if text and text not in texts:
texts.append(text)
# Parse text and image links within the content interval
# TODO: this scheme is compatible with picture-sharing MP articles, but their in-content images cannot be extracted,
# because that part of the page has a completely different structure and needs a separate parsing scheme
# (the proportion of such articles is not high, though).
texts = []
images = set()
content_area = soup.find('div', id='js_content')
if content_area:
# extract text
for section in content_area.find_all(['section', 'p'], recursive=False):  # iterate over top-level sections
text = section.get_text(separator=' ', strip=True)
if text and text not in texts:
texts.append(text)
for img in content_area.find_all('img', class_='rich_pages wxw-img'):
img_src = img.get('data-src') or img.get('src')
if img_src:
images.add(img_src)
cleaned_texts = [t for t in texts if t.strip()]
content = '\n'.join(cleaned_texts)
else:
logger.warning(f"failed to analysis contents {url}")
return 0, {}
if content:
content = f"[from {profile_nickname}]{content}"
else:
# If there is no content but there is a summary, this is a picture-sharing type of mp article;
# in that case the summary can be used as the content.
content = f"[from {profile_nickname}]{summary}"
for img in content_area.find_all('img', class_='rich_pages wxw-img'):
img_src = img.get('data-src') or img.get('src')
if img_src:
images.add(img_src)
cleaned_texts = [t for t in texts if t.strip()]
content = '\n'.join(cleaned_texts)
else:
logger.warning(f"failed to analysis contents {url}")
return 0, {}
if content:
content = f"[from {profile_nickname}]{content}"
else:
# If there is no content but there is a summary, this is a picture-sharing type of mp article;
# in that case the summary can be used as the content.
content = f"[from {profile_nickname}]{summary}"
# Get image links from the meta property="og:image" and meta property="twitter:image" tags
og_image = soup.find('meta', property='og:image')
twitter_image = soup.find('meta', property='twitter:image')
if og_image:
images.add(og_image['content'])
if twitter_image:
images.add(twitter_image['content'])
# Get image links from the meta property="og:image" and meta property="twitter:image" tags
og_image = soup.find('meta', property='og:image')
twitter_image = soup.find('meta', property='twitter:image')
if og_image:
images.add(og_image['content'])
if twitter_image:
images.add(twitter_image['content'])
if rich_media_title == summary or not summary:
abstract = ''
else:
abstract = f"[from {profile_nickname}]{rich_media_title}——{summary}"
if rich_media_title == summary or not summary:
abstract = ''
else:
abstract = f"[from {profile_nickname}]{rich_media_title}——{summary}"
return 11, {
'title': rich_media_title,

View File

@ -1,10 +1,13 @@
# -*- coding: utf-8 -*-
from gne import GeneralNewsExtractor
import httpx
from bs4 import BeautifulSoup
from datetime import datetime
from ..utils.general_utils import extract_and_convert_dates
from utils.general_utils import extract_and_convert_dates
import chardet
from urllib.parse import urlparse
import asyncio
extractor = GeneralNewsExtractor()
@ -12,54 +15,63 @@ header = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}
def simple_crawler(url: str, logger) -> (int, dict):
async def simple_crawler(url: str, logger) -> (int, dict):
"""
Return a flag and an article-information dict: a negative flag means error, 0 means no result, 11 means success
"""
try:
with httpx.Client() as client:
response = client.get(url, headers=header, timeout=30)
rawdata = response.content
encoding = chardet.detect(rawdata)['encoding']
text = rawdata.decode(encoding)
async with httpx.AsyncClient() as client:
for retry in range(2):
try:
response = await client.get(url, headers=header, timeout=30)
response.raise_for_status()
break
except Exception as e:
if retry < 1:
logger.info(f"request {url} got error {e}\nwaiting 1min")
await asyncio.sleep(60)
else:
logger.warning(f"request {url} got error {e}")
return -7, {}
rawdata = response.content
encoding = chardet.detect(rawdata)['encoding']
text = rawdata.decode(encoding, errors='replace')
result = extractor.extract(text)
except Exception as e:
logger.warning(f"cannot get content from {url}\n{e}")
return -7, {}
if not result:
logger.error(f"gne cannot extract {url}")
return 0, {}
if not result:
logger.error(f"gne cannot extract {url}")
return 0, {}
if len(result['title']) < 4 or len(result['content']) < 24:
logger.info(f"{result} not valid")
return 0, {}
if len(result['title']) < 4 or len(result['content']) < 24:
logger.info(f"{result} not valid")
return 0, {}
if result['title'].startswith('服务器错误') or result['title'].startswith('您访问的页面') or result[
'title'].startswith('403') \
or result['content'].startswith('This website uses cookies') or result['title'].startswith('出错了'):
logger.warning(f"can not get {url} from the Internet")
return -7, {}
if result['title'].startswith('服务器错误') or result['title'].startswith('您访问的页面') or result['title'].startswith('403')\
or result['content'].startswith('This website uses cookies') or result['title'].startswith('出错了'):
logger.warning(f"can not get {url} from the Internet")
return -7, {}
date_str = extract_and_convert_dates(result['publish_time'])
if date_str:
result['publish_time'] = date_str
else:
result['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")
from_site = urlparse(url).netloc
from_site = from_site.replace('www.', '')
from_site = from_site.split('.')[0]
result['content'] = f"[from {from_site}] {result['content']}"
soup = BeautifulSoup(text, "html.parser")
try:
meta_description = soup.find("meta", {"name": "description"})
if meta_description:
result['abstract'] = f"[from {from_site}] {meta_description['content'].strip()}"
date_str = extract_and_convert_dates(result['publish_time'])
if date_str:
result['publish_time'] = date_str
else:
result['abstract'] = ''
except Exception:
result['abstract'] = ''
result['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")
from_site = urlparse(url).netloc
from_site = from_site.replace('www.', '')
from_site = from_site.split('.')[0]
result['content'] = f"[from {from_site}] {result['content']}"
soup = BeautifulSoup(text, "html.parser")
try:
meta_description = soup.find("meta", {"name": "description"})
if meta_description:
result['abstract'] = f"[from {from_site}] {meta_description['content'].strip()}"
else:
result['abstract'] = ''
except:
result['abstract'] = ''
result['url'] = url
result['url'] = url
return 11, result

View File

@ -1,9 +1,4 @@
"""mostly copy from https://github.com/netease-youdao/QAnything
awsome work!
"""
# import traceback
from urllib.parse import urlparse
import time
import os
import re
import jieba
@ -18,131 +13,39 @@ def extract_urls(text):
url_pattern = re.compile(r'https?://[-A-Za-z0-9+&@#/%?=~_|!:.;]+[-A-Za-z0-9+&@#/%=~_|]')
urls = re.findall(url_pattern, text)
# 过滤掉那些只匹配到 'www.' 而没有后续内容的情况并尝试为每个URL添加默认的http协议前缀以便解析
# Filter out cases that only match 'www.' without any subsequent content,
# and try to add the default http protocol prefix to each URL for easier parsing
cleaned_urls = [url for url in urls if isURL(url)]
return cleaned_urls
def isChinesePunctuation(char):
# 定义中文标点符号的Unicode编码范围
# Define the Unicode encoding range for Chinese punctuation marks
chinese_punctuations = set(range(0x3000, 0x303F)) | set(range(0xFF00, 0xFFEF))
# 检查字符是否在上述范围内
# Check if the character is within the above range
return ord(char) in chinese_punctuations
def get_time(func):
def inner(*arg, **kwargs):
s_time = time.time()
res = func(*arg, **kwargs)
e_time = time.time()
print('function {} execution time: {}'.format(func.__name__, e_time - s_time))
return res
return inner
'''
def safe_get(req: Request, attr: str, default=None):
try:
if attr in req.form:
return req.form.getlist(attr)[0]
if attr in req.args:
return req.args[attr]
if attr in req.json:
return req.json[attr]
# if value := req.form.get(attr):
# return value
# if value := req.args.get(attr):
# return value
# """req.json执行时不校验content-typebody字段可能不能被正确解析为json"""
# if value := req.json.get(attr):
# return value
except BadRequest:
logging.warning(f"missing {attr} in request")
except Exception as e:
logging.warning(f"get {attr} from request failed:")
logging.warning(traceback.format_exc())
return default
'''
def truncate_filename(filename, max_length=200):
# get the file extension
file_ext = os.path.splitext(filename)[1]
# get the file name without the extension
file_name_no_ext = os.path.splitext(filename)[0]
# compute the file name length in bytes (Chinese characters take multiple bytes)
filename_length = len(filename.encode('utf-8'))
# if the file name exceeds the maximum length limit
if filename_length > max_length:
# generate a timestamp marker
timestamp = str(int(time.time()))
# compute the remaining length available for the file name
remaining_length = max_length - len(file_ext) - len(timestamp) - 1  # -1 for the underscore
# truncate the file name and append the timestamp
file_name_no_ext = file_name_no_ext[:remaining_length]
new_filename = file_name_no_ext + '_' + timestamp + file_ext
else:
new_filename = filename
return new_filename
def read_files_with_extensions():
# get the path of the current script file
current_file = os.path.abspath(__file__)
# get the directory containing the current script file
current_dir = os.path.dirname(current_file)
# get the project root directory
project_dir = os.path.dirname(current_dir)
directory = project_dir + '/data'
print(f'now reading {directory}')
extensions = ['.md', '.txt', '.pdf', '.jpg', '.docx', '.xlsx', '.eml', '.csv']
for root, dirs, files in os.walk(directory):
for file in files:
if file.endswith(tuple(extensions)):
file_path = os.path.join(root, file)
yield file_path
def validate_user_id(user_id):
# define the regular expression pattern
pattern = r'^[A-Za-z][A-Za-z0-9_]*$'
# check whether it matches
if isinstance(user_id, str) and re.match(pattern, user_id):
return True
else:
return False
def is_chinese(string):
"""
Volcano Engine could actually support broader language detection; consider https://www.volcengine.com/docs/4640/65066 in the future
Determine whether most of the string is Chinese
:param string: {str} 需要检测的字符串
:return: {bool} 如果大部分是中文返回True否则返回False
:param string: {str} The string to be detected
:return: {bool} Returns True if most are Chinese, False otherwise
"""
pattern = re.compile(r'[^\u4e00-\u9fa5]')
non_chinese_count = len(pattern.findall(string))
# 严格按照字节数量小于一半判断容易误判,英文单词占字节较大,且还有标点符号等
# Judging strictly by whether non-Chinese characters make up less than half is error-prone:
# English words take up many characters, and there is also punctuation, etc.
return (non_chinese_count/len(string)) < 0.68
def extract_and_convert_dates(input_string):
# regular expressions matching different date formats
patterns = [
r'(\d{4})-(\d{2})-(\d{2})', # 匹配YYYY-MM-DD格式
r'(\d{4})/(\d{2})/(\d{2})', # 匹配YYYY/MM/DD格式
r'(\d{4})\.(\d{2})\.(\d{2})', # 匹配YYYY.MM.DD格式
r'(\d{4})\\(\d{2})\\(\d{2})', # 匹配YYYY\MM\DD格式
r'(\d{4})(\d{2})(\d{2})' # 匹配YYYYMMDD格式
r'(\d{4})-(\d{2})-(\d{2})', # YYYY-MM-DD
r'(\d{4})/(\d{2})/(\d{2})', # YYYY/MM/DD
r'(\d{4})\.(\d{2})\.(\d{2})', # YYYY.MM.DD
r'(\d{4})\\(\d{2})\\(\d{2})', # YYYY\MM\DD
r'(\d{4})(\d{2})(\d{2})' # YYYYMMDD
]
matches = []
@ -174,62 +77,21 @@ def get_logger_level() -> str:
def compare_phrase_with_list(target_phrase, phrase_list, threshold):
"""
比较一个目标短语与短语列表中每个短语的相似度
Compare the similarity of a target phrase to each phrase in the phrase list.
:param target_phrase: 目标短语 (str)
:param phrase_list: 短语列表 (list of str)
:param threshold: 相似度阈值 (float)
:return: 满足相似度条件的短语列表 (list of str)
:param target_phrase: target phrase (str)
:param phrase_list: list of candidate phrases (list of str)
:param threshold: similarity threshold (float)
:return: list of phrases that satisfy the similarity condition (list of str)
"""
# check whether the target phrase is empty
if not target_phrase:
return [] # 目标短语为空,直接返回空列表
return [] # The target phrase is empty, and the empty list is returned directly.
# 预处理:对目标短语和短语列表中的每个短语进行分词
# Preprocessing: Segmentation of the target phrase and each phrase in the phrase list
target_tokens = set(jieba.lcut(target_phrase))
tokenized_phrases = {phrase: set(jieba.lcut(phrase)) for phrase in phrase_list}
# compare and filter
similar_phrases = [phrase for phrase, tokens in tokenized_phrases.items()
if len(target_tokens & tokens) / min(len(target_tokens), len(tokens)) > threshold]
return similar_phrases
"""
# from InternLM/huixiangdou
# another awesome work
def process_strings(self, str1, replacement, str2):
'''Find the longest common suffix of str1 and prefix of str2.'''
shared_substring = ''
for i in range(1, min(len(str1), len(str2)) + 1):
if str1[-i:] == str2[:i]:
shared_substring = str1[-i:]
# If there is a common substring, replace one of them with the replacement string and concatenate # noqa E501
if shared_substring:
return str1[:-len(shared_substring)] + replacement + str2
# Otherwise, just return str1 + str2
return str1 + str2
def clean_md(self, text: str):
'''Remove parts of the markdown document that do not contain the key
question words, such as code blocks, URL links, etc.'''
# remove ref
pattern_ref = r'\[(.*?)\]\(.*?\)'
new_text = re.sub(pattern_ref, r'\1', text)
# remove code block
pattern_code = r'```.*?```'
new_text = re.sub(pattern_code, '', new_text, flags=re.DOTALL)
# remove underline
new_text = re.sub('_{5,}', '', new_text)
# remove table
# new_text = re.sub('\|.*?\|\n\| *\:.*\: *\|.*\n(\|.*\|.*\n)*', '', new_text, flags=re.DOTALL) # noqa E501
# use lower
new_text = new_text.lower()
return new_text
"""

View File

@ -7,7 +7,7 @@ from typing import BinaryIO
class PbTalker:
def __init__(self, logger) -> None:
# 1. base initialization
url = "http://127.0.0.1:5882"
url = os.environ.get('PB_API_BASE', "http://127.0.0.1:8090")
self.logger = logger
self.logger.debug(f"initializing pocketbase client: {url}")
self.client = PocketBase(url)
@ -82,7 +82,7 @@ class PbTalker:
def view(self, collection_name: str, item_id: str, fields: list[str] = None) -> dict:
try:
res = self.client.collection(collection_name).get_one(item_id,{"fields": ','.join(fields) if fields else ''})
res = self.client.collection(collection_name).get_one(item_id, {"fields": ','.join(fields) if fields else ''})
return vars(res)
except Exception as e:
self.logger.error(f"pocketbase view item failed: {e}")

View File

@ -1,7 +1,7 @@
import random
import re
import os
from backend.llms.dashscope_wrapper import dashscope_llm
from core.backend import dashscope_llm
from docx import Document
from docx.oxml.ns import qn
from docx.shared import Pt, RGBColor

10
env_sample Executable file
View File

@ -0,0 +1,10 @@
export LLM_API_KEY=""
export LLM_API_BASE="https://api.siliconflow.cn/v1" ##for local model services or calling non-OpenAI services with openai_wrapper
##strongly recommended to use the following models provided by siliconflow (a good balance of quality and price)
export GET_INFO_MODEL="zhipuai/glm4-9B-chat"
export REWRITE_MODEL="alibaba/Qwen2-7B-Instruct"
export HTML_PARSE_MODEL="deepseek-ai/deepseek-v2-chat"
export PROJECT_DIR="work_dir
export PB_API_AUTH="test@example.com|123467890"
export "PB_API_BASE"="" ##only use if your pb not run on 127.0.0.1:8090
export WS_LOG="verbose" ##for detail log info. If not need, just delete this item.

View File

@ -1,89 +0,0 @@
import os
from pocketbase import PocketBase # Client also works the same
from pocketbase.client import FileUpload
from typing import BinaryIO
class PbTalker:
def __init__(self, logger) -> None:
# 1. base initialization
url = "http://127.0.0.1:5882"
self.logger = logger
self.logger.debug(f"initializing pocketbase client: {url}")
self.client = PocketBase(url)
auth = os.environ.get('PB_API_AUTH', '')
if not auth or "|" not in auth:
self.logger.warning("invalid email|password found, will proceed without auth; make sure the collection rules allow access by anyone")
else:
email, password = auth.split('|')
try:
admin_data = self.client.admins.auth_with_password(email, password)
if admin_data:
self.logger.info(f"pocketbase ready authenticated as admin - {email}")
except:
user_data = self.client.collection("users").auth_with_password(email, password)
if user_data:
self.logger.info(f"pocketbase ready authenticated as user - {email}")
else:
raise Exception("pocketbase auth failed")
def read(self, collection_name: str, fields: list[str] = None, filter: str = '', skiptotal: bool = True) -> list:
results = []
for i in range(1, 10):
try:
res = self.client.collection(collection_name).get_list(i, 500,
{"filter": filter,
"fields": ','.join(fields) if fields else '',
"skiptotal": skiptotal})
except Exception as e:
self.logger.error(f"pocketbase get list failed: {e}")
continue
if not res.items:
break
for _res in res.items:
attributes = vars(_res)
results.append(attributes)
return results
def add(self, collection_name: str, body: dict) -> str:
try:
res = self.client.collection(collection_name).create(body)
except Exception as e:
self.logger.error(f"pocketbase create failed: {e}")
return ''
return res.id
def update(self, collection_name: str, id: str, body: dict) -> str:
try:
res = self.client.collection(collection_name).update(id, body)
except Exception as e:
self.logger.error(f"pocketbase update failed: {e}")
return ''
return res.id
def delete(self, collection_name: str, id: str) -> str:
try:
res = self.client.collection(collection_name).delete(id)
except Exception as e:
self.logger.error(f"pocketbase update failed: {e}")
return 'failed'
if res:
return 'success'
return 'failed'
def upload(self, collection_name: str, id: str, key: str, file_name: str, file: BinaryIO) -> str:
try:
res = self.client.collection(collection_name).update(id, {key: FileUpload((file_name, file))})
except Exception as e:
self.logger.error(f"pocketbase update failed: {e}")
return ''
return res.id
def view(self, collection_name: str, item_id: str, fields: list[str] = None) -> dict:
try:
res = self.client.collection(collection_name).get_one(item_id,{"fields": ','.join(fields) if fields else ''})
return vars(res)
except Exception as e:
self.logger.error(f"pocketbase view item failed: {e}")
return {}