scrapers updated

This commit is contained in:
bigbrother666 2024-06-15 15:41:31 +08:00
parent b1dad1533f
commit 31411cd8f4
22 changed files with 563 additions and 691 deletions

45
core/backend.py Normal file
View File

@ -0,0 +1,45 @@
from fastapi import FastAPI, BackgroundTasks
from pydantic import BaseModel
from typing import Literal, Optional
from fastapi.middleware.cors import CORSMiddleware
from insights import pipeline
class Request(BaseModel):
"""
Input model
input = {'user_id': str, 'type': str, 'content': str, 'addition': Optional[str]}
Type is one of "text", "publicMsg", "site" and "url"
"""
user_id: str
type: Literal["text", "publicMsg", "file", "image", "video", "location", "chathistory", "site", "attachment", "url"]
content: str
addition: Optional[str] = None
app = FastAPI(
title="WiseFlow Union Backend",
description="From Wiseflow Team.",
version="0.1.1",
openapi_url="/openapi.json"
)
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
@app.get("/")
def read_root():
msg = "Hello, this is Wise Union Backend, version 0.1.1"
return {"msg": msg}
@app.post("/feed")
async def call_to_feed(background_tasks: BackgroundTasks, request: Request):
background_tasks.add_task(pipeline, _input=request.model_dump())
return {"msg": "received well"}

View File

@ -1,47 +0,0 @@
import asyncio
import websockets
import concurrent.futures
import json
from insights import pipeline
async def get_public_msg():
uri = "ws://127.0.0.1:8066/ws/publicMsg"
reconnect_attempts = 0
max_reconnect_attempts = 3  # adjust the maximum number of reconnect attempts as needed
while True:
try:
async with websockets.connect(uri, max_size=10 * 1024 * 1024) as websocket:
loop = asyncio.get_running_loop()
with concurrent.futures.ThreadPoolExecutor() as pool:
while True:
response = await websocket.recv()
datas = json.loads(response)
for data in datas["data"]:
if data["IsSender"] != "0":
print('self-send message, pass')
print(data)
continue
input_data = {
"user_id": data["StrTalker"],
"type": "publicMsg",
"content": data["Content"],
"addition": data["MsgSvrID"]
}
await loop.run_in_executor(pool, pipeline, input_data)
except websockets.exceptions.ConnectionClosedError as e:
print(f"Connection closed with exception: {e}")
reconnect_attempts += 1
if reconnect_attempts <= max_reconnect_attempts:
print(f"Reconnecting attempt {reconnect_attempts}...")
await asyncio.sleep(5)  # wait a while before retrying
else:
print("Max reconnect attempts reached. Exiting.")
break
except Exception as e:
print(f"An unexpected error occurred: {e}")
break
# run the get_public_msg coroutine with the asyncio event loop
asyncio.run(get_public_msg())

7
core/docker_entrypoint.sh Executable file
View File

@ -0,0 +1,7 @@
#!/bin/bash
set -o allexport
source ../.env
set +o allexport
uvicorn backend:app --reload --host localhost --port 8077
#exec uvicorn backend:app --reload --host localhost --port 8077 &
#exec python background_task.py

View File

@ -1,12 +1,13 @@
from ..scrapers import *
from ..utils.general_utils import extract_urls, compare_phrase_with_list
# -*- coding: utf-8 -*-
from scrapers import *
from utils.general_utils import extract_urls, compare_phrase_with_list
from .get_info import get_info, pb, project_dir, logger, info_rewrite
import os
import json
from datetime import datetime, timedelta
from urllib.parse import urlparse
import re
import time
# The XML parsing scheme is not used because there are abnormal characters in the XML code extracted from the weixin public_msg
@ -18,11 +19,49 @@ expiration_days = 3
existing_urls = [url['url'] for url in pb.read(collection_name='articles', fields=['url']) if url['url']]
def pipeline(_input: dict):
async def get_articles(urls: list[str], expiration: datetime, cache: dict = {}) -> list[dict]:
articles = []
for url in urls:
logger.debug(f"fetching {url}")
if url.startswith('https://mp.weixin.qq.com') or url.startswith('http://mp.weixin.qq.com'):
flag, result = await mp_crawler(url, logger)
else:
flag, result = await simple_crawler(url, logger)
if flag == -7:
# -7 means cannot fetch the html, and other crawlers have no effect.
continue
if flag != 11:
flag, result = await llm_crawler(url, logger)
if flag != 11:
continue
expiration_date = expiration.strftime('%Y-%m-%d')
article_date = int(result['publish_time'])
if article_date < int(expiration_date.replace('-', '')):
logger.info(f"publish date is {article_date}, too old, skip")
continue
if url in cache:
for k, v in cache[url].items():
if v:
result[k] = v
articles.append(result)
return articles
async def pipeline(_input: dict):
cache = {}
source = _input['user_id'].split('@')[-1]
logger.debug(f"received new task, user: {source}, MsgSvrID: {_input['addition']}")
global existing_urls
expiration_date = datetime.now() - timedelta(days=expiration_days)
if _input['type'] == 'publicMsg':
items = item_pattern.findall(_input["content"])
# Iterate through all < item > content, extracting < url > and < summary >
@ -37,73 +76,57 @@ def pipeline(_input: dict):
cut_off_point = url.find('chksm=')
if cut_off_point != -1:
url = url[:cut_off_point-1]
if url in existing_urls:
logger.debug(f"{url} has been crawled, skip")
continue
if url in cache:
logger.debug(f"{url} already find in item")
continue
summary_match = summary_pattern.search(item)
summary = summary_match.group(1) if summary_match else None
cache[url] = summary
urls = list(cache.keys())
cache[url] = {'source': source, 'abstract': summary}
articles = await get_articles(list(cache.keys()), expiration_date, cache)
elif _input['type'] == 'site':
# for the site url, usually an article list page or a website homepage
# need to get the article list page
# You can use a general scraper, or you can customize a site-specific crawler, see scrapers/README_CN.md
urls = extract_urls(_input['content'])
if not urls:
logger.debug(f"can not find any url in\n{_input['content']}")
return
articles = []
for url in urls:
parsed_url = urlparse(url)
domain = parsed_url.netloc
if domain in scraper_map:
result = scraper_map[domain](url, logger)
else:
result = await general_scraper(url, expiration_date.date(), existing_urls, logger)
articles.extend(result)
elif _input['type'] == 'text':
urls = extract_urls(_input['content'])
if not urls:
logger.debug(f"can not find any url in\n{_input['content']}\npass...")
return
articles = await get_articles(urls, expiration_date)
elif _input['type'] == 'url':
urls = []
pass
# this is remained for wechat shared mp_article_card
# todo will do it in project awada (need finish the generalMsg api first)
articles = []
else:
return
global existing_urls
for url in urls:
if url in existing_urls:
logger.debug(f"{url} has been crawled, skip")
for article in articles:
if article['url'] in existing_urls:
# For the case of entering multiple sites at the same time,
# there is indeed a situation where duplicate articles are mixed into the same batch
logger.debug(f"{article['url']} duplicated, skip")
continue
logger.debug(f"fetching {url}")
if url.startswith('https://mp.weixin.qq.com') or url.startswith('http://mp.weixin.qq.com'):
flag, article = mp_crawler(url, logger)
if flag == -7:
# For the mp crawler, -7 most likely means the request was rate-limited by WeChat; just wait 1 min and retry.
logger.info(f"fetch {url} failed, try to wait 1min and try again")
time.sleep(60)
flag, article = mp_crawler(url, logger)
else:
parsed_url = urlparse(url)
domain = parsed_url.netloc
if domain in scraper_map:
flag, article = scraper_map[domain](url, logger)
else:
flag, article = simple_crawler(url, logger)
if flag == -7:
# -7 means the page cannot be fetched, and the other crawlers will not help either.
logger.info(f"cannot fetch {url}")
continue
if flag != 11:
logger.info(f"{url} failed with mp_crawler and simple_crawler")
flag, article = llm_crawler(url, logger)
if flag != 11:
logger.info(f"{url} failed with llm_crawler")
continue
expiration_date = datetime.now() - timedelta(days=expiration_days)
expiration_date = expiration_date.strftime('%Y-%m-%d')
article_date = int(article['publish_time'])
if article_date < int(expiration_date.replace('-', '')):
logger.info(f"publish date is {article_date}, too old, skip")
continue
article['source'] = source
if cache[url]:
article['abstract'] = cache[url]
insights = get_info(f"title: {article['title']}\n\ncontent: {article['content']}")
try:
article_id = pb.add(collection_name='articles', body=article)
except Exception as e:
@ -112,7 +135,7 @@ def pipeline(_input: dict):
json.dump(article, f, ensure_ascii=False, indent=4)
continue
existing_urls.append(url)
existing_urls.append(article['url'])
if not insights:
continue
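For orientation, a hedged sketch of driving this async pipeline directly with the same _input dict that the backend builds from Request.model_dump() (the user_id and URL below are placeholders):

```python
# sketch only: run the pipeline on a 'text' task containing a URL (placeholder values)
import asyncio
from insights import pipeline

task = {
    "user_id": "someone@example.com",   # the part after '@' is logged as the source
    "type": "text",                      # 'text' content is scanned for URLs via extract_urls
    "content": "please check https://www.example.com/news/article-1",
    "addition": None,
}
asyncio.run(pipeline(_input=task))
```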

View File

@ -1,9 +1,9 @@
from ..llms.openai_wrapper import openai_llm
# from ..llms.siliconflow_wrapper import sfa_llm
from llms.openai_wrapper import openai_llm
# from llms.siliconflow_wrapper import sfa_llm
import re
from ..utils.general_utils import get_logger_level
from utils.general_utils import get_logger_level
from loguru import logger
from ..utils.pb_api import PbTalker
from utils.pb_api import PbTalker
import os
import locale

View File

@ -1,13 +1,11 @@
# for developers
If you are just a user, you don't need to care about this folder.
Python developers should use the backend/pb_api.py module for database operations.
JS developers can simply start the database and check the API details page of each collection for the interface documentation.
download https://pocketbase.io/docs/
```bash
cd pb
./pocketbase --dev admin create test@example.com 123467890 #如果没有初始账号,请用这个命令创建
xattr -d com.apple.quarantine pocketbase # for Macos
./pocketbase migrate up # for first run
./pocketbase --dev admin create test@example.com 123467890 # If you don't have an initial account, please use this command to create it
./pocketbase serve
```
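For Python developers, a minimal, hedged sketch of using the PbTalker wrapper (the import path follows the new utils/pb_api.py module; assumes PB_API_AUTH, and optionally PB_API_BASE, are set as in env_sample, and that an `articles` collection exists):

```python
# sketch only: read and write the 'articles' collection through PbTalker
from loguru import logger
from utils.pb_api import PbTalker

pb = PbTalker(logger)
existing_urls = [item['url'] for item in pb.read(collection_name='articles', fields=['url'])]
new_id = pb.add(collection_name='articles', body={'url': 'https://www.example.com/a', 'title': 'demo'})
print(new_id, len(existing_urls))
```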

View File

@ -1,135 +0,0 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((db) => {
const collection = new Collection({
"id": "4rpge043645sp4j",
"created": "2024-04-17 02:46:25.373Z",
"updated": "2024-04-17 02:46:25.373Z",
"name": "roleplays",
"type": "base",
"system": false,
"schema": [
{
"system": false,
"id": "ixk4pwsb",
"name": "activated",
"type": "bool",
"required": false,
"presentable": false,
"unique": false,
"options": {}
},
{
"system": false,
"id": "tmak73c7",
"name": "character",
"type": "text",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
},
{
"system": false,
"id": "6iuxuwhb",
"name": "focus",
"type": "text",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
},
{
"system": false,
"id": "axmc2huy",
"name": "focus_type",
"type": "text",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
},
{
"system": false,
"id": "gop61pjt",
"name": "good_sample1",
"type": "text",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
},
{
"system": false,
"id": "qmy5cofa",
"name": "good_sample2",
"type": "text",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
},
{
"system": false,
"id": "h8gafaci",
"name": "bad_sample",
"type": "text",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
},
{
"system": false,
"id": "m2ug5sfd",
"name": "report_type",
"type": "text",
"required": false,
"presentable": false,
"unique": false,
"options": {
"min": null,
"max": null,
"pattern": ""
}
}
],
"indexes": [],
"listRule": null,
"viewRule": null,
"createRule": null,
"updateRule": null,
"deleteRule": null,
"options": {}
});
return Dao(db).saveCollection(collection);
}, (db) => {
const dao = new Dao(db);
const collection = dao.findCollectionByNameOrId("4rpge043645sp4j");
return dao.deleteCollection(collection);
})

View File

@ -5,4 +5,6 @@ gne
jieba
httpx
chardet
websockets
pocketbase
pydantic
uvicorn

View File

@ -1,33 +1,33 @@
**这个文件夹下可以放置对应特定信源的爬虫注意这里的爬虫应该是可以解析信源文章列表url并返回文章详情dict的**
# 专有爬虫配置
写好爬虫后将爬虫程序放在这个文件夹并在__init__.py下的scraper_map中注册爬虫类似
```python
{'www.securityaffairs.com': securityaffairs_scraper}
```
其中key就是信源地址value是函数名
爬虫应该写为函数形式,出入参约定为:
输入:
- expiration datetime的date.date()对象,爬虫应该只抓取这之后(含这一天)的文章
- existings[str], 数据库已有文章的url列表爬虫应该忽略这个列表里面的url
输出:
- [dict]返回结果列表每个dict代表一个文章格式如下
`[{'url': str, 'title': str, 'author': str, 'publish_time': str, 'content': str, 'abstract': str, 'images': [Path]}, {...}, ...]`
注意publish_time格式为`"%Y%m%d"` 如果爬虫抓不到可以用当天日期
另外title和content是必须要有的
# 通用页面解析器
我们这里提供了一个通用页面解析器该解析器可以智能获取信源文章列表接下来对于每一个文章url会先尝试使用 gne 进行解析如果失败的话再尝试使用llm进行解析。
通过这个方案,可以实现对大多数普通新闻类、门户类信源的扫描和信息提取。
**然而我们依然强烈建议用户自行写专有爬虫或者直接订阅我们的数据服务,以实现更加理想且更加高效的扫描。**
> **This folder is intended for placing crawlers specific to particular sources. Note that the crawlers here should be able to parse the article list URL of the source and return a dictionary of article details.**
>
> # Custom Crawler Configuration
>
> After writing the crawler, place the crawler program in this folder and register it in the scraper_map in `__init__.py`, similar to:
>
> ```python
> {'www.securityaffairs.com': securityaffairs_scraper}
> ```
>
> Here, the key is the source URL, and the value is the function name.
>
> The crawler should be written in the form of a function with the following input and output specifications:
>
> Input:
> - expiration: A `datetime.date` object, the crawler should only fetch articles on or after this date.
> - existings: [str], a list of URLs of articles already in the database. The crawler should ignore the URLs in this list.
>
> Output:
> - [dict], a list of result dictionaries, each representing an article, formatted as follows:
> `[{'url': str, 'title': str, 'author': str, 'publish_time': str, 'content': str, 'abstract': str, 'images': [Path]}, {...}, ...]`
>
> Note: The format of `publish_time` should be `"%Y%m%d"`. If the crawler cannot fetch it, the current date can be used.
>
> Additionally, `title` and `content` are mandatory fields.
>
> # Generic Page Parser
>
> We provide a generic page parser here, which can intelligently fetch article lists from the source. For each article URL, it will first attempt to parse using gne. If it fails, it will then attempt to parse using llm.
>
> Through this solution, it is possible to scan and extract information from most general news and portal sources.
>
> **However, we still strongly recommend that users write custom crawlers themselves or directly subscribe to our data service for more ideal and efficient scanning.**
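> As a hedged illustration of this contract, a minimal source-specific scraper might look like the sketch below (the domain follows the example key above; the parsing logic is a placeholder, not a tested implementation):
>
> ```python
> # sketch only: a site-specific scraper following the documented input/output contract
> from datetime import datetime, date
> import httpx
> from bs4 import BeautifulSoup
>
> def securityaffairs_scraper(expiration: date, existings: list[str]) -> list[dict]:
>     resp = httpx.get('https://www.securityaffairs.com/', timeout=30)
>     soup = BeautifulSoup(resp.text, 'html.parser')
>     articles = []
>     for link in soup.find_all('a', href=True):
>         url = link['href']
>         if not url.startswith('https://www.securityaffairs.com/') or url in existings:
>             continue
>         publish_time = datetime.now().strftime('%Y%m%d')   # placeholder: parse the real date if available
>         if datetime.strptime(publish_time, '%Y%m%d').date() < expiration:
>             continue
>         articles.append({'url': url, 'title': link.get_text(strip=True), 'author': '',
>                          'publish_time': publish_time, 'content': link.get_text(strip=True),
>                          'abstract': '', 'images': []})
>     return articles
>
> # then register it in scrapers/__init__.py:
> # scraper_map = {'www.securityaffairs.com': securityaffairs_scraper}
> ```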

View File

@ -0,0 +1,33 @@
**这个文件夹下可以放置对应特定信源的爬虫注意这里的爬虫应该是可以解析信源文章列表url并返回文章详情dict的**
# 专有爬虫配置
写好爬虫后将爬虫程序放在这个文件夹并在__init__.py下的scraper_map中注册爬虫类似
```python
{'www.securityaffairs.com': securityaffairs_scraper}
```
其中key就是信源地址value是函数名
爬虫应该写为函数形式,出入参约定为:
输入:
- expiration datetime的date.date()对象,爬虫应该只抓取这之后(含这一天)的文章
- existings[str], 数据库已有文章的url列表爬虫应该忽略这个列表里面的url
输出:
- [dict]返回结果列表每个dict代表一个文章格式如下
`[{'url': str, 'title': str, 'author': str, 'publish_time': str, 'content': str, 'abstract': str, 'images': [Path]}, {...}, ...]`
注意publish_time格式为`"%Y%m%d"` 如果爬虫抓不到可以用当天日期
另外title和content是必须要有的
# 通用页面解析器
我们这里提供了一个通用页面解析器该解析器可以智能获取信源文章列表接下来对于每一个文章url会先尝试使用 gne 进行解析如果失败的话再尝试使用llm进行解析。
通过这个方案,可以实现对大多数普通新闻类、门户类信源的扫描和信息提取。
**然而我们依然强烈建议用户自行写专有爬虫或者直接订阅我们的数据服务,以实现更加理想且更加高效的扫描。**

View File

@ -0,0 +1,34 @@
> **In diesem Ordner können Crawlers für spezifische Quellen abgelegt werden. Beachten Sie, dass die Crawlers hier in der Lage sein sollten, die URL der Artikelliste der Quelle zu analysieren und ein Wörterbuch mit Artikeldetails zurückzugeben.**
>
> # Konfiguration des benutzerdefinierten Crawlers
>
> Nachdem Sie den Crawler geschrieben haben, platzieren Sie das Crawler-Programm in diesem Ordner und registrieren Sie es in scraper_map in `__init__.py`, ähnlich wie:
>
> ```python
> {'www.securityaffairs.com': securityaffairs_scraper}
> ```
>
> Hier ist der Schlüssel die URL der Quelle und der Wert der Funktionsname.
>
> Der Crawler sollte in Form einer Funktion geschrieben werden, mit den folgenden Eingabe- und Ausgabeparametern:
>
> Eingabe:
> - expiration: Ein `datetime.date` Objekt, der Crawler sollte nur Artikel ab diesem Datum (einschließlich) abrufen.
> - existings: [str], eine Liste von URLs von Artikeln, die bereits in der Datenbank vorhanden sind. Der Crawler sollte die URLs in dieser Liste ignorieren.
>
> Ausgabe:
> - [dict], eine Liste von Ergebnis-Wörterbüchern, wobei jedes Wörterbuch einen Artikel darstellt, formatiert wie folgt:
> `[{'url': str, 'title': str, 'author': str, 'publish_time': str, 'content': str, 'abstract': str, 'images': [Path]}, {...}, ...]`
>
> Hinweis: Das Format von `publish_time` sollte `"%Y%m%d"` sein. Wenn der Crawler es nicht abrufen kann, kann das aktuelle Datum verwendet werden.
>
> Darüber hinaus sind `title` und `content` Pflichtfelder.
>
> # Generischer Seitenparser
>
> Wir bieten hier einen generischen Seitenparser an, der intelligent Artikellisten von der Quelle abrufen kann. Für jede Artikel-URL wird zunächst versucht, mit gne zu parsen. Scheitert dies, wird versucht, mit llm zu parsen.
>
> Durch diese Lösung ist es möglich, die meisten allgemeinen Nachrichtenquellen und Portale zu scannen und Informationen zu extrahieren.
>
> **Wir empfehlen jedoch dringend, dass Benutzer eigene benutzerdefinierte Crawlers schreiben oder direkt unseren Datenservice abonnieren, um eine idealere und effizientere Erfassung zu erreichen.**

View File

@ -0,0 +1,34 @@
> **Ce dossier est destiné à accueillir des crawlers spécifiques à des sources particulières. Notez que les crawlers ici doivent être capables de parser l'URL de la liste des articles de la source et de retourner un dictionnaire de détails des articles.**
>
> # Configuration du Crawler Personnalisé
>
> Après avoir écrit le crawler, placez le programme du crawler dans ce dossier et enregistrez-le dans scraper_map dans `__init__.py`, comme suit :
>
> ```python
> {'www.securityaffairs.com': securityaffairs_scraper}
> ```
>
> Ici, la clé est l'URL de la source, et la valeur est le nom de la fonction.
>
> Le crawler doit être écrit sous forme de fonction avec les spécifications suivantes pour les entrées et sorties :
>
> Entrée :
> - expiration : Un objet `datetime.date`, le crawler ne doit récupérer que les articles à partir de cette date (incluse).
> - existings : [str], une liste d'URLs d'articles déjà présents dans la base de données. Le crawler doit ignorer les URLs de cette liste.
>
> Sortie :
> - [dict], une liste de dictionnaires de résultats, chaque dictionnaire représentant un article, formaté comme suit :
> `[{'url': str, 'title': str, 'author': str, 'publish_time': str, 'content': str, 'abstract': str, 'images': [Path]}, {...}, ...]`
>
> Remarque : Le format de `publish_time` doit être `"%Y%m%d"`. Si le crawler ne peut pas le récupérer, la date du jour peut être utilisée.
>
> De plus, `title` et `content` sont des champs obligatoires.
>
> # Analyseur de Page Générique
>
> Nous fournissons ici un analyseur de page générique, qui peut récupérer intelligemment les listes d'articles de la source. Pour chaque URL d'article, il tentera d'abord de parser avec gne. En cas d'échec, il tentera de parser avec llm.
>
> Grâce à cette solution, il est possible de scanner et d'extraire des informations à partir de la plupart des sources de type actualités générales et portails.
>
> **Cependant, nous recommandons vivement aux utilisateurs de rédiger eux-mêmes des crawlers personnalisés ou de s'abonner directement à notre service de données pour un scan plus idéal et plus efficace.**

View File

@ -0,0 +1,33 @@
**このフォルダには特定のソースに対応したクローラーを配置できます。ここでのクローラーはソースの記事リストURLを解析し、記事の詳細情報を辞書形式で返す必要があります。**
>
> # カスタムクローラーの設定
>
> クローラーを作成した後、そのプログラムをこのフォルダに配置し、`__init__.py` の scraper_map に次のように登録します:
>
> ```python
> {'www.securityaffairs.com': securityaffairs_scraper}
> ```
>
> ここで、キーはソースのURLで、値は関数名です。
>
> クローラーは関数形式で記述し、以下の入力および出力仕様を満たす必要があります:
>
> 入力:
> - expiration `datetime.date` オブジェクト、クローラーはこの日付以降(この日を含む)の記事のみを取得する必要があります。
> - existings[str]、データベースに既存する記事のURLリスト、クローラーはこのリスト内のURLを無視する必要があります。
>
> 出力:
> - [dict]、結果の辞書リスト、各辞書は以下の形式で1つの記事を表します
> `[{'url': str, 'title': str, 'author': str, 'publish_time': str, 'content': str, 'abstract': str, 'images': [Path]}, {...}, ...]`
>
> 注意:`publish_time`の形式は`"%Y%m%d"`である必要があります。クローラーで取得できない場合は、当日の日付を使用できます。
>
> さらに、`title`と`content`は必須フィールドです。
>
> # 一般ページパーサー
>
> ここでは一般的なページパーサーを提供しており、ソースから記事リストをインテリジェントに取得できます。各記事URLに対して、最初に gne を使用して解析を試みます。失敗した場合は、llm を使用して解析を試みます。
>
> このソリューションにより、ほとんどの一般的なニュースおよびポータルソースのスキャンと情報抽出が可能になります。
>
> **しかし、より理想的かつ効率的なスキャンを実現するために、ユーザー自身でカスタムクローラーを作成するか、直接弊社のデータサービスを購読することを強くお勧めします。**

View File

@ -1,6 +1,6 @@
from .mp_crawler import mp_crawler
from .simple_crawler import simple_crawler
from .general_scraper import llm_crawler
from .general_scraper import general_scraper, llm_crawler
scraper_map = {}

View File

@ -1,16 +1,20 @@
# -*- coding: utf-8 -*-
import os
from urllib.parse import urlparse
import re
from .simple_crawler import simple_crawler
from .mp_crawler import mp_crawler
import httpx
from bs4 import BeautifulSoup
from bs4.element import Comment
from ..llms.openai_wrapper import openai_llm
# from ..llms.siliconflow_wrapper import sfa_llm
from llms.openai_wrapper import openai_llm
# from llms.siliconflow_wrapper import sfa_llm
from datetime import datetime, date
from requests.compat import urljoin
import chardet
from ..utils.general_utils import extract_and_convert_dates
from utils.general_utils import extract_and_convert_dates
import asyncio
model = os.environ.get('HTML_PARSE_MODEL', 'gpt-3.5-turbo')
@ -63,7 +67,6 @@ def parse_html_content(out: str) -> dict:
return dct
sys_info = '''As an HTML parser, you'll receive a block of HTML code. Your task is to extract its title, summary, content, and publication date, with the date formatted as YYYY-MM-DD. Return the results in the following format (enclosed within triple quotes):
"""
Title||Summary||Content||Release Date YYYY-MM-DD
@ -71,30 +74,38 @@ Title||Summary||Content||Release Date YYYY-MM-DD
'''
def llm_crawler(url: str, logger) -> (int, dict):
try:
with httpx.Client() as client:
response = client.get(url, headers=header, timeout=30)
rawdata = response.content
encoding = chardet.detect(rawdata)['encoding']
text = rawdata.decode(encoding)
except Exception as e:
logger.error(e)
return -7, {}
async def llm_crawler(url: str, logger) -> (int, dict):
async with httpx.AsyncClient() as client:
for retry in range(2):
try:
response = await client.get(url, headers=header, timeout=30)
response.raise_for_status()
break
except Exception as e:
if retry < 1:
logger.info(f"request {url} got error {e}\nwaiting 1min")
await asyncio.sleep(60)
else:
logger.warning(f"request {url} got error {e}")
return -7, {}
soup = BeautifulSoup(text, "html.parser")
html_text = text_from_soup(soup)
html_lines = html_text.split('\n')
html_lines = [line.strip() for line in html_lines if line.strip()]
html_text = "\n".join(html_lines)
if len(html_text) > 29999:
logger.warning(f"{url} content too long for llm parsing")
return 0, {}
rawdata = response.content
encoding = chardet.detect(rawdata)['encoding']
text = rawdata.decode(encoding, errors='replace')
soup = BeautifulSoup(text, "html.parser")
html_text = text_from_soup(soup)
html_lines = html_text.split('\n')
html_lines = [line.strip() for line in html_lines if line.strip()]
html_text = "\n".join(html_lines)
if len(html_text) > 29999:
logger.warning(f"{url} content too long for llm parsing")
return 0, {}
if not html_text or html_text.startswith('服务器错误') or html_text.startswith('您访问的页面') or html_text.startswith('403')\
or html_text.startswith('出错了'):
logger.warning(f"can not get {url} from the Internet")
return -7, {}
if not html_text or html_text.startswith('服务器错误') or html_text.startswith(
'您访问的页面') or html_text.startswith('403') \
or html_text.startswith('出错了'):
logger.warning(f"can not get {url} from the Internet")
return -7, {}
messages = [
{"role": "system", "content": sys_info},
@ -103,7 +114,7 @@ def llm_crawler(url: str, logger) -> (int, dict):
llm_output = openai_llm(messages, model=model, logger=logger)
try:
info = parse_html_content(llm_output)
except Exception:
except:
msg = f"can not parse {llm_output}"
logger.debug(msg)
return 0, {}
@ -146,31 +157,49 @@ def llm_crawler(url: str, logger) -> (int, dict):
return 11, info
def general_scraper(site: str, expiration: date, existing: list[str], logger) -> list[dict]:
try:
with httpx.Client() as client:
response = client.get(site, headers=header, timeout=30)
except Exception as e:
logger.error(e)
return []
async def general_scraper(site: str, expiration: date, existing: list[str], logger) -> list[dict]:
async with httpx.AsyncClient() as client:
for retry in range(2):
try:
response = await client.get(site, headers=header, timeout=30)
response.raise_for_status()
break
except Exception as e:
if retry < 1:
logger.info(f"request {site} got error {e}\nwaiting 1min")
await asyncio.sleep(60)
else:
logger.warning(f"request {site} got error {e}")
return []
page_source = response.text
soup = BeautifulSoup(page_source, "html.parser")
# Parse all URLs
parsed_url = urlparse(site)
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
urls = [urljoin(base_url, link["href"]) for link in soup.find_all("a", href=True)]
page_source = response.text
soup = BeautifulSoup(page_source, "html.parser")
# Parse all URLs
parsed_url = urlparse(site)
base_url = parsed_url.scheme + '://' + parsed_url.netloc
urls = [urljoin(base_url, link["href"]) for link in soup.find_all("a", href=True)]
if not urls:
# maybe it's an article site
logger.warning(f"can not find any link from {site}, maybe it's an article site...")
if site in existing:
logger.debug(f"{site} has been crawled before, skip it")
return []
flag, result = simple_crawler(site, logger)
if site.startswith('https://mp.weixin.qq.com') or site.startswith('http://mp.weixin.qq.com'):
flag, result = await mp_crawler(site, logger)
else:
flag, result = await simple_crawler(site, logger)
if flag == -7:
# -7 means cannot fetch the html, and other crawlers have no effect.
return []
if flag != 11:
flag, result = llm_crawler(site, logger)
flag, result = await llm_crawler(site, logger)
if flag != 11:
return []
publish_date = datetime.strptime(result['publish_time'], '%Y%m%d')
if publish_date.date() < expiration:
logger.debug(f"{site} is too old, skip it")
@ -183,12 +212,23 @@ def general_scraper(site: str, expiration: date, existing: list[str], logger) ->
if url in existing:
logger.debug(f"{url} has been crawled before, skip it")
continue
existing.append(url)
flag, result = simple_crawler(url, logger)
if url.startswith('https://mp.weixin.qq.com') or url.startswith('http://mp.weixin.qq.com'):
flag, result = await mp_crawler(url, logger)
else:
flag, result = await simple_crawler(url, logger)
if flag == -7:
# -7 means cannot fetch the html, and other crawlers have no effect.
continue
if flag != 11:
flag, result = llm_crawler(url, logger)
flag, result = await llm_crawler(url, logger)
if flag != 11:
continue
publish_date = datetime.strptime(result['publish_time'], '%Y%m%d')
if publish_date.date() < expiration:
logger.debug(f"{url} is too old, skip it")

View File

@ -1,102 +1,112 @@
# -*- coding: utf-8 -*-
import httpx
from bs4 import BeautifulSoup
from datetime import datetime
import re
import asyncio
header = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}
def mp_crawler(url: str, logger) -> (int, dict):
async def mp_crawler(url: str, logger) -> (int, dict):
if not url.startswith('https://mp.weixin.qq.com') and not url.startswith('http://mp.weixin.qq.com'):
logger.warning(f'{url} is not a mp url, you should not use this function')
return -5, {}
url = url.replace("http://", "https://", 1)
try:
with httpx.Client() as client:
response = client.get(url, headers=header, timeout=30)
except Exception as e:
logger.warning(f"cannot get content from {url}\n{e}")
return -7, {}
async with httpx.AsyncClient() as client:
for retry in range(2):
try:
response = await client.get(url, headers=header, timeout=30)
response.raise_for_status()
break
except Exception as e:
if retry < 1:
logger.info(f"request {url} got error {e}\nwaiting 1min")
await asyncio.sleep(60)
else:
logger.warning(f"request {url} got error {e}")
return -7, {}
soup = BeautifulSoup(response.text, 'html.parser')
soup = BeautifulSoup(response.text, 'html.parser')
# Get the original release date first
pattern = r"var createTime = '(\d{4}-\d{2}-\d{2}) \d{2}:\d{2}'"
match = re.search(pattern, response.text)
# Get the original release date first
pattern = r"var createTime = '(\d{4}-\d{2}-\d{2}) \d{2}:\d{2}'"
match = re.search(pattern, response.text)
if match:
date_only = match.group(1)
publish_time = date_only.replace('-', '')
else:
publish_time = datetime.strftime(datetime.today(), "%Y%m%d")
if match:
date_only = match.group(1)
publish_time = date_only.replace('-', '')
else:
publish_time = datetime.strftime(datetime.today(), "%Y%m%d")
# Get description content from < meta > tag
try:
meta_description = soup.find('meta', attrs={'name': 'description'})
summary = meta_description['content'].strip() if meta_description else ''
card_info = soup.find('div', id='img-content')
# Parse the required content from the < div > tag
rich_media_title = soup.find('h1', id='activity-name').text.strip() \
if soup.find('h1', id='activity-name') \
else soup.find('h1', class_='rich_media_title').text.strip()
profile_nickname = card_info.find('strong', class_='profile_nickname').text.strip() \
if card_info \
else soup.find('div', class_='wx_follow_nickname').text.strip()
except Exception as e:
logger.warning(f"not mp format: {url}\n{e}")
return -7, {}
# Get description content from < meta > tag
try:
meta_description = soup.find('meta', attrs={'name': 'description'})
summary = meta_description['content'].strip() if meta_description else ''
card_info = soup.find('div', id='img-content')
# Parse the required content from the < div > tag
rich_media_title = soup.find('h1', id='activity-name').text.strip() \
if soup.find('h1', id='activity-name') \
else soup.find('h1', class_='rich_media_title').text.strip()
profile_nickname = card_info.find('strong', class_='profile_nickname').text.strip() \
if card_info \
else soup.find('div', class_='wx_follow_nickname').text.strip()
except Exception as e:
logger.warning(f"not mp format: {url}\n{e}")
# For mp.weixin.qq.com types, mp_crawler won't work, and most likely neither will the other two
return -7, {}
if not rich_media_title or not profile_nickname:
logger.warning(f"failed to analysis {url}, no title or profile_nickname")
# For mp.weixin.qq.com types, mp_crawler won't work, and most likely neither will the other two
return -7, {}
if not rich_media_title or not profile_nickname:
logger.warning(f"failed to analysis {url}, no title or profile_nickname")
return -7, {}
# Parse text and image links within the content interval
# TODO: this scheme is compatible with picture-sharing MP articles, but their in-content images cannot be extracted,
# because that part of the page has a completely different structure and needs a separate parsing scheme
# (the proportion of such articles is not high, though).
texts = []
images = set()
content_area = soup.find('div', id='js_content')
if content_area:
# extract text
for section in content_area.find_all(['section', 'p'], recursive=False):  # iterate over top-level sections
text = section.get_text(separator=' ', strip=True)
if text and text not in texts:
texts.append(text)
# Parse text and image links within the content interval
# TODO: this scheme is compatible with picture-sharing MP articles, but their in-content images cannot be extracted,
# because that part of the page has a completely different structure and needs a separate parsing scheme
# (the proportion of such articles is not high, though).
texts = []
images = set()
content_area = soup.find('div', id='js_content')
if content_area:
# extract text
for section in content_area.find_all(['section', 'p'], recursive=False):  # iterate over top-level sections
text = section.get_text(separator=' ', strip=True)
if text and text not in texts:
texts.append(text)
for img in content_area.find_all('img', class_='rich_pages wxw-img'):
img_src = img.get('data-src') or img.get('src')
if img_src:
images.add(img_src)
cleaned_texts = [t for t in texts if t.strip()]
content = '\n'.join(cleaned_texts)
else:
logger.warning(f"failed to analysis contents {url}")
return 0, {}
if content:
content = f"[from {profile_nickname}]{content}"
else:
# If there is no content but there is a summary, this is a picture-sharing type of mp article;
# in that case the summary can be used as the content.
content = f"[from {profile_nickname}]{summary}"
for img in content_area.find_all('img', class_='rich_pages wxw-img'):
img_src = img.get('data-src') or img.get('src')
if img_src:
images.add(img_src)
cleaned_texts = [t for t in texts if t.strip()]
content = '\n'.join(cleaned_texts)
else:
logger.warning(f"failed to analysis contents {url}")
return 0, {}
if content:
content = f"[from {profile_nickname}]{content}"
else:
# If there is no content but there is a summary, this is a picture-sharing type of mp article;
# in that case the summary can be used as the content.
content = f"[from {profile_nickname}]{summary}"
# Get image links from the meta property="og:image" and meta property="twitter:image" tags
og_image = soup.find('meta', property='og:image')
twitter_image = soup.find('meta', property='twitter:image')
if og_image:
images.add(og_image['content'])
if twitter_image:
images.add(twitter_image['content'])
# Get image links from the meta property="og:image" and meta property="twitter:image" tags
og_image = soup.find('meta', property='og:image')
twitter_image = soup.find('meta', property='twitter:image')
if og_image:
images.add(og_image['content'])
if twitter_image:
images.add(twitter_image['content'])
if rich_media_title == summary or not summary:
abstract = ''
else:
abstract = f"[from {profile_nickname}]{rich_media_title}——{summary}"
if rich_media_title == summary or not summary:
abstract = ''
else:
abstract = f"[from {profile_nickname}]{rich_media_title}——{summary}"
return 11, {
'title': rich_media_title,

View File

@ -1,10 +1,13 @@
# -*- coding: utf-8 -*-
from gne import GeneralNewsExtractor
import httpx
from bs4 import BeautifulSoup
from datetime import datetime
from ..utils.general_utils import extract_and_convert_dates
from utils.general_utils import extract_and_convert_dates
import chardet
from urllib.parse import urlparse
import asyncio
extractor = GeneralNewsExtractor()
@ -12,54 +15,63 @@ header = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}
def simple_crawler(url: str, logger) -> (int, dict):
async def simple_crawler(url: str, logger) -> (int, dict):
"""
Return a flag and an article-information dict: a negative flag means error, 0 means no result, 11 means success
"""
try:
with httpx.Client() as client:
response = client.get(url, headers=header, timeout=30)
rawdata = response.content
encoding = chardet.detect(rawdata)['encoding']
text = rawdata.decode(encoding)
async with httpx.AsyncClient() as client:
for retry in range(2):
try:
response = await client.get(url, headers=header, timeout=30)
response.raise_for_status()
break
except Exception as e:
if retry < 1:
logger.info(f"request {url} got error {e}\nwaiting 1min")
await asyncio.sleep(60)
else:
logger.warning(f"request {url} got error {e}")
return -7, {}
rawdata = response.content
encoding = chardet.detect(rawdata)['encoding']
text = rawdata.decode(encoding, errors='replace')
result = extractor.extract(text)
except Exception as e:
logger.warning(f"cannot get content from {url}\n{e}")
return -7, {}
if not result:
logger.error(f"gne cannot extract {url}")
return 0, {}
if not result:
logger.error(f"gne cannot extract {url}")
return 0, {}
if len(result['title']) < 4 or len(result['content']) < 24:
logger.info(f"{result} not valid")
return 0, {}
if len(result['title']) < 4 or len(result['content']) < 24:
logger.info(f"{result} not valid")
return 0, {}
if result['title'].startswith('服务器错误') or result['title'].startswith('您访问的页面') or result[
'title'].startswith('403') \
or result['content'].startswith('This website uses cookies') or result['title'].startswith('出错了'):
logger.warning(f"can not get {url} from the Internet")
return -7, {}
if result['title'].startswith('服务器错误') or result['title'].startswith('您访问的页面') or result['title'].startswith('403')\
or result['content'].startswith('This website uses cookies') or result['title'].startswith('出错了'):
logger.warning(f"can not get {url} from the Internet")
return -7, {}
date_str = extract_and_convert_dates(result['publish_time'])
if date_str:
result['publish_time'] = date_str
else:
result['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")
from_site = urlparse(url).netloc
from_site = from_site.replace('www.', '')
from_site = from_site.split('.')[0]
result['content'] = f"[from {from_site}] {result['content']}"
soup = BeautifulSoup(text, "html.parser")
try:
meta_description = soup.find("meta", {"name": "description"})
if meta_description:
result['abstract'] = f"[from {from_site}] {meta_description['content'].strip()}"
date_str = extract_and_convert_dates(result['publish_time'])
if date_str:
result['publish_time'] = date_str
else:
result['abstract'] = ''
except Exception:
result['abstract'] = ''
result['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")
from_site = urlparse(url).netloc
from_site = from_site.replace('www.', '')
from_site = from_site.split('.')[0]
result['content'] = f"[from {from_site}] {result['content']}"
soup = BeautifulSoup(text, "html.parser")
try:
meta_description = soup.find("meta", {"name": "description"})
if meta_description:
result['abstract'] = f"[from {from_site}] {meta_description['content'].strip()}"
else:
result['abstract'] = ''
except:
result['abstract'] = ''
result['url'] = url
result['url'] = url
return 11, result

View File

@ -1,9 +1,4 @@
"""mostly copy from https://github.com/netease-youdao/QAnything
awsome work!
"""
# import traceback
from urllib.parse import urlparse
import time
import os
import re
import jieba
@ -18,131 +13,39 @@ def extract_urls(text):
url_pattern = re.compile(r'https?://[-A-Za-z0-9+&@#/%?=~_|!:.;]+[-A-Za-z0-9+&@#/%=~_|]')
urls = re.findall(url_pattern, text)
# 过滤掉那些只匹配到 'www.' 而没有后续内容的情况并尝试为每个URL添加默认的http协议前缀以便解析
# Filter out cases that only match 'www.' without any subsequent content,
# and try to add the default http protocol prefix to each URL for easier parsing
cleaned_urls = [url for url in urls if isURL(url)]
return cleaned_urls
def isChinesePunctuation(char):
# 定义中文标点符号的Unicode编码范围
# Define the Unicode encoding range for Chinese punctuation marks
chinese_punctuations = set(range(0x3000, 0x303F)) | set(range(0xFF00, 0xFFEF))
# 检查字符是否在上述范围内
# Check if the character is within the above range
return ord(char) in chinese_punctuations
def get_time(func):
def inner(*arg, **kwargs):
s_time = time.time()
res = func(*arg, **kwargs)
e_time = time.time()
print('function {} execution time: {}'.format(func.__name__, e_time - s_time))
return res
return inner
'''
def safe_get(req: Request, attr: str, default=None):
try:
if attr in req.form:
return req.form.getlist(attr)[0]
if attr in req.args:
return req.args[attr]
if attr in req.json:
return req.json[attr]
# if value := req.form.get(attr):
# return value
# if value := req.args.get(attr):
# return value
# """req.json执行时不校验content-typebody字段可能不能被正确解析为json"""
# if value := req.json.get(attr):
# return value
except BadRequest:
logging.warning(f"missing {attr} in request")
except Exception as e:
logging.warning(f"get {attr} from request failed:")
logging.warning(traceback.format_exc())
return default
'''
def truncate_filename(filename, max_length=200):
# get the file extension
file_ext = os.path.splitext(filename)[1]
# get the file name without the extension
file_name_no_ext = os.path.splitext(filename)[0]
# compute the file name length in bytes (Chinese characters take multiple bytes)
filename_length = len(filename.encode('utf-8'))
# if the file name exceeds the maximum length limit
if filename_length > max_length:
# generate a timestamp marker
timestamp = str(int(time.time()))
# compute the remaining length available for the file name
remaining_length = max_length - len(file_ext) - len(timestamp) - 1  # -1 for the underscore
# truncate the file name and append the timestamp
file_name_no_ext = file_name_no_ext[:remaining_length]
new_filename = file_name_no_ext + '_' + timestamp + file_ext
else:
new_filename = filename
return new_filename
def read_files_with_extensions():
# get the path of the current script file
current_file = os.path.abspath(__file__)
# get the directory containing the current script file
current_dir = os.path.dirname(current_file)
# get the project root directory
project_dir = os.path.dirname(current_dir)
directory = project_dir + '/data'
print(f'now reading {directory}')
extensions = ['.md', '.txt', '.pdf', '.jpg', '.docx', '.xlsx', '.eml', '.csv']
for root, dirs, files in os.walk(directory):
for file in files:
if file.endswith(tuple(extensions)):
file_path = os.path.join(root, file)
yield file_path
def validate_user_id(user_id):
# define the regular expression pattern
pattern = r'^[A-Za-z][A-Za-z0-9_]*$'
# check whether it matches
if isinstance(user_id, str) and re.match(pattern, user_id):
return True
else:
return False
def is_chinese(string):
"""
Volcano Engine could actually support broader language detection; consider https://www.volcengine.com/docs/4640/65066 in the future
Determine whether most of the string is Chinese
:param string: {str} 需要检测的字符串
:return: {bool} 如果大部分是中文返回True否则返回False
:param string: {str} The string to be detected
:return: {bool} Returns True if most are Chinese, False otherwise
"""
pattern = re.compile(r'[^\u4e00-\u9fa5]')
non_chinese_count = len(pattern.findall(string))
# 严格按照字节数量小于一半判断容易误判,英文单词占字节较大,且还有标点符号等
# Judging strictly by whether non-Chinese characters make up less than half is error-prone:
# English words take up many characters, and there is also punctuation, etc.
return (non_chinese_count/len(string)) < 0.68
def extract_and_convert_dates(input_string):
# regular expressions matching different date formats
patterns = [
r'(\d{4})-(\d{2})-(\d{2})', # 匹配YYYY-MM-DD格式
r'(\d{4})/(\d{2})/(\d{2})', # 匹配YYYY/MM/DD格式
r'(\d{4})\.(\d{2})\.(\d{2})', # 匹配YYYY.MM.DD格式
r'(\d{4})\\(\d{2})\\(\d{2})', # 匹配YYYY\MM\DD格式
r'(\d{4})(\d{2})(\d{2})' # 匹配YYYYMMDD格式
r'(\d{4})-(\d{2})-(\d{2})', # YYYY-MM-DD
r'(\d{4})/(\d{2})/(\d{2})', # YYYY/MM/DD
r'(\d{4})\.(\d{2})\.(\d{2})', # YYYY.MM.DD
r'(\d{4})\\(\d{2})\\(\d{2})', # YYYY\MM\DD
r'(\d{4})(\d{2})(\d{2})' # YYYYMMDD
]
matches = []
@ -174,62 +77,21 @@ def get_logger_level() -> str:
def compare_phrase_with_list(target_phrase, phrase_list, threshold):
"""
比较一个目标短语与短语列表中每个短语的相似度
Compare the similarity of a target phrase to each phrase in the phrase list.
:param target_phrase: 目标短语 (str)
:param phrase_list: 短语列表 (list of str)
:param threshold: 相似度阈值 (float)
:return: 满足相似度条件的短语列表 (list of str)
:param target_phrase: target phrase (str)
:param phrase_list: list of candidate phrases (list of str)
:param threshold: similarity threshold (float)
:return: list of phrases that satisfy the similarity condition (list of str)
"""
# check whether the target phrase is empty
if not target_phrase:
return [] # 目标短语为空,直接返回空列表
return [] # The target phrase is empty, and the empty list is returned directly.
# 预处理:对目标短语和短语列表中的每个短语进行分词
# Preprocessing: Segmentation of the target phrase and each phrase in the phrase list
target_tokens = set(jieba.lcut(target_phrase))
tokenized_phrases = {phrase: set(jieba.lcut(phrase)) for phrase in phrase_list}
# compare and filter
similar_phrases = [phrase for phrase, tokens in tokenized_phrases.items()
if len(target_tokens & tokens) / min(len(target_tokens), len(tokens)) > threshold]
return similar_phrases
"""
# from InternLM/huixiangdou
# another awesome work
def process_strings(self, str1, replacement, str2):
'''Find the longest common suffix of str1 and prefix of str2.'''
shared_substring = ''
for i in range(1, min(len(str1), len(str2)) + 1):
if str1[-i:] == str2[:i]:
shared_substring = str1[-i:]
# If there is a common substring, replace one of them with the replacement string and concatenate # noqa E501
if shared_substring:
return str1[:-len(shared_substring)] + replacement + str2
# Otherwise, just return str1 + str2
return str1 + str2
def clean_md(self, text: str):
'''Remove parts of the markdown document that do not contain the key
question words, such as code blocks, URL links, etc.'''
# remove ref
pattern_ref = r'\[(.*?)\]\(.*?\)'
new_text = re.sub(pattern_ref, r'\1', text)
# remove code block
pattern_code = r'```.*?```'
new_text = re.sub(pattern_code, '', new_text, flags=re.DOTALL)
# remove underline
new_text = re.sub('_{5,}', '', new_text)
# remove table
# new_text = re.sub('\|.*?\|\n\| *\:.*\: *\|.*\n(\|.*\|.*\n)*', '', new_text, flags=re.DOTALL) # noqa E501
# use lower
new_text = new_text.lower()
return new_text
"""

View File

@ -7,7 +7,7 @@ from typing import BinaryIO
class PbTalker:
def __init__(self, logger) -> None:
# 1. base initialization
url = "http://127.0.0.1:5882"
url = os.environ.get('PB_API_BASE', "http://127.0.0.1:8090")
self.logger = logger
self.logger.debug(f"initializing pocketbase client: {url}")
self.client = PocketBase(url)
@ -82,7 +82,7 @@ class PbTalker:
def view(self, collection_name: str, item_id: str, fields: list[str] = None) -> dict:
try:
res = self.client.collection(collection_name).get_one(item_id,{"fields": ','.join(fields) if fields else ''})
res = self.client.collection(collection_name).get_one(item_id, {"fields": ','.join(fields) if fields else ''})
return vars(res)
except Exception as e:
self.logger.error(f"pocketbase view item failed: {e}")

View File

@ -1,7 +1,7 @@
import random
import re
import os
from backend.llms.dashscope_wrapper import dashscope_llm
from core.backend import dashscope_llm
from docx import Document
from docx.oxml.ns import qn
from docx.shared import Pt, RGBColor

10
env_sample Executable file
View File

@ -0,0 +1,10 @@
export LLM_API_KEY=""
export LLM_API_BASE="https://api.siliconflow.cn/v1" ##for local model services or calling non-OpenAI services with openai_wrapper
##strongly recommended to use the following models provided by siliconflow (a good balance of quality and price)
export GET_INFO_MODEL="zhipuai/glm4-9B-chat"
export REWRITE_MODEL="alibaba/Qwen2-7B-Instruct"
export HTML_PARSE_MODEL="deepseek-ai/deepseek-v2-chat"
export PROJECT_DIR="work_dir
export PB_API_AUTH="test@example.com|123467890"
export "PB_API_BASE"="" ##only use if your pb not run on 127.0.0.1:8090
export WS_LOG="verbose" ##for detail log info. If not need, just delete this item.

View File

@ -1,89 +0,0 @@
import os
from pocketbase import PocketBase # Client also works the same
from pocketbase.client import FileUpload
from typing import BinaryIO
class PbTalker:
def __init__(self, logger) -> None:
# 1. base initialization
url = "http://127.0.0.1:5882"
self.logger = logger
self.logger.debug(f"initializing pocketbase client: {url}")
self.client = PocketBase(url)
auth = os.environ.get('PB_API_AUTH', '')
if not auth or "|" not in auth:
self.logger.warning("invalid email|password found, will proceed without auth; make sure the collection rules allow access by anyone")
else:
email, password = auth.split('|')
try:
admin_data = self.client.admins.auth_with_password(email, password)
if admin_data:
self.logger.info(f"pocketbase ready authenticated as admin - {email}")
except:
user_data = self.client.collection("users").auth_with_password(email, password)
if user_data:
self.logger.info(f"pocketbase ready authenticated as user - {email}")
else:
raise Exception("pocketbase auth failed")
def read(self, collection_name: str, fields: list[str] = None, filter: str = '', skiptotal: bool = True) -> list:
results = []
for i in range(1, 10):
try:
res = self.client.collection(collection_name).get_list(i, 500,
{"filter": filter,
"fields": ','.join(fields) if fields else '',
"skiptotal": skiptotal})
except Exception as e:
self.logger.error(f"pocketbase get list failed: {e}")
continue
if not res.items:
break
for _res in res.items:
attributes = vars(_res)
results.append(attributes)
return results
def add(self, collection_name: str, body: dict) -> str:
try:
res = self.client.collection(collection_name).create(body)
except Exception as e:
self.logger.error(f"pocketbase create failed: {e}")
return ''
return res.id
def update(self, collection_name: str, id: str, body: dict) -> str:
try:
res = self.client.collection(collection_name).update(id, body)
except Exception as e:
self.logger.error(f"pocketbase update failed: {e}")
return ''
return res.id
def delete(self, collection_name: str, id: str) -> str:
try:
res = self.client.collection(collection_name).delete(id)
except Exception as e:
self.logger.error(f"pocketbase update failed: {e}")
return 'failed'
if res:
return 'success'
return 'failed'
def upload(self, collection_name: str, id: str, key: str, file_name: str, file: BinaryIO) -> str:
try:
res = self.client.collection(collection_name).update(id, {key: FileUpload((file_name, file))})
except Exception as e:
self.logger.error(f"pocketbase update failed: {e}")
return ''
return res.id
def view(self, collection_name: str, item_id: str, fields: list[str] = None) -> dict:
try:
res = self.client.collection(collection_name).get_one(item_id,{"fields": ','.join(fields) if fields else ''})
return vars(res)
except Exception as e:
self.logger.error(f"pocketbase view item failed: {e}")
return {}