mirror of
https://github.com/TeamWiseFlow/wiseflow.git
synced 2025-01-23 10:50:25 +08:00
scrapers updated
This commit is contained in:
parent
b1dad1533f
commit
31411cd8f4
45
core/backend.py
Normal file
45
core/backend.py
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
from fastapi import FastAPI, BackgroundTasks
|
||||||
|
from pydantic import BaseModel
|
||||||
|
from typing import Literal, Optional
|
||||||
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
|
from insights import pipeline
|
||||||
|
|
||||||
|
|
||||||
|
class Request(BaseModel):
    """
    Input model for the ``/feed`` endpoint.

    input = {'user_id': str, 'type': str, 'content': str, 'addition': Optional[str]}

    ``type`` is validated against the full list of message kinds declared on
    the field below. Of those, ``pipeline`` only acts on "text", "publicMsg",
    "site" and "url"; a request with any other accepted type is received and
    then ignored.
    """
    user_id: str
    # Message kind; see the class docstring for the values that are actually processed.
    type: Literal["text", "publicMsg", "file", "image", "video", "location", "chathistory", "site", "attachment", "url"]
    content: str
    # Optional extra payload; pipeline logs it as the MsgSvrID of the task.
    addition: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
# ASGI application object; docker_entrypoint.sh serves it as ``backend:app``.
app = FastAPI(
    title="WiseFlow Union Backend",
    description="From Wiseflow Team.",
    version="0.1.1",
    openapi_url="/openapi.json",
)

# Cross-origin policy: every origin, method and header is accepted.
# NOTE(review): wildcard origins combined with allow_credentials=True is
# discouraged for credentialed requests — confirm whether cookies/credentials
# are needed at all, and list explicit origins if they are.
app.add_middleware(
    CORSMiddleware,
    allow_credentials=True,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/")
def read_root():
    """Return a greeting that names the backend and its version."""
    return {"msg": "Hello, this is Wise Union Backend, version 0.1.1"}
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/feed")
async def call_to_feed(background_tasks: BackgroundTasks, request: Request):
    """Queue the posted item for ``pipeline`` and acknowledge straight away.

    The validated request is dumped to a plain dict and handed to ``pipeline``
    as its ``_input`` keyword. The work is registered on ``background_tasks``
    instead of being awaited here, so the response does not wait for it.
    """
    payload = request.model_dump()
    background_tasks.add_task(pipeline, _input=payload)
    return {"msg": "received well"}
|
47
core/dm.py
47
core/dm.py
@ -1,47 +0,0 @@
|
|||||||
import asyncio
|
|
||||||
import websockets
|
|
||||||
import concurrent.futures
|
|
||||||
import json
|
|
||||||
from insights import pipeline
|
|
||||||
|
|
||||||
|
|
||||||
async def get_public_msg():
    """Listen on the local publicMsg websocket and feed each message to ``pipeline``.

    Every frame is parsed as JSON and its ``"data"`` list is walked. Entries
    whose ``IsSender`` is not ``"0"`` are printed and skipped; the others are
    turned into a ``publicMsg`` input dict and run through ``pipeline`` on a
    worker thread, one at a time.

    When the connection is closed with an error the function waits 5 seconds
    and reconnects, giving up after ``max_reconnect_attempts`` consecutive
    failures. Any other exception ends the loop immediately.
    """
    uri = "ws://127.0.0.1:8066/ws/publicMsg"
    reconnect_attempts = 0
    max_reconnect_attempts = 3  # maximum number of consecutive reconnects; adjust as needed

    while True:
        try:
            async with websockets.connect(uri, max_size=10 * 1024 * 1024) as websocket:
                # Connected again: a later drop gets a fresh retry budget. Without
                # this reset, three drops over the whole process lifetime would
                # stop the listener for good.
                reconnect_attempts = 0
                loop = asyncio.get_running_loop()
                with concurrent.futures.ThreadPoolExecutor() as pool:
                    while True:
                        response = await websocket.recv()
                        datas = json.loads(response)
                        for data in datas["data"]:
                            if data["IsSender"] != "0":
                                print('self-send message, pass')
                                print(data)
                                continue
                            input_data = {
                                "user_id": data["StrTalker"],
                                "type": "publicMsg",
                                "content": data["Content"],
                                "addition": data["MsgSvrID"]
                            }
                            # NOTE(review): assumes pipeline is a plain blocking
                            # function, since it is run on the thread pool — confirm.
                            await loop.run_in_executor(pool, pipeline, input_data)
        except websockets.exceptions.ConnectionClosedError as e:
            print(f"Connection closed with exception: {e}")
            reconnect_attempts += 1
            if reconnect_attempts <= max_reconnect_attempts:
                print(f"Reconnecting attempt {reconnect_attempts}...")
                await asyncio.sleep(5)  # wait a while before retrying
            else:
                print("Max reconnect attempts reached. Exiting.")
                break
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            break
|
|
||||||
|
|
||||||
# 使用asyncio事件循环运行get_public_msg coroutine
|
|
||||||
asyncio.run(get_public_msg())
|
|
7
core/docker_entrypoint.sh
Executable file
7
core/docker_entrypoint.sh
Executable file
@ -0,0 +1,7 @@
|
|||||||
|
#!/bin/bash
# Load ../.env with allexport switched on, so every variable it defines is
# exported to the server process started below.
set -o allexport
. ../.env
set +o allexport

# Serve the FastAPI app (backend:app) on localhost:8077 with auto-reload.
uvicorn backend:app --host localhost --port 8077 --reload
#exec uvicorn backend:app --reload --host localhost --port 8077 &
#exec python background_task.py
|
@ -1,12 +1,13 @@
|
|||||||
from ..scrapers import *
|
# -*- coding: utf-8 -*-
|
||||||
from ..utils.general_utils import extract_urls, compare_phrase_with_list
|
|
||||||
|
from scrapers import *
|
||||||
|
from utils.general_utils import extract_urls, compare_phrase_with_list
|
||||||
from .get_info import get_info, pb, project_dir, logger, info_rewrite
|
from .get_info import get_info, pb, project_dir, logger, info_rewrite
|
||||||
import os
|
import os
|
||||||
import json
|
import json
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
import re
|
import re
|
||||||
import time
|
|
||||||
|
|
||||||
|
|
||||||
# The XML parsing scheme is not used because there are abnormal characters in the XML code extracted from the weixin public_msg
|
# The XML parsing scheme is not used because there are abnormal characters in the XML code extracted from the weixin public_msg
|
||||||
@ -18,11 +19,49 @@ expiration_days = 3
|
|||||||
existing_urls = [url['url'] for url in pb.read(collection_name='articles', fields=['url']) if url['url']]
|
existing_urls = [url['url'] for url in pb.read(collection_name='articles', fields=['url']) if url['url']]
|
||||||
|
|
||||||
|
|
||||||
def pipeline(_input: dict):
|
async def get_articles(urls: list[str], expiration: datetime, cache: "dict | None" = None) -> list[dict]:
    """Fetch each URL and return the articles that parsed and are recent enough.

    Args:
        urls: article URLs to fetch, processed in order.
        expiration: cut-off moment; an article whose ``publish_time`` (read as
            an integer ``YYYYMMDD``) is earlier than this date is skipped.
        cache: optional mapping ``url -> {field: value}``. For a fetched URL,
            every truthy value overrides the same field of the crawled result.

    Returns:
        The article dicts that survived fetching and the date filter, in the
        order of ``urls``.
    """
    # None instead of a shared ``{}`` default: a mutable default is created
    # once and reused by every call.
    if cache is None:
        cache = {}

    # Loop-invariant: the cut-off as an integer YYYYMMDD, directly comparable
    # with the integer form of publish_time.
    oldest_allowed = int(expiration.strftime('%Y%m%d'))

    articles = []
    for url in urls:
        logger.debug(f"fetching {url}")

        if url.startswith(('https://mp.weixin.qq.com', 'http://mp.weixin.qq.com')):
            flag, result = await mp_crawler(url, logger)
        else:
            flag, result = await simple_crawler(url, logger)

        if flag == -7:
            # -7 means cannot fetch the html, and other crawlers have no effect.
            continue

        if flag != 11:
            # Any flag other than 11 is treated as a failed parse: fall back to
            # the LLM crawler and give up on this URL if that fails as well.
            flag, result = await llm_crawler(url, logger)
            if flag != 11:
                continue

        article_date = int(result['publish_time'])
        if article_date < oldest_allowed:
            logger.info(f"publish date is {article_date}, too old, skip")
            continue

        # Values supplied by the caller win over what the crawler extracted.
        for k, v in cache.get(url, {}).items():
            if v:
                result[k] = v

        articles.append(result)

    return articles
|
||||||
|
|
||||||
|
|
||||||
|
async def pipeline(_input: dict):
|
||||||
cache = {}
|
cache = {}
|
||||||
source = _input['user_id'].split('@')[-1]
|
source = _input['user_id'].split('@')[-1]
|
||||||
logger.debug(f"received new task, user: {source}, MsgSvrID: {_input['addition']}")
|
logger.debug(f"received new task, user: {source}, MsgSvrID: {_input['addition']}")
|
||||||
|
|
||||||
|
global existing_urls
|
||||||
|
expiration_date = datetime.now() - timedelta(days=expiration_days)
|
||||||
|
|
||||||
if _input['type'] == 'publicMsg':
|
if _input['type'] == 'publicMsg':
|
||||||
items = item_pattern.findall(_input["content"])
|
items = item_pattern.findall(_input["content"])
|
||||||
# Iterate through all < item > content, extracting < url > and < summary >
|
# Iterate through all < item > content, extracting < url > and < summary >
|
||||||
@ -37,73 +76,57 @@ def pipeline(_input: dict):
|
|||||||
cut_off_point = url.find('chksm=')
|
cut_off_point = url.find('chksm=')
|
||||||
if cut_off_point != -1:
|
if cut_off_point != -1:
|
||||||
url = url[:cut_off_point-1]
|
url = url[:cut_off_point-1]
|
||||||
|
if url in existing_urls:
|
||||||
|
logger.debug(f"{url} has been crawled, skip")
|
||||||
|
continue
|
||||||
if url in cache:
|
if url in cache:
|
||||||
logger.debug(f"{url} already find in item")
|
logger.debug(f"{url} already find in item")
|
||||||
continue
|
continue
|
||||||
summary_match = summary_pattern.search(item)
|
summary_match = summary_pattern.search(item)
|
||||||
summary = summary_match.group(1) if summary_match else None
|
summary = summary_match.group(1) if summary_match else None
|
||||||
cache[url] = summary
|
cache[url] = {'source': source, 'abstract': summary}
|
||||||
urls = list(cache.keys())
|
articles = await get_articles(list(cache.keys()), expiration_date, cache)
|
||||||
|
|
||||||
|
elif _input['type'] == 'site':
|
||||||
|
# for the site url, Usually an article list page or a website homepage
|
||||||
|
# need to get the article list page
|
||||||
|
# You can use a general scraper, or you can customize a site-specific crawler, see scrapers/README_CN.md
|
||||||
|
urls = extract_urls(_input['content'])
|
||||||
|
if not urls:
|
||||||
|
logger.debug(f"can not find any url in\n{_input['content']}")
|
||||||
|
return
|
||||||
|
articles = []
|
||||||
|
for url in urls:
|
||||||
|
parsed_url = urlparse(url)
|
||||||
|
domain = parsed_url.netloc
|
||||||
|
if domain in scraper_map:
|
||||||
|
result = scraper_map[domain](url, logger)
|
||||||
|
else:
|
||||||
|
result = await general_scraper(url, expiration_date.date(), existing_urls, logger)
|
||||||
|
articles.extend(result)
|
||||||
|
|
||||||
elif _input['type'] == 'text':
|
elif _input['type'] == 'text':
|
||||||
urls = extract_urls(_input['content'])
|
urls = extract_urls(_input['content'])
|
||||||
if not urls:
|
if not urls:
|
||||||
logger.debug(f"can not find any url in\n{_input['content']}\npass...")
|
logger.debug(f"can not find any url in\n{_input['content']}\npass...")
|
||||||
return
|
return
|
||||||
|
articles = await get_articles(urls, expiration_date)
|
||||||
|
|
||||||
elif _input['type'] == 'url':
|
elif _input['type'] == 'url':
|
||||||
urls = []
|
# this is remained for wechat shared mp_article_card
|
||||||
pass
|
# todo will do it in project awada (need finish the generalMsg api first)
|
||||||
|
articles = []
|
||||||
else:
|
else:
|
||||||
return
|
return
|
||||||
|
|
||||||
global existing_urls
|
for article in articles:
|
||||||
|
if article['url'] in existing_urls:
|
||||||
for url in urls:
|
# For the case of entering multiple sites at the same time,
|
||||||
if url in existing_urls:
|
# there is indeed a situation where duplicate articles are mixed into the same batch
|
||||||
logger.debug(f"{url} has been crawled, skip")
|
logger.debug(f"{article['url']} duplicated, skip")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
logger.debug(f"fetching {url}")
|
|
||||||
if url.startswith('https://mp.weixin.qq.com') or url.startswith('http://mp.weixin.qq.com'):
|
|
||||||
flag, article = mp_crawler(url, logger)
|
|
||||||
if flag == -7:
|
|
||||||
# For mp crawlers, the high probability of -7 is limited by WeChat, just wait 1min.
|
|
||||||
logger.info(f"fetch {url} failed, try to wait 1min and try again")
|
|
||||||
time.sleep(60)
|
|
||||||
flag, article = mp_crawler(url, logger)
|
|
||||||
else:
|
|
||||||
parsed_url = urlparse(url)
|
|
||||||
domain = parsed_url.netloc
|
|
||||||
if domain in scraper_map:
|
|
||||||
flag, article = scraper_map[domain](url, logger)
|
|
||||||
else:
|
|
||||||
flag, article = simple_crawler(url, logger)
|
|
||||||
|
|
||||||
if flag == -7:
|
|
||||||
# -7 means that the network is different, and other crawlers have no effect.
|
|
||||||
logger.info(f"cannot fetch {url}")
|
|
||||||
continue
|
|
||||||
|
|
||||||
if flag != 11:
|
|
||||||
logger.info(f"{url} failed with mp_crawler and simple_crawler")
|
|
||||||
flag, article = llm_crawler(url, logger)
|
|
||||||
if flag != 11:
|
|
||||||
logger.info(f"{url} failed with llm_crawler")
|
|
||||||
continue
|
|
||||||
|
|
||||||
expiration_date = datetime.now() - timedelta(days=expiration_days)
|
|
||||||
expiration_date = expiration_date.strftime('%Y-%m-%d')
|
|
||||||
article_date = int(article['publish_time'])
|
|
||||||
if article_date < int(expiration_date.replace('-', '')):
|
|
||||||
logger.info(f"publish date is {article_date}, too old, skip")
|
|
||||||
continue
|
|
||||||
|
|
||||||
article['source'] = source
|
|
||||||
if cache[url]:
|
|
||||||
article['abstract'] = cache[url]
|
|
||||||
|
|
||||||
insights = get_info(f"title: {article['title']}\n\ncontent: {article['content']}")
|
insights = get_info(f"title: {article['title']}\n\ncontent: {article['content']}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
article_id = pb.add(collection_name='articles', body=article)
|
article_id = pb.add(collection_name='articles', body=article)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@ -112,7 +135,7 @@ def pipeline(_input: dict):
|
|||||||
json.dump(article, f, ensure_ascii=False, indent=4)
|
json.dump(article, f, ensure_ascii=False, indent=4)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
existing_urls.append(url)
|
existing_urls.append(article['url'])
|
||||||
|
|
||||||
if not insights:
|
if not insights:
|
||||||
continue
|
continue
|
||||||
|
@ -1,9 +1,9 @@
|
|||||||
from ..llms.openai_wrapper import openai_llm
|
from llms.openai_wrapper import openai_llm
|
||||||
# from ..llms.siliconflow_wrapper import sfa_llm
|
# from llms.siliconflow_wrapper import sfa_llm
|
||||||
import re
|
import re
|
||||||
from ..utils.general_utils import get_logger_level
|
from utils.general_utils import get_logger_level
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from ..utils.pb_api import PbTalker
|
from utils.pb_api import PbTalker
|
||||||
import os
|
import os
|
||||||
import locale
|
import locale
|
||||||
|
|
||||||
|
@ -1,13 +1,11 @@
|
|||||||
# for developer
|
# for developer
|
||||||
|
|
||||||
如果你只是用户,无需关注这个文件夹。
|
Download PocketBase first (see https://pocketbase.io/docs/).
|
||||||
|
|
||||||
对于python开发者,请使用 backend/pb_api.py 模块进行数据库操作
|
|
||||||
|
|
||||||
对于js开发者,可以直接启动数据库后,在数据库各个collection页面中的api详情查看接口说明
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
cd pb
|
cd pb
|
||||||
./pocketbase --dev admin create test@example.com 123467890 #如果没有初始账号,请用这个命令创建
|
xattr -d com.apple.quarantine pocketbase # for Macos
|
||||||
|
./pocketbase migrate up # for first run
|
||||||
|
./pocketbase --dev admin create test@example.com 123467890 # If you don't have an initial account, please use this command to create it
|
||||||
./pocketbase serve
|
./pocketbase serve
|
||||||
```
|
```
|
@ -1,135 +0,0 @@
|
|||||||
/// <reference path="../pb_data/types.d.ts" />
|
|
||||||
migrate((db) => {
|
|
||||||
const collection = new Collection({
|
|
||||||
"id": "4rpge043645sp4j",
|
|
||||||
"created": "2024-04-17 02:46:25.373Z",
|
|
||||||
"updated": "2024-04-17 02:46:25.373Z",
|
|
||||||
"name": "roleplays",
|
|
||||||
"type": "base",
|
|
||||||
"system": false,
|
|
||||||
"schema": [
|
|
||||||
{
|
|
||||||
"system": false,
|
|
||||||
"id": "ixk4pwsb",
|
|
||||||
"name": "activated",
|
|
||||||
"type": "bool",
|
|
||||||
"required": false,
|
|
||||||
"presentable": false,
|
|
||||||
"unique": false,
|
|
||||||
"options": {}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"system": false,
|
|
||||||
"id": "tmak73c7",
|
|
||||||
"name": "character",
|
|
||||||
"type": "text",
|
|
||||||
"required": false,
|
|
||||||
"presentable": false,
|
|
||||||
"unique": false,
|
|
||||||
"options": {
|
|
||||||
"min": null,
|
|
||||||
"max": null,
|
|
||||||
"pattern": ""
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"system": false,
|
|
||||||
"id": "6iuxuwhb",
|
|
||||||
"name": "focus",
|
|
||||||
"type": "text",
|
|
||||||
"required": false,
|
|
||||||
"presentable": false,
|
|
||||||
"unique": false,
|
|
||||||
"options": {
|
|
||||||
"min": null,
|
|
||||||
"max": null,
|
|
||||||
"pattern": ""
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"system": false,
|
|
||||||
"id": "axmc2huy",
|
|
||||||
"name": "focus_type",
|
|
||||||
"type": "text",
|
|
||||||
"required": false,
|
|
||||||
"presentable": false,
|
|
||||||
"unique": false,
|
|
||||||
"options": {
|
|
||||||
"min": null,
|
|
||||||
"max": null,
|
|
||||||
"pattern": ""
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"system": false,
|
|
||||||
"id": "gop61pjt",
|
|
||||||
"name": "good_sample1",
|
|
||||||
"type": "text",
|
|
||||||
"required": false,
|
|
||||||
"presentable": false,
|
|
||||||
"unique": false,
|
|
||||||
"options": {
|
|
||||||
"min": null,
|
|
||||||
"max": null,
|
|
||||||
"pattern": ""
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"system": false,
|
|
||||||
"id": "qmy5cofa",
|
|
||||||
"name": "good_sample2",
|
|
||||||
"type": "text",
|
|
||||||
"required": false,
|
|
||||||
"presentable": false,
|
|
||||||
"unique": false,
|
|
||||||
"options": {
|
|
||||||
"min": null,
|
|
||||||
"max": null,
|
|
||||||
"pattern": ""
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"system": false,
|
|
||||||
"id": "h8gafaci",
|
|
||||||
"name": "bad_sample",
|
|
||||||
"type": "text",
|
|
||||||
"required": false,
|
|
||||||
"presentable": false,
|
|
||||||
"unique": false,
|
|
||||||
"options": {
|
|
||||||
"min": null,
|
|
||||||
"max": null,
|
|
||||||
"pattern": ""
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"system": false,
|
|
||||||
"id": "m2ug5sfd",
|
|
||||||
"name": "report_type",
|
|
||||||
"type": "text",
|
|
||||||
"required": false,
|
|
||||||
"presentable": false,
|
|
||||||
"unique": false,
|
|
||||||
"options": {
|
|
||||||
"min": null,
|
|
||||||
"max": null,
|
|
||||||
"pattern": ""
|
|
||||||
}
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"indexes": [],
|
|
||||||
"listRule": null,
|
|
||||||
"viewRule": null,
|
|
||||||
"createRule": null,
|
|
||||||
"updateRule": null,
|
|
||||||
"deleteRule": null,
|
|
||||||
"options": {}
|
|
||||||
});
|
|
||||||
|
|
||||||
return Dao(db).saveCollection(collection);
|
|
||||||
}, (db) => {
|
|
||||||
const dao = new Dao(db);
|
|
||||||
const collection = dao.findCollectionByNameOrId("4rpge043645sp4j");
|
|
||||||
|
|
||||||
return dao.deleteCollection(collection);
|
|
||||||
})
|
|
@ -5,4 +5,6 @@ gne
|
|||||||
jieba
|
jieba
|
||||||
httpx
|
httpx
|
||||||
chardet
|
chardet
|
||||||
websockets
|
pocketbase
|
||||||
|
pydantic
|
||||||
|
uvicorn
|
@ -1,33 +1,33 @@
|
|||||||
**这个文件夹下可以放置对应特定信源的爬虫,注意这里的爬虫应该是可以解析信源文章列表url并返回文章详情dict的**
|
> **This folder is intended for placing crawlers specific to particular sources. Note that the crawlers here should be able to parse the article list URL of the source and return a dictionary of article details.**
|
||||||
|
>
|
||||||
# 专有爬虫配置
|
> # Custom Crawler Configuration
|
||||||
|
>
|
||||||
写好爬虫后,将爬虫程序放在这个文件夹,并在__init__.py下的scraper_map中注册爬虫,类似:
|
> After writing the crawler, place the crawler program in this folder and register it in the scraper_map in `__init__.py`, similar to:
|
||||||
|
>
|
||||||
```python
|
> ```python
|
||||||
{'www.securityaffairs.com': securityaffairs_scraper}
|
> {'www.securityaffairs.com': securityaffairs_scraper}
|
||||||
```
|
> ```
|
||||||
|
>
|
||||||
其中key就是信源地址,value是函数名
|
> Here, the key is the source URL, and the value is the function name.
|
||||||
|
>
|
||||||
爬虫应该写为函数形式,出入参约定为:
|
> The crawler should be written in the form of a function with the following input and output specifications:
|
||||||
|
>
|
||||||
输入:
|
> Input:
|
||||||
- expiration: datetime的date.date()对象,爬虫应该只抓取这之后(含这一天)的文章
|
> - expiration: A `datetime.date` object, the crawler should only fetch articles on or after this date.
|
||||||
- existings:[str], 数据库已有文章的url列表,爬虫应该忽略这个列表里面的url
|
> - existings: [str], a list of URLs of articles already in the database. The crawler should ignore the URLs in this list.
|
||||||
|
>
|
||||||
输出:
|
> Output:
|
||||||
- [dict],返回结果列表,每个dict代表一个文章,格式如下:
|
> - [dict], a list of result dictionaries, each representing an article, formatted as follows:
|
||||||
`[{'url': str, 'title': str, 'author': str, 'publish_time': str, 'content': str, 'abstract': str, 'images': [Path]}, {...}, ...]`
|
> `[{'url': str, 'title': str, 'author': str, 'publish_time': str, 'content': str, 'abstract': str, 'images': [Path]}, {...}, ...]`
|
||||||
|
>
|
||||||
注意:publish_time格式为`"%Y%m%d"`, 如果爬虫抓不到可以用当天日期
|
> Note: The format of `publish_time` should be `"%Y%m%d"`. If the crawler cannot fetch it, the current date can be used.
|
||||||
|
>
|
||||||
另外,title和content是必须要有的
|
> Additionally, `title` and `content` are mandatory fields.
|
||||||
|
>
|
||||||
# 通用页面解析器
|
> # Generic Page Parser
|
||||||
|
>
|
||||||
我们这里提供了一个通用页面解析器,该解析器可以智能获取信源文章列表,接下来对于每一个文章url,会先尝试使用 gne 进行解析,如果失败的话,再尝试使用llm进行解析。
|
> We provide a generic page parser here, which can intelligently fetch article lists from the source. For each article URL, it will first attempt to parse using gne. If it fails, it will then attempt to parse using llm.
|
||||||
|
>
|
||||||
通过这个方案,可以实现对大多数普通新闻类、门户类信源的扫描和信息提取。
|
> Through this solution, it is possible to scan and extract information from most general news and portal sources.
|
||||||
|
>
|
||||||
**然而我们依然强烈建议用户自行写专有爬虫或者直接订阅我们的数据服务,以实现更加理想且更加高效的扫描。**
|
> **However, we still strongly recommend that users write custom crawlers themselves or directly subscribe to our data service for more ideal and efficient scanning.**
|
||||||
|
33
core/scrapers/README_CN.md
Normal file
33
core/scrapers/README_CN.md
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
**这个文件夹下可以放置对应特定信源的爬虫,注意这里的爬虫应该是可以解析信源文章列表url并返回文章详情dict的**
|
||||||
|
|
||||||
|
# 专有爬虫配置
|
||||||
|
|
||||||
|
写好爬虫后,将爬虫程序放在这个文件夹,并在__init__.py下的scraper_map中注册爬虫,类似:
|
||||||
|
|
||||||
|
```python
|
||||||
|
{'www.securityaffairs.com': securityaffairs_scraper}
|
||||||
|
```
|
||||||
|
|
||||||
|
其中key就是信源地址,value是函数名
|
||||||
|
|
||||||
|
爬虫应该写为函数形式,出入参约定为:
|
||||||
|
|
||||||
|
输入:
|
||||||
|
- expiration: datetime的date.date()对象,爬虫应该只抓取这之后(含这一天)的文章
|
||||||
|
- existings:[str], 数据库已有文章的url列表,爬虫应该忽略这个列表里面的url
|
||||||
|
|
||||||
|
输出:
|
||||||
|
- [dict],返回结果列表,每个dict代表一个文章,格式如下:
|
||||||
|
`[{'url': str, 'title': str, 'author': str, 'publish_time': str, 'content': str, 'abstract': str, 'images': [Path]}, {...}, ...]`
|
||||||
|
|
||||||
|
注意:publish_time格式为`"%Y%m%d"`, 如果爬虫抓不到可以用当天日期
|
||||||
|
|
||||||
|
另外,title和content是必须要有的
|
||||||
|
|
||||||
|
# 通用页面解析器
|
||||||
|
|
||||||
|
我们这里提供了一个通用页面解析器,该解析器可以智能获取信源文章列表,接下来对于每一个文章url,会先尝试使用 gne 进行解析,如果失败的话,再尝试使用llm进行解析。
|
||||||
|
|
||||||
|
通过这个方案,可以实现对大多数普通新闻类、门户类信源的扫描和信息提取。
|
||||||
|
|
||||||
|
**然而我们依然强烈建议用户自行写专有爬虫或者直接订阅我们的数据服务,以实现更加理想且更加高效的扫描。**
|
34
core/scrapers/README_de.md
Normal file
34
core/scrapers/README_de.md
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
|
||||||
|
> **In diesem Ordner können Crawlers für spezifische Quellen abgelegt werden. Beachten Sie, dass die Crawlers hier in der Lage sein sollten, die URL der Artikelliste der Quelle zu analysieren und ein Wörterbuch mit Artikeldetails zurückzugeben.**
|
||||||
|
>
|
||||||
|
> # Konfiguration des benutzerdefinierten Crawlers
|
||||||
|
>
|
||||||
|
> Nachdem Sie den Crawler geschrieben haben, platzieren Sie das Crawler-Programm in diesem Ordner und registrieren Sie es in scraper_map in `__init__.py`, ähnlich wie:
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> {'www.securityaffairs.com': securityaffairs_scraper}
|
||||||
|
> ```
|
||||||
|
>
|
||||||
|
> Hier ist der Schlüssel die URL der Quelle und der Wert der Funktionsname.
|
||||||
|
>
|
||||||
|
> Der Crawler sollte in Form einer Funktion geschrieben werden, mit den folgenden Eingabe- und Ausgabeparametern:
|
||||||
|
>
|
||||||
|
> Eingabe:
|
||||||
|
> - expiration: Ein `datetime.date` Objekt, der Crawler sollte nur Artikel ab diesem Datum (einschließlich) abrufen.
|
||||||
|
> - existings: [str], eine Liste von URLs von Artikeln, die bereits in der Datenbank vorhanden sind. Der Crawler sollte die URLs in dieser Liste ignorieren.
|
||||||
|
>
|
||||||
|
> Ausgabe:
|
||||||
|
> - [dict], eine Liste von Ergebnis-Wörterbüchern, wobei jedes Wörterbuch einen Artikel darstellt, formatiert wie folgt:
|
||||||
|
> `[{'url': str, 'title': str, 'author': str, 'publish_time': str, 'content': str, 'abstract': str, 'images': [Path]}, {...}, ...]`
|
||||||
|
>
|
||||||
|
> Hinweis: Das Format von `publish_time` sollte `"%Y%m%d"` sein. Wenn der Crawler es nicht abrufen kann, kann das aktuelle Datum verwendet werden.
|
||||||
|
>
|
||||||
|
> Darüber hinaus sind `title` und `content` Pflichtfelder.
|
||||||
|
>
|
||||||
|
> # Generischer Seitenparser
|
||||||
|
>
|
||||||
|
> Wir bieten hier einen generischen Seitenparser an, der intelligent Artikellisten von der Quelle abrufen kann. Für jede Artikel-URL wird zunächst versucht, mit gne zu parsen. Scheitert dies, wird versucht, mit llm zu parsen.
|
||||||
|
>
|
||||||
|
> Durch diese Lösung ist es möglich, die meisten allgemeinen Nachrichtenquellen und Portale zu scannen und Informationen zu extrahieren.
|
||||||
|
>
|
||||||
|
> **Wir empfehlen jedoch dringend, dass Benutzer eigene benutzerdefinierte Crawlers schreiben oder direkt unseren Datenservice abonnieren, um eine idealere und effizientere Erfassung zu erreichen.**
|
34
core/scrapers/README_fr.md
Normal file
34
core/scrapers/README_fr.md
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
|
||||||
|
> **Ce dossier est destiné à accueillir des crawlers spécifiques à des sources particulières. Notez que les crawlers ici doivent être capables de parser l'URL de la liste des articles de la source et de retourner un dictionnaire de détails des articles.**
|
||||||
|
>
|
||||||
|
> # Configuration du Crawler Personnalisé
|
||||||
|
>
|
||||||
|
> Après avoir écrit le crawler, placez le programme du crawler dans ce dossier et enregistrez-le dans scraper_map dans `__init__.py`, comme suit :
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> {'www.securityaffairs.com': securityaffairs_scraper}
|
||||||
|
> ```
|
||||||
|
>
|
||||||
|
> Ici, la clé est l'URL de la source, et la valeur est le nom de la fonction.
|
||||||
|
>
|
||||||
|
> Le crawler doit être écrit sous forme de fonction avec les spécifications suivantes pour les entrées et sorties :
|
||||||
|
>
|
||||||
|
> Entrée :
|
||||||
|
> - expiration : Un objet `datetime.date`, le crawler ne doit récupérer que les articles à partir de cette date (incluse).
|
||||||
|
> - existings : [str], une liste d'URLs d'articles déjà présents dans la base de données. Le crawler doit ignorer les URLs de cette liste.
|
||||||
|
>
|
||||||
|
> Sortie :
|
||||||
|
> - [dict], une liste de dictionnaires de résultats, chaque dictionnaire représentant un article, formaté comme suit :
|
||||||
|
> `[{'url': str, 'title': str, 'author': str, 'publish_time': str, 'content': str, 'abstract': str, 'images': [Path]}, {...}, ...]`
|
||||||
|
>
|
||||||
|
> Remarque : Le format de `publish_time` doit être `"%Y%m%d"`. Si le crawler ne peut pas le récupérer, la date du jour peut être utilisée.
|
||||||
|
>
|
||||||
|
> De plus, `title` et `content` sont des champs obligatoires.
|
||||||
|
>
|
||||||
|
> # Analyseur de Page Générique
|
||||||
|
>
|
||||||
|
> Nous fournissons ici un analyseur de page générique, qui peut récupérer intelligemment les listes d'articles de la source. Pour chaque URL d'article, il tentera d'abord de parser avec gne. En cas d'échec, il tentera de parser avec llm.
|
||||||
|
>
|
||||||
|
> Grâce à cette solution, il est possible de scanner et d'extraire des informations à partir de la plupart des sources de type actualités générales et portails.
|
||||||
|
>
|
||||||
|
> **Cependant, nous recommandons vivement aux utilisateurs de rédiger eux-mêmes des crawlers personnalisés ou de s'abonner directement à notre service de données pour un scan plus idéal et plus efficace.**
|
33
core/scrapers/README_jp.md
Normal file
33
core/scrapers/README_jp.md
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
**このフォルダには特定のソースに対応したクローラーを配置できます。ここでのクローラーはソースの記事リストURLを解析し、記事の詳細情報を辞書形式で返す必要があります。**
|
||||||
|
>
|
||||||
|
> # カスタムクローラーの設定
|
||||||
|
>
|
||||||
|
> クローラーを作成した後、そのプログラムをこのフォルダに配置し、`__init__.py` の scraper_map に次のように登録します:
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> {'www.securityaffairs.com': securityaffairs_scraper}
|
||||||
|
> ```
|
||||||
|
>
|
||||||
|
> ここで、キーはソースのURLで、値は関数名です。
|
||||||
|
>
|
||||||
|
> クローラーは関数形式で記述し、以下の入力および出力仕様を満たす必要があります:
|
||||||
|
>
|
||||||
|
> 入力:
|
||||||
|
> - expiration: `datetime.date` オブジェクト、クローラーはこの日付以降(この日を含む)の記事のみを取得する必要があります。
|
||||||
|
> - existings:[str]、データベースに既存する記事のURLリスト、クローラーはこのリスト内のURLを無視する必要があります。
|
||||||
|
>
|
||||||
|
> 出力:
|
||||||
|
> - [dict]、結果の辞書リスト、各辞書は以下の形式で1つの記事を表します:
|
||||||
|
> `[{'url': str, 'title': str, 'author': str, 'publish_time': str, 'content': str, 'abstract': str, 'images': [Path]}, {...}, ...]`
|
||||||
|
>
|
||||||
|
> 注意:`publish_time`の形式は`"%Y%m%d"`である必要があります。クローラーで取得できない場合は、当日の日付を使用できます。
|
||||||
|
>
|
||||||
|
> さらに、`title`と`content`は必須フィールドです。
|
||||||
|
>
|
||||||
|
> # 一般ページパーサー
|
||||||
|
>
|
||||||
|
> ここでは一般的なページパーサーを提供しており、ソースから記事リストをインテリジェントに取得できます。各記事URLに対して、最初に gne を使用して解析を試みます。失敗した場合は、llm を使用して解析を試みます。
|
||||||
|
>
|
||||||
|
> このソリューションにより、ほとんどの一般的なニュースおよびポータルソースのスキャンと情報抽出が可能になります。
|
||||||
|
>
|
||||||
|
> **しかし、より理想的かつ効率的なスキャンを実現するために、ユーザー自身でカスタムクローラーを作成するか、直接弊社のデータサービスを購読することを強くお勧めします。**
|
@ -1,6 +1,6 @@
|
|||||||
from .mp_crawler import mp_crawler
|
from .mp_crawler import mp_crawler
|
||||||
from .simple_crawler import simple_crawler
|
from .simple_crawler import simple_crawler
|
||||||
from .general_scraper import llm_crawler
|
from .general_scraper import general_scraper, llm_crawler
|
||||||
|
|
||||||
|
|
||||||
scraper_map = {}
|
scraper_map = {}
|
||||||
|
@ -1,16 +1,20 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
import os
|
import os
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
import re
|
import re
|
||||||
from .simple_crawler import simple_crawler
|
from .simple_crawler import simple_crawler
|
||||||
|
from .mp_crawler import mp_crawler
|
||||||
import httpx
|
import httpx
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from bs4.element import Comment
|
from bs4.element import Comment
|
||||||
from ..llms.openai_wrapper import openai_llm
|
from llms.openai_wrapper import openai_llm
|
||||||
# from ..llms.siliconflow_wrapper import sfa_llm
|
# from llms.siliconflow_wrapper import sfa_llm
|
||||||
from datetime import datetime, date
|
from datetime import datetime, date
|
||||||
from requests.compat import urljoin
|
from requests.compat import urljoin
|
||||||
import chardet
|
import chardet
|
||||||
from ..utils.general_utils import extract_and_convert_dates
|
from utils.general_utils import extract_and_convert_dates
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
|
||||||
model = os.environ.get('HTML_PARSE_MODEL', 'gpt-3.5-turbo')
|
model = os.environ.get('HTML_PARSE_MODEL', 'gpt-3.5-turbo')
|
||||||
@ -63,7 +67,6 @@ def parse_html_content(out: str) -> dict:
|
|||||||
return dct
|
return dct
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
sys_info = '''As an HTML parser, you'll receive a block of HTML code. Your task is to extract its title, summary, content, and publication date, with the date formatted as YYYY-MM-DD. Return the results in the following format (enclosed within triple quotes):
|
sys_info = '''As an HTML parser, you'll receive a block of HTML code. Your task is to extract its title, summary, content, and publication date, with the date formatted as YYYY-MM-DD. Return the results in the following format (enclosed within triple quotes):
|
||||||
"""
|
"""
|
||||||
Title||Summary||Content||Release Date YYYY-MM-DD
|
Title||Summary||Content||Release Date YYYY-MM-DD
|
||||||
@ -71,17 +74,24 @@ Title||Summary||Content||Release Date YYYY-MM-DD
|
|||||||
'''
|
'''
|
||||||
|
|
||||||
|
|
||||||
def llm_crawler(url: str, logger) -> (int, dict):
|
async def llm_crawler(url: str, logger) -> (int, dict):
|
||||||
|
async with httpx.AsyncClient() as client:
|
||||||
|
for retry in range(2):
|
||||||
try:
|
try:
|
||||||
with httpx.Client() as client:
|
response = await client.get(url, headers=header, timeout=30)
|
||||||
response = client.get(url, headers=header, timeout=30)
|
response.raise_for_status()
|
||||||
rawdata = response.content
|
break
|
||||||
encoding = chardet.detect(rawdata)['encoding']
|
|
||||||
text = rawdata.decode(encoding)
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(e)
|
if retry < 1:
|
||||||
|
logger.info(f"request {url} got error {e}\nwaiting 1min")
|
||||||
|
await asyncio.sleep(60)
|
||||||
|
else:
|
||||||
|
logger.warning(f"request {url} got error {e}")
|
||||||
return -7, {}
|
return -7, {}
|
||||||
|
|
||||||
|
rawdata = response.content
|
||||||
|
encoding = chardet.detect(rawdata)['encoding']
|
||||||
|
text = rawdata.decode(encoding, errors='replace')
|
||||||
soup = BeautifulSoup(text, "html.parser")
|
soup = BeautifulSoup(text, "html.parser")
|
||||||
html_text = text_from_soup(soup)
|
html_text = text_from_soup(soup)
|
||||||
html_lines = html_text.split('\n')
|
html_lines = html_text.split('\n')
|
||||||
@ -91,7 +101,8 @@ def llm_crawler(url: str, logger) -> (int, dict):
|
|||||||
logger.warning(f"{url} content too long for llm parsing")
|
logger.warning(f"{url} content too long for llm parsing")
|
||||||
return 0, {}
|
return 0, {}
|
||||||
|
|
||||||
if not html_text or html_text.startswith('服务器错误') or html_text.startswith('您访问的页面') or html_text.startswith('403')\
|
if not html_text or html_text.startswith('服务器错误') or html_text.startswith(
|
||||||
|
'您访问的页面') or html_text.startswith('403') \
|
||||||
or html_text.startswith('出错了'):
|
or html_text.startswith('出错了'):
|
||||||
logger.warning(f"can not get {url} from the Internet")
|
logger.warning(f"can not get {url} from the Internet")
|
||||||
return -7, {}
|
return -7, {}
|
||||||
@ -103,7 +114,7 @@ def llm_crawler(url: str, logger) -> (int, dict):
|
|||||||
llm_output = openai_llm(messages, model=model, logger=logger)
|
llm_output = openai_llm(messages, model=model, logger=logger)
|
||||||
try:
|
try:
|
||||||
info = parse_html_content(llm_output)
|
info = parse_html_content(llm_output)
|
||||||
except Exception:
|
except:
|
||||||
msg = f"can not parse {llm_output}"
|
msg = f"can not parse {llm_output}"
|
||||||
logger.debug(msg)
|
logger.debug(msg)
|
||||||
return 0, {}
|
return 0, {}
|
||||||
@ -146,31 +157,49 @@ def llm_crawler(url: str, logger) -> (int, dict):
|
|||||||
return 11, info
|
return 11, info
|
||||||
|
|
||||||
|
|
||||||
def general_scraper(site: str, expiration: date, existing: list[str], logger) -> list[dict]:
|
async def general_scraper(site: str, expiration: date, existing: list[str], logger) -> list[dict]:
|
||||||
|
async with httpx.AsyncClient() as client:
|
||||||
|
for retry in range(2):
|
||||||
try:
|
try:
|
||||||
with httpx.Client() as client:
|
response = await client.get(site, headers=header, timeout=30)
|
||||||
response = client.get(site, headers=header, timeout=30)
|
response.raise_for_status()
|
||||||
|
break
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(e)
|
if retry < 1:
|
||||||
|
logger.info(f"request {site} got error {e}\nwaiting 1min")
|
||||||
|
await asyncio.sleep(60)
|
||||||
|
else:
|
||||||
|
logger.warning(f"request {site} got error {e}")
|
||||||
return []
|
return []
|
||||||
|
|
||||||
page_source = response.text
|
page_source = response.text
|
||||||
soup = BeautifulSoup(page_source, "html.parser")
|
soup = BeautifulSoup(page_source, "html.parser")
|
||||||
# Parse all URLs
|
# Parse all URLs
|
||||||
parsed_url = urlparse(site)
|
parsed_url = urlparse(site)
|
||||||
base_url = parsed_url.scheme + '://' + parsed_url.netloc
|
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
|
||||||
urls = [urljoin(base_url, link["href"]) for link in soup.find_all("a", href=True)]
|
urls = [urljoin(base_url, link["href"]) for link in soup.find_all("a", href=True)]
|
||||||
|
|
||||||
if not urls:
|
if not urls:
|
||||||
# maybe it's an article site
|
# maybe it's an article site
|
||||||
logger.warning(f"can not find any link from {site}, maybe it's an article site...")
|
logger.warning(f"can not find any link from {site}, maybe it's an article site...")
|
||||||
if site in existing:
|
if site in existing:
|
||||||
logger.debug(f"{site} has been crawled before, skip it")
|
logger.debug(f"{site} has been crawled before, skip it")
|
||||||
return []
|
return []
|
||||||
flag, result = simple_crawler(site, logger)
|
|
||||||
|
if site.startswith('https://mp.weixin.qq.com') or site.startswith('http://mp.weixin.qq.com'):
|
||||||
|
flag, result = await mp_crawler(site, logger)
|
||||||
|
else:
|
||||||
|
flag, result = await simple_crawler(site, logger)
|
||||||
|
|
||||||
|
if flag == -7:
|
||||||
|
# -7 means the HTML could not be fetched, so the other crawlers would not help either.
|
||||||
|
return []
|
||||||
|
|
||||||
if flag != 11:
|
if flag != 11:
|
||||||
flag, result = llm_crawler(site, logger)
|
flag, result = await llm_crawler(site, logger)
|
||||||
if flag != 11:
|
if flag != 11:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
publish_date = datetime.strptime(result['publish_time'], '%Y%m%d')
|
publish_date = datetime.strptime(result['publish_time'], '%Y%m%d')
|
||||||
if publish_date.date() < expiration:
|
if publish_date.date() < expiration:
|
||||||
logger.debug(f"{site} is too old, skip it")
|
logger.debug(f"{site} is too old, skip it")
|
||||||
@ -183,12 +212,23 @@ def general_scraper(site: str, expiration: date, existing: list[str], logger) ->
|
|||||||
if url in existing:
|
if url in existing:
|
||||||
logger.debug(f"{url} has been crawled before, skip it")
|
logger.debug(f"{url} has been crawled before, skip it")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
existing.append(url)
|
existing.append(url)
|
||||||
flag, result = simple_crawler(url, logger)
|
|
||||||
|
if url.startswith('https://mp.weixin.qq.com') or url.startswith('http://mp.weixin.qq.com'):
|
||||||
|
flag, result = await mp_crawler(url, logger)
|
||||||
|
else:
|
||||||
|
flag, result = await simple_crawler(url, logger)
|
||||||
|
|
||||||
|
if flag == -7:
|
||||||
|
# -7 means the HTML could not be fetched, so the other crawlers would not help either.
|
||||||
|
continue
|
||||||
|
|
||||||
if flag != 11:
|
if flag != 11:
|
||||||
flag, result = llm_crawler(url, logger)
|
flag, result = await llm_crawler(url, logger)
|
||||||
if flag != 11:
|
if flag != 11:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
publish_date = datetime.strptime(result['publish_time'], '%Y%m%d')
|
publish_date = datetime.strptime(result['publish_time'], '%Y%m%d')
|
||||||
if publish_date.date() < expiration:
|
if publish_date.date() < expiration:
|
||||||
logger.debug(f"{url} is too old, skip it")
|
logger.debug(f"{url} is too old, skip it")
|
||||||
|
@ -1,25 +1,35 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import re
|
import re
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
|
||||||
header = {
|
header = {
|
||||||
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}
|
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}
|
||||||
|
|
||||||
|
|
||||||
def mp_crawler(url: str, logger) -> (int, dict):
|
async def mp_crawler(url: str, logger) -> (int, dict):
|
||||||
if not url.startswith('https://mp.weixin.qq.com') and not url.startswith('http://mp.weixin.qq.com'):
|
if not url.startswith('https://mp.weixin.qq.com') and not url.startswith('http://mp.weixin.qq.com'):
|
||||||
logger.warning(f'{url} is not a mp url, you should not use this function')
|
logger.warning(f'{url} is not a mp url, you should not use this function')
|
||||||
return -5, {}
|
return -5, {}
|
||||||
|
|
||||||
url = url.replace("http://", "https://", 1)
|
url = url.replace("http://", "https://", 1)
|
||||||
|
|
||||||
|
async with httpx.AsyncClient() as client:
|
||||||
|
for retry in range(2):
|
||||||
try:
|
try:
|
||||||
with httpx.Client() as client:
|
response = await client.get(url, headers=header, timeout=30)
|
||||||
response = client.get(url, headers=header, timeout=30)
|
response.raise_for_status()
|
||||||
|
break
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"cannot get content from {url}\n{e}")
|
if retry < 1:
|
||||||
|
logger.info(f"request {url} got error {e}\nwaiting 1min")
|
||||||
|
await asyncio.sleep(60)
|
||||||
|
else:
|
||||||
|
logger.warning(f"request {url} got error {e}")
|
||||||
return -7, {}
|
return -7, {}
|
||||||
|
|
||||||
soup = BeautifulSoup(response.text, 'html.parser')
|
soup = BeautifulSoup(response.text, 'html.parser')
|
||||||
@ -48,11 +58,11 @@ def mp_crawler(url: str, logger) -> (int, dict):
|
|||||||
else soup.find('div', class_='wx_follow_nickname').text.strip()
|
else soup.find('div', class_='wx_follow_nickname').text.strip()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"not mp format: {url}\n{e}")
|
logger.warning(f"not mp format: {url}\n{e}")
|
||||||
|
# For mp.weixin.qq.com URLs, if mp_crawler fails, the other two crawlers will most likely fail as well
|
||||||
return -7, {}
|
return -7, {}
|
||||||
|
|
||||||
if not rich_media_title or not profile_nickname:
|
if not rich_media_title or not profile_nickname:
|
||||||
logger.warning(f"failed to analysis {url}, no title or profile_nickname")
|
logger.warning(f"failed to analysis {url}, no title or profile_nickname")
|
||||||
# For mp.weixin.qq.com types, mp_crawler won't work, and most likely neither will the other two
|
|
||||||
return -7, {}
|
return -7, {}
|
||||||
|
|
||||||
# Parse text and image links within the content interval
|
# Parse text and image links within the content interval
|
||||||
|
@ -1,10 +1,13 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
from gne import GeneralNewsExtractor
|
from gne import GeneralNewsExtractor
|
||||||
import httpx
|
import httpx
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from ..utils.general_utils import extract_and_convert_dates
|
from utils.general_utils import extract_and_convert_dates
|
||||||
import chardet
|
import chardet
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
|
||||||
extractor = GeneralNewsExtractor()
|
extractor = GeneralNewsExtractor()
|
||||||
@ -12,21 +15,28 @@ header = {
|
|||||||
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}
|
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}
|
||||||
|
|
||||||
|
|
||||||
def simple_crawler(url: str, logger) -> (int, dict):
|
async def simple_crawler(url: str, logger) -> (int, dict):
|
||||||
"""
|
"""
|
||||||
Return article information dict and flag, negative number is error, 0 is no result, 11 is success
|
Return article information dict and flag, negative number is error, 0 is no result, 11 is success
|
||||||
"""
|
"""
|
||||||
|
async with httpx.AsyncClient() as client:
|
||||||
|
for retry in range(2):
|
||||||
try:
|
try:
|
||||||
with httpx.Client() as client:
|
response = await client.get(url, headers=header, timeout=30)
|
||||||
response = client.get(url, headers=header, timeout=30)
|
response.raise_for_status()
|
||||||
rawdata = response.content
|
break
|
||||||
encoding = chardet.detect(rawdata)['encoding']
|
|
||||||
text = rawdata.decode(encoding)
|
|
||||||
result = extractor.extract(text)
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"cannot get content from {url}\n{e}")
|
if retry < 1:
|
||||||
|
logger.info(f"request {url} got error {e}\nwaiting 1min")
|
||||||
|
await asyncio.sleep(60)
|
||||||
|
else:
|
||||||
|
logger.warning(f"request {url} got error {e}")
|
||||||
return -7, {}
|
return -7, {}
|
||||||
|
|
||||||
|
rawdata = response.content
|
||||||
|
encoding = chardet.detect(rawdata)['encoding']
|
||||||
|
text = rawdata.decode(encoding, errors='replace')
|
||||||
|
result = extractor.extract(text)
|
||||||
if not result:
|
if not result:
|
||||||
logger.error(f"gne cannot extract {url}")
|
logger.error(f"gne cannot extract {url}")
|
||||||
return 0, {}
|
return 0, {}
|
||||||
@ -35,7 +45,8 @@ def simple_crawler(url: str, logger) -> (int, dict):
|
|||||||
logger.info(f"{result} not valid")
|
logger.info(f"{result} not valid")
|
||||||
return 0, {}
|
return 0, {}
|
||||||
|
|
||||||
if result['title'].startswith('服务器错误') or result['title'].startswith('您访问的页面') or result['title'].startswith('403')\
|
if result['title'].startswith('服务器错误') or result['title'].startswith('您访问的页面') or result[
|
||||||
|
'title'].startswith('403') \
|
||||||
or result['content'].startswith('This website uses cookies') or result['title'].startswith('出错了'):
|
or result['content'].startswith('This website uses cookies') or result['title'].startswith('出错了'):
|
||||||
logger.warning(f"can not get {url} from the Internet")
|
logger.warning(f"can not get {url} from the Internet")
|
||||||
return -7, {}
|
return -7, {}
|
||||||
@ -58,8 +69,9 @@ def simple_crawler(url: str, logger) -> (int, dict):
|
|||||||
result['abstract'] = f"[from {from_site}] {meta_description['content'].strip()}"
|
result['abstract'] = f"[from {from_site}] {meta_description['content'].strip()}"
|
||||||
else:
|
else:
|
||||||
result['abstract'] = ''
|
result['abstract'] = ''
|
||||||
except Exception:
|
except:
|
||||||
result['abstract'] = ''
|
result['abstract'] = ''
|
||||||
|
|
||||||
result['url'] = url
|
result['url'] = url
|
||||||
|
|
||||||
return 11, result
|
return 11, result
|
||||||
|
@ -1,9 +1,4 @@
|
|||||||
"""mostly copy from https://github.com/netease-youdao/QAnything
|
|
||||||
awsome work!
|
|
||||||
"""
|
|
||||||
# import traceback
|
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
import time
|
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import jieba
|
import jieba
|
||||||
@ -18,131 +13,39 @@ def extract_urls(text):
|
|||||||
url_pattern = re.compile(r'https?://[-A-Za-z0-9+&@#/%?=~_|!:.;]+[-A-Za-z0-9+&@#/%=~_|]')
|
url_pattern = re.compile(r'https?://[-A-Za-z0-9+&@#/%?=~_|!:.;]+[-A-Za-z0-9+&@#/%=~_|]')
|
||||||
urls = re.findall(url_pattern, text)
|
urls = re.findall(url_pattern, text)
|
||||||
|
|
||||||
# 过滤掉那些只匹配到 'www.' 而没有后续内容的情况,并尝试为每个URL添加默认的http协议前缀以便解析
|
# Filter out those cases that only match 'www.' without subsequent content,
|
||||||
|
# and try to add the default http protocol prefix to each URL for easy parsing
|
||||||
cleaned_urls = [url for url in urls if isURL(url)]
|
cleaned_urls = [url for url in urls if isURL(url)]
|
||||||
return cleaned_urls
|
return cleaned_urls
|
||||||
|
|
||||||
|
|
||||||
def isChinesePunctuation(char):
|
def isChinesePunctuation(char):
|
||||||
# 定义中文标点符号的Unicode编码范围
|
# Define the Unicode encoding range for Chinese punctuation marks
|
||||||
chinese_punctuations = set(range(0x3000, 0x303F)) | set(range(0xFF00, 0xFFEF))
|
chinese_punctuations = set(range(0x3000, 0x303F)) | set(range(0xFF00, 0xFFEF))
|
||||||
# 检查字符是否在上述范围内
|
# Check if the character is within the above range
|
||||||
return ord(char) in chinese_punctuations
|
return ord(char) in chinese_punctuations
|
||||||
|
|
||||||
|
|
||||||
def get_time(func):
|
|
||||||
def inner(*arg, **kwargs):
|
|
||||||
s_time = time.time()
|
|
||||||
res = func(*arg, **kwargs)
|
|
||||||
e_time = time.time()
|
|
||||||
print('函数 {} 执行耗时: {} 秒'.format(func.__name__, e_time - s_time))
|
|
||||||
return res
|
|
||||||
return inner
|
|
||||||
|
|
||||||
|
|
||||||
'''
|
|
||||||
def safe_get(req: Request, attr: str, default=None):
|
|
||||||
try:
|
|
||||||
if attr in req.form:
|
|
||||||
return req.form.getlist(attr)[0]
|
|
||||||
if attr in req.args:
|
|
||||||
return req.args[attr]
|
|
||||||
if attr in req.json:
|
|
||||||
return req.json[attr]
|
|
||||||
# if value := req.form.get(attr):
|
|
||||||
# return value
|
|
||||||
# if value := req.args.get(attr):
|
|
||||||
# return value
|
|
||||||
# """req.json执行时不校验content-type,body字段可能不能被正确解析为json"""
|
|
||||||
# if value := req.json.get(attr):
|
|
||||||
# return value
|
|
||||||
except BadRequest:
|
|
||||||
logging.warning(f"missing {attr} in request")
|
|
||||||
except Exception as e:
|
|
||||||
logging.warning(f"get {attr} from request failed:")
|
|
||||||
logging.warning(traceback.format_exc())
|
|
||||||
return default
|
|
||||||
'''
|
|
||||||
|
|
||||||
|
|
||||||
def truncate_filename(filename, max_length=200):
|
|
||||||
# 获取文件名后缀
|
|
||||||
file_ext = os.path.splitext(filename)[1]
|
|
||||||
|
|
||||||
# 获取不带后缀的文件名
|
|
||||||
file_name_no_ext = os.path.splitext(filename)[0]
|
|
||||||
|
|
||||||
# 计算文件名长度,注意中文字符
|
|
||||||
filename_length = len(filename.encode('utf-8'))
|
|
||||||
|
|
||||||
# 如果文件名长度超过最大长度限制
|
|
||||||
if filename_length > max_length:
|
|
||||||
# 生成一个时间戳标记
|
|
||||||
timestamp = str(int(time.time()))
|
|
||||||
|
|
||||||
# 计算剩余的文件名长度
|
|
||||||
remaining_length = max_length - len(file_ext) - len(timestamp) - 1 # -1 是为了下划线
|
|
||||||
|
|
||||||
# 截取文件名并添加标记
|
|
||||||
file_name_no_ext = file_name_no_ext[:remaining_length]
|
|
||||||
new_filename = file_name_no_ext + '_' + timestamp + file_ext
|
|
||||||
else:
|
|
||||||
new_filename = filename
|
|
||||||
|
|
||||||
return new_filename
|
|
||||||
|
|
||||||
|
|
||||||
def read_files_with_extensions():
|
|
||||||
# 获取当前脚本文件的路径
|
|
||||||
current_file = os.path.abspath(__file__)
|
|
||||||
|
|
||||||
# 获取当前脚本文件所在的目录
|
|
||||||
current_dir = os.path.dirname(current_file)
|
|
||||||
|
|
||||||
# 获取项目根目录
|
|
||||||
project_dir = os.path.dirname(current_dir)
|
|
||||||
|
|
||||||
directory = project_dir + '/data'
|
|
||||||
print(f'now reading {directory}')
|
|
||||||
extensions = ['.md', '.txt', '.pdf', '.jpg', '.docx', '.xlsx', '.eml', '.csv']
|
|
||||||
for root, dirs, files in os.walk(directory):
|
|
||||||
for file in files:
|
|
||||||
if file.endswith(tuple(extensions)):
|
|
||||||
file_path = os.path.join(root, file)
|
|
||||||
yield file_path
|
|
||||||
|
|
||||||
|
|
||||||
def validate_user_id(user_id):
|
|
||||||
# 定义正则表达式模式
|
|
||||||
pattern = r'^[A-Za-z][A-Za-z0-9_]*$'
|
|
||||||
# 检查是否匹配
|
|
||||||
if isinstance(user_id, str) and re.match(pattern, user_id):
|
|
||||||
return True
|
|
||||||
else:
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def is_chinese(string):
|
def is_chinese(string):
|
||||||
"""
|
"""
|
||||||
使用火山引擎其实可以支持更加广泛的语言检测,未来可以考虑 https://www.volcengine.com/docs/4640/65066
|
:param string: {str} The string to be detected
|
||||||
判断字符串中大部分是否是中文
|
:return: {bool} Returns True if most are Chinese, False otherwise
|
||||||
:param string: {str} 需要检测的字符串
|
|
||||||
:return: {bool} 如果大部分是中文返回True,否则返回False
|
|
||||||
"""
|
"""
|
||||||
pattern = re.compile(r'[^\u4e00-\u9fa5]')
|
pattern = re.compile(r'[^\u4e00-\u9fa5]')
|
||||||
non_chinese_count = len(pattern.findall(string))
|
non_chinese_count = len(pattern.findall(string))
|
||||||
# 严格按照字节数量小于一半判断容易误判,英文单词占字节较大,且还有标点符号等
|
# Using a strict "less than half" cutoff easily leads to misjudgment:
|
||||||
|
# English words account for a large number of bytes, and there are punctuation marks, etc
|
||||||
return (non_chinese_count/len(string)) < 0.68
|
return (non_chinese_count/len(string)) < 0.68
|
||||||
|
|
||||||
|
|
||||||
def extract_and_convert_dates(input_string):
|
def extract_and_convert_dates(input_string):
|
||||||
# 定义匹配不同日期格式的正则表达式
|
# 定义匹配不同日期格式的正则表达式
|
||||||
patterns = [
|
patterns = [
|
||||||
r'(\d{4})-(\d{2})-(\d{2})', # 匹配YYYY-MM-DD格式
|
r'(\d{4})-(\d{2})-(\d{2})', # YYYY-MM-DD
|
||||||
r'(\d{4})/(\d{2})/(\d{2})', # 匹配YYYY/MM/DD格式
|
r'(\d{4})/(\d{2})/(\d{2})', # YYYY/MM/DD
|
||||||
r'(\d{4})\.(\d{2})\.(\d{2})', # 匹配YYYY.MM.DD格式
|
r'(\d{4})\.(\d{2})\.(\d{2})', # YYYY.MM.DD
|
||||||
r'(\d{4})\\(\d{2})\\(\d{2})', # 匹配YYYY\MM\DD格式
|
r'(\d{4})\\(\d{2})\\(\d{2})', # YYYY\MM\DD
|
||||||
r'(\d{4})(\d{2})(\d{2})' # 匹配YYYYMMDD格式
|
r'(\d{4})(\d{2})(\d{2})' # YYYYMMDD
|
||||||
]
|
]
|
||||||
|
|
||||||
matches = []
|
matches = []
|
||||||
@ -174,62 +77,21 @@ def get_logger_level() -> str:
|
|||||||
|
|
||||||
def compare_phrase_with_list(target_phrase, phrase_list, threshold):
|
def compare_phrase_with_list(target_phrase, phrase_list, threshold):
|
||||||
"""
|
"""
|
||||||
比较一个目标短语与短语列表中每个短语的相似度。
|
Compare the similarity of a target phrase to each phrase in the phrase list.
|
||||||
|
|
||||||
:param target_phrase: 目标短语 (str)
|
:param target_phrase: target phrase (str)
|
||||||
:param phrase_list: 短语列表 (list of str)
|
:param phrase_list: list of phrases (list of str)
|
||||||
:param threshold: 相似度阈值 (float)
|
:param threshold: similarity threshold (float)
|
||||||
:return: 满足相似度条件的短语列表 (list of str)
|
:return: list of phrases that satisfy the similarity condition (list of str)
|
||||||
"""
|
"""
|
||||||
# 检查目标短语是否为空
|
|
||||||
if not target_phrase:
|
if not target_phrase:
|
||||||
return [] # 目标短语为空,直接返回空列表
|
return [] # The target phrase is empty, and the empty list is returned directly.
|
||||||
|
|
||||||
# 预处理:对目标短语和短语列表中的每个短语进行分词
|
# Preprocessing: Segmentation of the target phrase and each phrase in the phrase list
|
||||||
target_tokens = set(jieba.lcut(target_phrase))
|
target_tokens = set(jieba.lcut(target_phrase))
|
||||||
tokenized_phrases = {phrase: set(jieba.lcut(phrase)) for phrase in phrase_list}
|
tokenized_phrases = {phrase: set(jieba.lcut(phrase)) for phrase in phrase_list}
|
||||||
|
|
||||||
# 比较并筛选
|
|
||||||
similar_phrases = [phrase for phrase, tokens in tokenized_phrases.items()
|
similar_phrases = [phrase for phrase, tokens in tokenized_phrases.items()
|
||||||
if len(target_tokens & tokens) / min(len(target_tokens), len(tokens)) > threshold]
|
if len(target_tokens & tokens) / min(len(target_tokens), len(tokens)) > threshold]
|
||||||
|
|
||||||
return similar_phrases
|
return similar_phrases
|
||||||
|
|
||||||
"""
|
|
||||||
# from InternLM/huixiangdou
|
|
||||||
# another awsome work
|
|
||||||
def process_strings(self, str1, replacement, str2):
|
|
||||||
'''Find the longest common suffix of str1 and prefix of str2.'''
|
|
||||||
shared_substring = ''
|
|
||||||
for i in range(1, min(len(str1), len(str2)) + 1):
|
|
||||||
if str1[-i:] == str2[:i]:
|
|
||||||
shared_substring = str1[-i:]
|
|
||||||
|
|
||||||
# If there is a common substring, replace one of them with the replacement string and concatenate # noqa E501
|
|
||||||
if shared_substring:
|
|
||||||
return str1[:-len(shared_substring)] + replacement + str2
|
|
||||||
|
|
||||||
# Otherwise, just return str1 + str2
|
|
||||||
return str1 + str2
|
|
||||||
|
|
||||||
def clean_md(self, text: str):
|
|
||||||
'''Remove parts of the markdown document that do not contain the key
|
|
||||||
question words, such as code blocks, URL links, etc.'''
|
|
||||||
# remove ref
|
|
||||||
pattern_ref = r'\[(.*?)\]\(.*?\)'
|
|
||||||
new_text = re.sub(pattern_ref, r'\1', text)
|
|
||||||
|
|
||||||
# remove code block
|
|
||||||
pattern_code = r'```.*?```'
|
|
||||||
new_text = re.sub(pattern_code, '', new_text, flags=re.DOTALL)
|
|
||||||
|
|
||||||
# remove underline
|
|
||||||
new_text = re.sub('_{5,}', '', new_text)
|
|
||||||
|
|
||||||
# remove table
|
|
||||||
# new_text = re.sub('\|.*?\|\n\| *\:.*\: *\|.*\n(\|.*\|.*\n)*', '', new_text, flags=re.DOTALL) # noqa E501
|
|
||||||
|
|
||||||
# use lower
|
|
||||||
new_text = new_text.lower()
|
|
||||||
return new_text
|
|
||||||
"""
|
|
@ -7,7 +7,7 @@ from typing import BinaryIO
|
|||||||
class PbTalker:
|
class PbTalker:
|
||||||
def __init__(self, logger) -> None:
|
def __init__(self, logger) -> None:
|
||||||
# 1. base initialization
|
# 1. base initialization
|
||||||
url = "http://127.0.0.1:5882"
|
url = os.environ.get('PB_API_BASE', "http://127.0.0.1:8090")
|
||||||
self.logger = logger
|
self.logger = logger
|
||||||
self.logger.debug(f"initializing pocketbase client: {url}")
|
self.logger.debug(f"initializing pocketbase client: {url}")
|
||||||
self.client = PocketBase(url)
|
self.client = PocketBase(url)
|
||||||
@ -82,7 +82,7 @@ class PbTalker:
|
|||||||
|
|
||||||
def view(self, collection_name: str, item_id: str, fields: list[str] = None) -> dict:
|
def view(self, collection_name: str, item_id: str, fields: list[str] = None) -> dict:
|
||||||
try:
|
try:
|
||||||
res = self.client.collection(collection_name).get_one(item_id,{"fields": ','.join(fields) if fields else ''})
|
res = self.client.collection(collection_name).get_one(item_id, {"fields": ','.join(fields) if fields else ''})
|
||||||
return vars(res)
|
return vars(res)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.error(f"pocketbase view item failed: {e}")
|
self.logger.error(f"pocketbase view item failed: {e}")
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
import random
|
import random
|
||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
from backend.llms.dashscope_wrapper import dashscope_llm
|
from core.backend import dashscope_llm
|
||||||
from docx import Document
|
from docx import Document
|
||||||
from docx.oxml.ns import qn
|
from docx.oxml.ns import qn
|
||||||
from docx.shared import Pt, RGBColor
|
from docx.shared import Pt, RGBColor
|
||||||
|
10
env_sample
Executable file
10
env_sample
Executable file
@ -0,0 +1,10 @@
|
|||||||
|
export LLM_API_KEY=""
|
||||||
|
export LLM_API_BASE="https://api.siliconflow.cn/v1" ##for local model services or calling non-OpenAI services with openai_wrapper
|
||||||
|
##strongly recommended to use the following models provided by siliconflow (best balance of quality and price)
|
||||||
|
export GET_INFO_MODEL="zhipuai/glm4-9B-chat"
|
||||||
|
export REWRITE_MODEL="alibaba/Qwen2-7B-Instruct"
|
||||||
|
export HTML_PARSE_MODEL="deepseek-ai/deepseek-v2-chat"
|
||||||
|
export PROJECT_DIR="work_dir"
|
||||||
|
export PB_API_AUTH="test@example.com|123467890"
|
||||||
|
export "PB_API_BASE"="" ##only needed if your PocketBase is not running on 127.0.0.1:8090
|
||||||
|
export WS_LOG="verbose" ##for detailed log output. If not needed, just delete this line.
|
89
pb_api.py
89
pb_api.py
@ -1,89 +0,0 @@
|
|||||||
import os
|
|
||||||
from pocketbase import PocketBase # Client also works the same
|
|
||||||
from pocketbase.client import FileUpload
|
|
||||||
from typing import BinaryIO
|
|
||||||
|
|
||||||
|
|
||||||
class PbTalker:
|
|
||||||
def __init__(self, logger) -> None:
|
|
||||||
# 1. base initialization
|
|
||||||
url = "http://127.0.0.1:5882"
|
|
||||||
self.logger = logger
|
|
||||||
self.logger.debug(f"initializing pocketbase client: {url}")
|
|
||||||
self.client = PocketBase(url)
|
|
||||||
auth = os.environ.get('PB_API_AUTH', '')
|
|
||||||
if not auth or "|" not in auth:
|
|
||||||
self.logger.warnning("invalid email|password found, will handle with not auth, make sure you have set the collection rule by anyone")
|
|
||||||
else:
|
|
||||||
email, password = auth.split('|')
|
|
||||||
try:
|
|
||||||
admin_data = self.client.admins.auth_with_password(email, password)
|
|
||||||
if admin_data:
|
|
||||||
self.logger.info(f"pocketbase ready authenticated as admin - {email}")
|
|
||||||
except:
|
|
||||||
user_data = self.client.collection("users").auth_with_password(email, password)
|
|
||||||
if user_data:
|
|
||||||
self.logger.info(f"pocketbase ready authenticated as user - {email}")
|
|
||||||
else:
|
|
||||||
raise Exception("pocketbase auth failed")
|
|
||||||
|
|
||||||
def read(self, collection_name: str, fields: list[str] = None, filter: str = '', skiptotal: bool = True) -> list:
|
|
||||||
results = []
|
|
||||||
for i in range(1, 10):
|
|
||||||
try:
|
|
||||||
res = self.client.collection(collection_name).get_list(i, 500,
|
|
||||||
{"filter": filter,
|
|
||||||
"fields": ','.join(fields) if fields else '',
|
|
||||||
"skiptotal": skiptotal})
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
self.logger.error(f"pocketbase get list failed: {e}")
|
|
||||||
continue
|
|
||||||
if not res.items:
|
|
||||||
break
|
|
||||||
for _res in res.items:
|
|
||||||
attributes = vars(_res)
|
|
||||||
results.append(attributes)
|
|
||||||
return results
|
|
||||||
|
|
||||||
def add(self, collection_name: str, body: dict) -> str:
|
|
||||||
try:
|
|
||||||
res = self.client.collection(collection_name).create(body)
|
|
||||||
except Exception as e:
|
|
||||||
self.logger.error(f"pocketbase create failed: {e}")
|
|
||||||
return ''
|
|
||||||
return res.id
|
|
||||||
|
|
||||||
def update(self, collection_name: str, id: str, body: dict) -> str:
|
|
||||||
try:
|
|
||||||
res = self.client.collection(collection_name).update(id, body)
|
|
||||||
except Exception as e:
|
|
||||||
self.logger.error(f"pocketbase update failed: {e}")
|
|
||||||
return ''
|
|
||||||
return res.id
|
|
||||||
|
|
||||||
def delete(self, collection_name: str, id: str) -> str:
|
|
||||||
try:
|
|
||||||
res = self.client.collection(collection_name).delete(id)
|
|
||||||
except Exception as e:
|
|
||||||
self.logger.error(f"pocketbase update failed: {e}")
|
|
||||||
return 'failed'
|
|
||||||
if res:
|
|
||||||
return 'success'
|
|
||||||
return 'failed'
|
|
||||||
|
|
||||||
def upload(self, collection_name: str, id: str, key: str, file_name: str, file: BinaryIO) -> str:
|
|
||||||
try:
|
|
||||||
res = self.client.collection(collection_name).update(id, {key: FileUpload((file_name, file))})
|
|
||||||
except Exception as e:
|
|
||||||
self.logger.error(f"pocketbase update failed: {e}")
|
|
||||||
return ''
|
|
||||||
return res.id
|
|
||||||
|
|
||||||
def view(self, collection_name: str, item_id: str, fields: list[str] = None) -> dict:
|
|
||||||
try:
|
|
||||||
res = self.client.collection(collection_name).get_one(item_id,{"fields": ','.join(fields) if fields else ''})
|
|
||||||
return vars(res)
|
|
||||||
except Exception as e:
|
|
||||||
self.logger.error(f"pocketbase view item failed: {e}")
|
|
||||||
return {}
|
|
Loading…
Reference in New Issue
Block a user