feat(core): update pb data sheet structure

This commit is contained in:
bigbrother666sh 2024-12-06 14:14:28 +08:00
parent 3e4454a33b
commit 8c64749ba7
7 changed files with 249 additions and 20 deletions

View File

@ -23,7 +23,7 @@ class GeneralInfoExtractor:
focus = input('It seems you have not set any focus point, WiseFlow need the specific focus point to guide the following info extract job.'
'so please input one now. describe what info you care about shortly: ')
explanation = input('Please provide more explanation for the focus point (if not necessary, pls just type enter: ')
focus_data.append({"name": focus, "explaination": explanation,
focus_data.append({"name": focus, "explanation": explanation,
"id": pb.add('focus_points', {"focuspoint": focus, "explanation": explanation})})
# self.focus_list = [item["focuspoint"] for item in focus_data]

View File

@ -23,10 +23,6 @@ wiseflow_logger = get_logger('general_process', f'{project_dir}/general_process.
pb = PbTalker(wiseflow_logger)
gie = GeneralInfoExtractor(pb, wiseflow_logger)
# Global variables
working_list = set()
existing_urls = {url['url'] for url in pb.read(collection_name='articles', fields=['url']) if url['url']}
lock = asyncio.Lock()
async def save_to_pb(article: dict, infos: list):
# saving to pb process
@ -57,7 +53,9 @@ async def save_to_pb(article: dict, infos: list):
async def pipeline(url: str):
global working_list, existing_urls
working_list = set()
existing_urls = {url['url'] for url in pb.read(collection_name='articles', fields=['url']) if url['url']}
lock = asyncio.Lock()
working_list.add(url)
crawler = PlaywrightCrawler(
# Limit the crawl to max requests. Remove or increase it for crawling all links.
@ -179,6 +177,9 @@ async def pipeline(url: str):
if __name__ == '__main__':
import asyncio
sites = pb.read('sites', filter='activated=True')
wiseflow_logger.info('execute all sites one time')
async def run_all_sites():
await asyncio.gather(*[pipeline(site['url'].rstrip('/')) for site in sites])
asyncio.run(pipeline())
asyncio.run(run_all_sites())

View File

@ -0,0 +1,89 @@
/// <reference path="../pb_data/types.d.ts" />
// Up: create the "sites" base collection (id, url, per_hours, activated,
// created, updated) with every API rule left null.
// Down: look the collection up by its id and delete it.
migrate((app) => {
  // Primary key: required system text field, exactly 15 chars, auto-generated.
  const idField = {
    autogeneratePattern: "[a-z0-9]{15}",
    hidden: false,
    id: "text3208210256",
    max: 15,
    min: 15,
    name: "id",
    pattern: "^[a-z0-9]+$",
    presentable: false,
    primaryKey: true,
    required: true,
    system: true,
    type: "text"
  };

  // Required URL field with no domain allow/deny lists.
  const urlField = {
    exceptDomains: [],
    hidden: false,
    id: "url4101391790",
    name: "url",
    onlyDomains: [],
    presentable: false,
    required: true,
    system: false,
    type: "url"
  };

  // Optional number field: no min/max bounds, non-integer values allowed.
  const perHoursField = {
    hidden: false,
    id: "number1152796692",
    max: null,
    min: null,
    name: "per_hours",
    onlyInt: false,
    presentable: false,
    required: false,
    system: false,
    type: "number"
  };

  // Optional boolean flag.
  const activatedField = {
    hidden: false,
    id: "bool806155165",
    name: "activated",
    presentable: false,
    required: false,
    system: false,
    type: "bool"
  };

  // Autodate set on create only.
  const createdField = {
    hidden: false,
    id: "autodate2990389176",
    name: "created",
    onCreate: true,
    onUpdate: false,
    presentable: false,
    system: false,
    type: "autodate"
  };

  // Autodate set on create and refreshed on every update.
  const updatedField = {
    hidden: false,
    id: "autodate3332085495",
    name: "updated",
    onCreate: true,
    onUpdate: true,
    presentable: false,
    system: false,
    type: "autodate"
  };

  const sites = new Collection({
    createRule: null,
    deleteRule: null,
    fields: [
      idField,
      urlField,
      perHoursField,
      activatedField,
      createdField,
      updatedField
    ],
    id: "pbc_2001081480",
    indexes: [],
    listRule: null,
    name: "sites",
    system: false,
    type: "base",
    updateRule: null,
    viewRule: null
  });

  return app.save(sites);
}, (app) => app.delete(app.findCollectionByNameOrId("pbc_2001081480")))

View File

@ -0,0 +1,28 @@
/// <reference path="../pb_data/types.d.ts" />
// Up: insert a "tags" relation field at field position 6 of collection
// pbc_4287850865. Down: remove that field again by its field id.
migrate((app) => {
  const target = app.findCollectionByNameOrId("pbc_4287850865")

  // Optional relation to collection pbc_3385864241, 0 to 999 selections,
  // no cascade delete.
  const tagsField = new Field({
    cascadeDelete: false,
    collectionId: "pbc_3385864241",
    hidden: false,
    id: "relation1874629670",
    maxSelect: 999,
    minSelect: 0,
    name: "tags",
    presentable: false,
    required: false,
    system: false,
    type: "relation"
  })

  target.fields.addAt(6, tagsField)

  return app.save(target)
}, (app) => {
  const target = app.findCollectionByNameOrId("pbc_4287850865")

  // Drop the field added by the up migration.
  target.fields.removeById("relation1874629670")

  return app.save(target)
})

View File

@ -0,0 +1,111 @@
/// <reference path="../pb_data/types.d.ts" />
// Up: create the "infos" base collection (id, content, tag, articles, report,
// created, updated) with every API rule left null.
// Down: look the collection up by its id and delete it.
migrate((app) => {
  // Primary key: required system text field, exactly 15 chars, auto-generated.
  const idField = {
    autogeneratePattern: "[a-z0-9]{15}",
    hidden: false,
    id: "text3208210256",
    max: 15,
    min: 15,
    name: "id",
    pattern: "^[a-z0-9]+$",
    presentable: false,
    primaryKey: true,
    required: true,
    system: true,
    type: "text"
  };

  // Required free-form text; min/max are 0 and no pattern is set.
  const contentField = {
    autogeneratePattern: "",
    hidden: false,
    id: "text4274335913",
    max: 0,
    min: 0,
    name: "content",
    pattern: "",
    presentable: false,
    primaryKey: false,
    required: true,
    system: false,
    type: "text"
  };

  // Optional single-select relation to collection pbc_3385864241.
  const tagField = {
    cascadeDelete: false,
    collectionId: "pbc_3385864241",
    hidden: false,
    id: "relation59357059",
    maxSelect: 1,
    minSelect: 0,
    name: "tag",
    presentable: false,
    required: false,
    system: false,
    type: "relation"
  };

  // Optional relation to collection pbc_4287850865, up to 999 selections.
  const articlesField = {
    cascadeDelete: false,
    collectionId: "pbc_4287850865",
    hidden: false,
    id: "relation3218944360",
    maxSelect: 999,
    minSelect: 0,
    name: "articles",
    presentable: false,
    required: false,
    system: false,
    type: "relation"
  };

  // Optional single file attachment; no MIME type list and no thumbs.
  const reportField = {
    hidden: false,
    id: "file3291445124",
    maxSelect: 1,
    maxSize: 0,
    mimeTypes: [],
    name: "report",
    presentable: false,
    protected: false,
    required: false,
    system: false,
    thumbs: [],
    type: "file"
  };

  // Autodate set on create only.
  const createdField = {
    hidden: false,
    id: "autodate2990389176",
    name: "created",
    onCreate: true,
    onUpdate: false,
    presentable: false,
    system: false,
    type: "autodate"
  };

  // Autodate set on create and refreshed on every update.
  const updatedField = {
    hidden: false,
    id: "autodate3332085495",
    name: "updated",
    onCreate: true,
    onUpdate: true,
    presentable: false,
    system: false,
    type: "autodate"
  };

  const infos = new Collection({
    createRule: null,
    deleteRule: null,
    fields: [
      idField,
      contentField,
      tagField,
      articlesField,
      reportField,
      createdField,
      updatedField
    ],
    id: "pbc_629947526",
    indexes: [],
    listRule: null,
    name: "infos",
    system: false,
    type: "base",
    updateRule: null,
    viewRule: null
  });

  return app.save(infos);
}, (app) => app.delete(app.findCollectionByNameOrId("pbc_629947526")))

View File

@ -1,5 +1,5 @@
import asyncio
from agents import pipeline, pb, logger
from general_process import pipeline, pb, wiseflow_logger
counter = 1
@ -8,7 +8,7 @@ async def process_site(site, counter):
if not site['per_hours'] or not site['url']:
return
if counter % site['per_hours'] == 0:
logger.info(f"applying {site['url']}")
wiseflow_logger.info(f"applying {site['url']}")
await pipeline(site['url'].rstrip('/'))
@ -16,11 +16,11 @@ async def schedule_pipeline(interval):
global counter
while True:
sites = pb.read('sites', filter='activated=True')
logger.info(f'task execute loop {counter}')
wiseflow_logger.info(f'task execute loop {counter}')
await asyncio.gather(*[process_site(site, counter) for site in sites])
counter += 1
logger.info(f'task execute loop finished, work after {interval} seconds')
wiseflow_logger.info(f'task execute loop finished, work after {interval} seconds')
await asyncio.sleep(interval)

View File

@ -1,10 +1,10 @@
export LLM_API_KEY=""
export LLM_API_BASE="https://api.siliconflow.cn/v1" ##for local model services or calling non-OpenAI services with openai_wrapper
##strongly recommended to use the following model provided by siliconflow (consider both effect and price)
export GET_INFO_MODEL="THUDM/glm-4-9b-chat" ##
export REWRITE_MODEL="Qwen/Qwen2-7B-Instruct"
export HTML_PARSE_MODEL="aQwen/Qwen2-7B-Instruct"
export LLM_API_BASE="https://api.siliconflow.cn/v1"
export PB_API_AUTH="test@example.com|1234567890" ##your pb superuser account and password
export VERBOSE="true" ##enables detailed log output; remove this line if you do not need it.
##the settings below are optional; set them as needed
#export PRIMARY_MODEL="Qwen/Qwen2.5-14B-Instruct"
#export SECONDARY_MODEL="THUDM/glm-4-9b-chat"
export PROJECT_DIR="work_dir"
export PB_API_AUTH="test@example.com|1234567890"
# export "PB_API_BASE"="" ##only use if your pb not run on 127.0.0.1:8090
export WS_LOG="verbose" ##for detail log info. If not need, just delete this item.
#export "PB_API_BASE"="" ##only needed if your pb is not running on 127.0.0.1:8090