mirror of
https://github.com/TeamWiseFlow/wiseflow.git
synced 2025-01-23 10:50:25 +08:00
feat(core): update pb data sheet structure
This commit is contained in:
parent
3e4454a33b
commit
8c64749ba7
@ -23,7 +23,7 @@ class GeneralInfoExtractor:
|
|||||||
focus = input('It seems you have not set any focus point, WiseFlow need the specific focus point to guide the following info extract job.'
|
focus = input('It seems you have not set any focus point, WiseFlow need the specific focus point to guide the following info extract job.'
|
||||||
'so please input one now. describe what info you care about shortly: ')
|
'so please input one now. describe what info you care about shortly: ')
|
||||||
explanation = input('Please provide more explanation for the focus point (if not necessary, pls just type enter: ')
|
explanation = input('Please provide more explanation for the focus point (if not necessary, pls just type enter: ')
|
||||||
focus_data.append({"name": focus, "explaination": explanation,
|
focus_data.append({"name": focus, "explanation": explanation,
|
||||||
"id": pb.add('focus_points', {"focuspoint": focus, "explanation": explanation})})
|
"id": pb.add('focus_points', {"focuspoint": focus, "explanation": explanation})})
|
||||||
|
|
||||||
# self.focus_list = [item["focuspoint"] for item in focus_data]
|
# self.focus_list = [item["focuspoint"] for item in focus_data]
|
||||||
|
@ -23,10 +23,6 @@ wiseflow_logger = get_logger('general_process', f'{project_dir}/general_process.
|
|||||||
pb = PbTalker(wiseflow_logger)
|
pb = PbTalker(wiseflow_logger)
|
||||||
gie = GeneralInfoExtractor(pb, wiseflow_logger)
|
gie = GeneralInfoExtractor(pb, wiseflow_logger)
|
||||||
|
|
||||||
# Global variables
|
|
||||||
working_list = set()
|
|
||||||
existing_urls = {url['url'] for url in pb.read(collection_name='articles', fields=['url']) if url['url']}
|
|
||||||
lock = asyncio.Lock()
|
|
||||||
|
|
||||||
async def save_to_pb(article: dict, infos: list):
|
async def save_to_pb(article: dict, infos: list):
|
||||||
# saving to pb process
|
# saving to pb process
|
||||||
@ -57,7 +53,9 @@ async def save_to_pb(article: dict, infos: list):
|
|||||||
|
|
||||||
|
|
||||||
async def pipeline(url: str):
|
async def pipeline(url: str):
|
||||||
global working_list, existing_urls
|
working_list = set()
|
||||||
|
existing_urls = {url['url'] for url in pb.read(collection_name='articles', fields=['url']) if url['url']}
|
||||||
|
lock = asyncio.Lock()
|
||||||
working_list.add(url)
|
working_list.add(url)
|
||||||
crawler = PlaywrightCrawler(
|
crawler = PlaywrightCrawler(
|
||||||
# Limit the crawl to max requests. Remove or increase it for crawling all links.
|
# Limit the crawl to max requests. Remove or increase it for crawling all links.
|
||||||
@ -179,6 +177,9 @@ async def pipeline(url: str):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
import asyncio
|
sites = pb.read('sites', filter='activated=True')
|
||||||
|
wiseflow_logger.info('execute all sites one time')
|
||||||
|
async def run_all_sites():
|
||||||
|
await asyncio.gather(*[pipeline(site['url'].rstrip('/')) for site in sites])
|
||||||
|
|
||||||
asyncio.run(pipeline())
|
asyncio.run(run_all_sites())
|
||||||
|
89
core/pb/pb_migrations/1733465276_created_sites.js
Normal file
89
core/pb/pb_migrations/1733465276_created_sites.js
Normal file
@ -0,0 +1,89 @@
|
|||||||
|
/// <reference path="../pb_data/types.d.ts" />
|
||||||
|
migrate((app) => {
|
||||||
|
const collection = new Collection({
|
||||||
|
"createRule": null,
|
||||||
|
"deleteRule": null,
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"autogeneratePattern": "[a-z0-9]{15}",
|
||||||
|
"hidden": false,
|
||||||
|
"id": "text3208210256",
|
||||||
|
"max": 15,
|
||||||
|
"min": 15,
|
||||||
|
"name": "id",
|
||||||
|
"pattern": "^[a-z0-9]+$",
|
||||||
|
"presentable": false,
|
||||||
|
"primaryKey": true,
|
||||||
|
"required": true,
|
||||||
|
"system": true,
|
||||||
|
"type": "text"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"exceptDomains": [],
|
||||||
|
"hidden": false,
|
||||||
|
"id": "url4101391790",
|
||||||
|
"name": "url",
|
||||||
|
"onlyDomains": [],
|
||||||
|
"presentable": false,
|
||||||
|
"required": true,
|
||||||
|
"system": false,
|
||||||
|
"type": "url"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"hidden": false,
|
||||||
|
"id": "number1152796692",
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"name": "per_hours",
|
||||||
|
"onlyInt": false,
|
||||||
|
"presentable": false,
|
||||||
|
"required": false,
|
||||||
|
"system": false,
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"hidden": false,
|
||||||
|
"id": "bool806155165",
|
||||||
|
"name": "activated",
|
||||||
|
"presentable": false,
|
||||||
|
"required": false,
|
||||||
|
"system": false,
|
||||||
|
"type": "bool"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"hidden": false,
|
||||||
|
"id": "autodate2990389176",
|
||||||
|
"name": "created",
|
||||||
|
"onCreate": true,
|
||||||
|
"onUpdate": false,
|
||||||
|
"presentable": false,
|
||||||
|
"system": false,
|
||||||
|
"type": "autodate"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"hidden": false,
|
||||||
|
"id": "autodate3332085495",
|
||||||
|
"name": "updated",
|
||||||
|
"onCreate": true,
|
||||||
|
"onUpdate": true,
|
||||||
|
"presentable": false,
|
||||||
|
"system": false,
|
||||||
|
"type": "autodate"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"id": "pbc_2001081480",
|
||||||
|
"indexes": [],
|
||||||
|
"listRule": null,
|
||||||
|
"name": "sites",
|
||||||
|
"system": false,
|
||||||
|
"type": "base",
|
||||||
|
"updateRule": null,
|
||||||
|
"viewRule": null
|
||||||
|
});
|
||||||
|
|
||||||
|
return app.save(collection);
|
||||||
|
}, (app) => {
|
||||||
|
const collection = app.findCollectionByNameOrId("pbc_2001081480");
|
||||||
|
|
||||||
|
return app.delete(collection);
|
||||||
|
})
|
28
core/pb/pb_migrations/1733465426_updated_articles.js
Normal file
28
core/pb/pb_migrations/1733465426_updated_articles.js
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
/// <reference path="../pb_data/types.d.ts" />
|
||||||
|
migrate((app) => {
|
||||||
|
const collection = app.findCollectionByNameOrId("pbc_4287850865")
|
||||||
|
|
||||||
|
// add field
|
||||||
|
collection.fields.addAt(6, new Field({
|
||||||
|
"cascadeDelete": false,
|
||||||
|
"collectionId": "pbc_3385864241",
|
||||||
|
"hidden": false,
|
||||||
|
"id": "relation1874629670",
|
||||||
|
"maxSelect": 999,
|
||||||
|
"minSelect": 0,
|
||||||
|
"name": "tags",
|
||||||
|
"presentable": false,
|
||||||
|
"required": false,
|
||||||
|
"system": false,
|
||||||
|
"type": "relation"
|
||||||
|
}))
|
||||||
|
|
||||||
|
return app.save(collection)
|
||||||
|
}, (app) => {
|
||||||
|
const collection = app.findCollectionByNameOrId("pbc_4287850865")
|
||||||
|
|
||||||
|
// remove field
|
||||||
|
collection.fields.removeById("relation1874629670")
|
||||||
|
|
||||||
|
return app.save(collection)
|
||||||
|
})
|
111
core/pb/pb_migrations/1733465563_created_infos.js
Normal file
111
core/pb/pb_migrations/1733465563_created_infos.js
Normal file
@ -0,0 +1,111 @@
|
|||||||
|
/// <reference path="../pb_data/types.d.ts" />
|
||||||
|
migrate((app) => {
|
||||||
|
const collection = new Collection({
|
||||||
|
"createRule": null,
|
||||||
|
"deleteRule": null,
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"autogeneratePattern": "[a-z0-9]{15}",
|
||||||
|
"hidden": false,
|
||||||
|
"id": "text3208210256",
|
||||||
|
"max": 15,
|
||||||
|
"min": 15,
|
||||||
|
"name": "id",
|
||||||
|
"pattern": "^[a-z0-9]+$",
|
||||||
|
"presentable": false,
|
||||||
|
"primaryKey": true,
|
||||||
|
"required": true,
|
||||||
|
"system": true,
|
||||||
|
"type": "text"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"autogeneratePattern": "",
|
||||||
|
"hidden": false,
|
||||||
|
"id": "text4274335913",
|
||||||
|
"max": 0,
|
||||||
|
"min": 0,
|
||||||
|
"name": "content",
|
||||||
|
"pattern": "",
|
||||||
|
"presentable": false,
|
||||||
|
"primaryKey": false,
|
||||||
|
"required": true,
|
||||||
|
"system": false,
|
||||||
|
"type": "text"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cascadeDelete": false,
|
||||||
|
"collectionId": "pbc_3385864241",
|
||||||
|
"hidden": false,
|
||||||
|
"id": "relation59357059",
|
||||||
|
"maxSelect": 1,
|
||||||
|
"minSelect": 0,
|
||||||
|
"name": "tag",
|
||||||
|
"presentable": false,
|
||||||
|
"required": false,
|
||||||
|
"system": false,
|
||||||
|
"type": "relation"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cascadeDelete": false,
|
||||||
|
"collectionId": "pbc_4287850865",
|
||||||
|
"hidden": false,
|
||||||
|
"id": "relation3218944360",
|
||||||
|
"maxSelect": 999,
|
||||||
|
"minSelect": 0,
|
||||||
|
"name": "articles",
|
||||||
|
"presentable": false,
|
||||||
|
"required": false,
|
||||||
|
"system": false,
|
||||||
|
"type": "relation"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"hidden": false,
|
||||||
|
"id": "file3291445124",
|
||||||
|
"maxSelect": 1,
|
||||||
|
"maxSize": 0,
|
||||||
|
"mimeTypes": [],
|
||||||
|
"name": "report",
|
||||||
|
"presentable": false,
|
||||||
|
"protected": false,
|
||||||
|
"required": false,
|
||||||
|
"system": false,
|
||||||
|
"thumbs": [],
|
||||||
|
"type": "file"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"hidden": false,
|
||||||
|
"id": "autodate2990389176",
|
||||||
|
"name": "created",
|
||||||
|
"onCreate": true,
|
||||||
|
"onUpdate": false,
|
||||||
|
"presentable": false,
|
||||||
|
"system": false,
|
||||||
|
"type": "autodate"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"hidden": false,
|
||||||
|
"id": "autodate3332085495",
|
||||||
|
"name": "updated",
|
||||||
|
"onCreate": true,
|
||||||
|
"onUpdate": true,
|
||||||
|
"presentable": false,
|
||||||
|
"system": false,
|
||||||
|
"type": "autodate"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"id": "pbc_629947526",
|
||||||
|
"indexes": [],
|
||||||
|
"listRule": null,
|
||||||
|
"name": "infos",
|
||||||
|
"system": false,
|
||||||
|
"type": "base",
|
||||||
|
"updateRule": null,
|
||||||
|
"viewRule": null
|
||||||
|
});
|
||||||
|
|
||||||
|
return app.save(collection);
|
||||||
|
}, (app) => {
|
||||||
|
const collection = app.findCollectionByNameOrId("pbc_629947526");
|
||||||
|
|
||||||
|
return app.delete(collection);
|
||||||
|
})
|
@ -1,5 +1,5 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
from agents import pipeline, pb, logger
|
from general_process import pipeline, pb, wiseflow_logger
|
||||||
|
|
||||||
counter = 1
|
counter = 1
|
||||||
|
|
||||||
@ -8,7 +8,7 @@ async def process_site(site, counter):
|
|||||||
if not site['per_hours'] or not site['url']:
|
if not site['per_hours'] or not site['url']:
|
||||||
return
|
return
|
||||||
if counter % site['per_hours'] == 0:
|
if counter % site['per_hours'] == 0:
|
||||||
logger.info(f"applying {site['url']}")
|
wiseflow_logger.info(f"applying {site['url']}")
|
||||||
await pipeline(site['url'].rstrip('/'))
|
await pipeline(site['url'].rstrip('/'))
|
||||||
|
|
||||||
|
|
||||||
@ -16,11 +16,11 @@ async def schedule_pipeline(interval):
|
|||||||
global counter
|
global counter
|
||||||
while True:
|
while True:
|
||||||
sites = pb.read('sites', filter='activated=True')
|
sites = pb.read('sites', filter='activated=True')
|
||||||
logger.info(f'task execute loop {counter}')
|
wiseflow_logger.info(f'task execute loop {counter}')
|
||||||
await asyncio.gather(*[process_site(site, counter) for site in sites])
|
await asyncio.gather(*[process_site(site, counter) for site in sites])
|
||||||
|
|
||||||
counter += 1
|
counter += 1
|
||||||
logger.info(f'task execute loop finished, work after {interval} seconds')
|
wiseflow_logger.info(f'task execute loop finished, work after {interval} seconds')
|
||||||
await asyncio.sleep(interval)
|
await asyncio.sleep(interval)
|
||||||
|
|
||||||
|
|
||||||
|
16
env_sample
16
env_sample
@ -1,10 +1,10 @@
|
|||||||
export LLM_API_KEY=""
|
export LLM_API_KEY=""
|
||||||
export LLM_API_BASE="https://api.siliconflow.cn/v1" ##for local model services or calling non-OpenAI services with openai_wrapper
|
export LLM_API_BASE="https://api.siliconflow.cn/v1"
|
||||||
##strongly recommended to use the following model provided by siliconflow (consider both effect and price)
|
export PB_API_AUTH="test@example.com|1234567890" ##your pb superuser account and password
|
||||||
export GET_INFO_MODEL="THUDM/glm-4-9b-chat" ##
|
export VERBOSE="true" ##for detailed log info. If not needed, remove this item.
|
||||||
export REWRITE_MODEL="Qwen/Qwen2-7B-Instruct"
|
|
||||||
export HTML_PARSE_MODEL="aQwen/Qwen2-7B-Instruct"
|
##the following settings are optional; set them as needed
|
||||||
|
#export PRIMARY_MODEL="Qwen/Qwen2.5-14B-Instruct"
|
||||||
|
#export SECONDARY_MODEL="THUDM/glm-4-9b-chat"
|
||||||
export PROJECT_DIR="work_dir"
|
export PROJECT_DIR="work_dir"
|
||||||
export PB_API_AUTH="test@example.com|1234567890"
|
#export "PB_API_BASE"="" ##only needed if your pb is not running on 127.0.0.1:8090
|
||||||
# export "PB_API_BASE"="" ##only use if your pb not run on 127.0.0.1:8090
|
|
||||||
export WS_LOG="verbose" ##for detail log info. If not need, just delete this item.
|
|
Loading…
Reference in New Issue
Block a user