mirror of
https://github.com/TeamWiseFlow/wiseflow.git
synced 2025-01-23 02:20:20 +08:00
rss and search study
This commit is contained in:
parent
7ac3b6f23e
commit
50332b1a09
@ -163,7 +163,7 @@ async def pre_process(raw_markdown: str, base_url: str, used_img: list[str],
|
||||
if ratio > 0.05:
|
||||
if test_mode:
|
||||
print('this is a navigation section, will be removed')
|
||||
print(ratio)
|
||||
print(ratio, '\n')
|
||||
print(section_remain)
|
||||
print('-' * 50)
|
||||
sections = sections[1:]
|
||||
@ -172,7 +172,7 @@ async def pre_process(raw_markdown: str, base_url: str, used_img: list[str],
|
||||
section_remain_len = len(section_remain)
|
||||
if section_remain_len < 198:
|
||||
if test_mode:
|
||||
print('this is a footer section, will be removed')
|
||||
print('this is a footer section, will be removed\n')
|
||||
print(section_remain_len)
|
||||
print(section_remain)
|
||||
print('-' * 50)
|
||||
@ -185,14 +185,14 @@ async def pre_process(raw_markdown: str, base_url: str, used_img: list[str],
|
||||
if ratio < 70:
|
||||
if test_mode:
|
||||
print('this is a links part')
|
||||
print(ratio)
|
||||
print(ratio, '\n')
|
||||
print(text)
|
||||
print('-' * 50)
|
||||
links_parts.append(text)
|
||||
else:
|
||||
if test_mode:
|
||||
print('this is a content part')
|
||||
print(ratio)
|
||||
print(ratio, '\n')
|
||||
print(text)
|
||||
print('-' * 50)
|
||||
contents.append(text)
|
||||
|
54
core/connects/__init__.py
Normal file
54
core/connects/__init__.py
Normal file
@ -0,0 +1,54 @@
|
||||
from exa_search import search_with_exa
|
||||
import time
|
||||
from pprint import pprint
|
||||
import requests
|
||||
import uuid
|
||||
|
||||
api_key = ''
|
||||
|
||||
def run_v4_sync(query: str):
    """Run a synchronous "web-search-pro" tool call against the BigModel v4 tools API.

    Posts *query* as a single user message and returns the first choice's
    ``message`` object from the JSON response.

    Args:
        query: natural-language search query to submit to the tool.

    Returns:
        The ``message`` dict of the first choice in the API response.
        NOTE(review): raises KeyError if the API returns an error payload
        without ``choices`` (e.g. when the module-level ``api_key`` is empty).
    """
    endpoint = "https://open.bigmodel.cn/api/paas/v4/tools"
    payload = {
        "request_id": str(uuid.uuid4()),  # unique id per request, as the API expects
        "tool": "web-search-pro",
        "stream": False,
        "messages": [{"role": "user", "content": query}],
    }
    response = requests.post(
        endpoint,
        json=payload,
        headers={'Authorization': api_key},
        timeout=300,
    )
    return response.json()['choices'][0]['message']
|
||||
|
||||
|
||||
# Ad-hoc benchmark queries for the search connectors (Chinese-language topics).
test_list = ['广东全省的台风预警——仅限2024年的信息',
             '大模型技术突破与创新——包括新算法与模型,新的研究成果',
             '事件图谱方面的知识',
             '人工智能领军人物介绍',
             '社区治理',
             '新获批的氢能项目——60万吨级别以上',
             '氢能项目招标信息——2024年12月以后',
             '各地住宅网签最新数据——2025年1月6日以后']


# Fix: this module is a package __init__.py, so the benchmark loop used to run
# live network calls on every `import core.connects`. Guard it so the driver
# only executes when the file is run as a script.
if __name__ == '__main__':
    for query in test_list:
        print(query)
        print('\n')
        print('test bigmodel...')
        start_time = time.time()
        print(run_v4_sync(query))
        end_time = time.time()
        print(f"bigmodel time: {end_time - start_time}")
        print('\n')
        print('*' * 25)
|
31
core/connects/exa_search.py
Normal file
31
core/connects/exa_search.py
Normal file
@ -0,0 +1,31 @@
|
||||
import httpx
|
||||
|
||||
# Request headers for the Exa API; the key is expected to be filled in by the
# operator (empty by default).
headers = {
    "x-api-key": "",
    "Content-Type": "application/json"
}


def search_with_exa(query: str,
                    num_results: int = 5,
                    start_date: str = "2024-12-01T00:00:00.000Z",
                    end_date: str = "2025-01-21T00:00:00.000Z") -> str:
    """Search recent news via the Exa /search endpoint and return the raw response body.

    Generalization: the previously hard-coded date window and result count are
    now keyword parameters whose defaults reproduce the original behavior.

    Args:
        query: search query string.
        num_results: number of results to request (default 5, as before).
        start_date: ISO-8601 lower bound applied to both crawl and published dates.
        end_date: ISO-8601 upper bound applied to both crawl and published dates.

    Returns:
        The response body as text (JSON string on success; Exa's error text
        otherwise — no status check is performed, matching the original).
    """
    url = "https://api.exa.ai/search"

    payload = {
        "query": query,
        "useAutoprompt": True,
        "type": "auto",
        "category": "news",
        "numResults": num_results,
        # The same window is applied to crawl dates and published dates.
        "startCrawlDate": start_date,
        "endCrawlDate": end_date,
        "startPublishedDate": start_date,
        "endPublishedDate": end_date,
        "contents": {
            "text": {
                "maxCharacters": 1000,
                "includeHtmlTags": False
            },
            # Always live-crawl so content snippets are fresh.
            "livecrawl": "always",
        }
    }

    response = httpx.post(url, json=payload, headers=headers, timeout=30)
    return response.text
|
21
core/connects/rss_connect.py
Normal file
21
core/connects/rss_connect.py
Normal file
@ -0,0 +1,21 @@
|
||||
import feedparser
|
||||
from loguru import logger
|
||||
import os, sys
|
||||
import json
|
||||
from urllib.parse import urlparse
|
||||
|
||||
core_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')
|
||||
sys.path.append(core_path)
|
||||
|
||||
from utils.general_utils import isURL
|
||||
|
||||
|
||||
def get_links_from_rss(rss_url: str, existing_urls: set, _logger=None) -> list[str]:
    """Parse an RSS feed and return entry links not already present in *existing_urls*.

    Fixes: the return annotation was the list literal ``[str]`` (not a valid
    type) and ``_logger`` was annotated with a loguru logger *instance*; both
    annotations are corrected. ``entry.link`` is replaced with ``entry.get``
    so entries lacking a link no longer raise AttributeError.

    Args:
        rss_url: URL (or local path) of the RSS/Atom feed to parse.
        existing_urls: set of URLs already known; matching links are skipped.
        _logger: optional loguru logger used to report parse failures.

    Returns:
        New entry links, in feed order; empty list if parsing fails.
    """
    try:
        # NOTE(review): feedparser.parse rarely raises — malformed feeds are
        # reported via feed.bozo instead; kept as-is to preserve behavior.
        feed = feedparser.parse(rss_url)
    except Exception as e:
        if _logger:
            _logger.warning(f"RSS feed is not valid: {e}")
        return []

    return [link for entry in feed.entries
            if (link := entry.get('link')) and link not in existing_urls]
|
@ -5,4 +5,5 @@ pydantic
|
||||
#json_repair==0.*
|
||||
beautifulsoup4
|
||||
requests
|
||||
crawl4ai==0.4.247
|
||||
crawl4ai==0.4.247
|
||||
feedparser==6.0.11
|
41
pb/pb_migrations/1737360418_updated_sites.js
Normal file
41
pb/pb_migrations/1737360418_updated_sites.js
Normal file
@ -0,0 +1,41 @@
|
||||
/// <reference path="../pb_data/types.d.ts" />
// Migration: drop the "per_hours" (number) and "activated" (bool) fields from
// collection pbc_2001081480 (the sites collection, per the file name).
migrate((app) => {
  const collection = app.findCollectionByNameOrId("pbc_2001081480")

  // remove field
  collection.fields.removeById("number1152796692")

  // remove field
  collection.fields.removeById("bool806155165")

  return app.save(collection)
}, (app) => {
  // Rollback: re-create both removed fields with their original definitions.
  const collection = app.findCollectionByNameOrId("pbc_2001081480")

  // add field
  collection.fields.addAt(2, new Field({
    "hidden": false,
    "id": "number1152796692",
    "max": null,
    "min": null,
    "name": "per_hours",
    "onlyInt": false,
    "presentable": false,
    "required": false,
    "system": false,
    "type": "number"
  }))

  // add field
  collection.fields.addAt(3, new Field({
    "hidden": false,
    "id": "bool806155165",
    "name": "activated",
    "presentable": false,
    "required": false,
    "system": false,
    "type": "bool"
  }))

  return app.save(collection)
})
|
29
pb/pb_migrations/1737360470_updated_sites.js
Normal file
29
pb/pb_migrations/1737360470_updated_sites.js
Normal file
@ -0,0 +1,29 @@
|
||||
/// <reference path="../pb_data/types.d.ts" />
// Migration: add a single-select "type" field ("web" | "rss") to collection
// pbc_2001081480 (the sites collection, per the file name).
migrate((app) => {
  const collection = app.findCollectionByNameOrId("pbc_2001081480")

  // add field
  collection.fields.addAt(2, new Field({
    "hidden": false,
    "id": "select2363381545",
    "maxSelect": 1,
    "name": "type",
    "presentable": false,
    "required": false,
    "system": false,
    "type": "select",
    "values": [
      "web",
      "rss"
    ]
  }))

  return app.save(collection)
}, (app) => {
  // Rollback: remove the "type" select field again.
  const collection = app.findCollectionByNameOrId("pbc_2001081480")

  // remove field
  collection.fields.removeById("select2363381545")

  return app.save(collection)
})
|
24
pb/pb_migrations/1737360517_updated_focus_points.js
Normal file
24
pb/pb_migrations/1737360517_updated_focus_points.js
Normal file
@ -0,0 +1,24 @@
|
||||
/// <reference path="../pb_data/types.d.ts" />
// Migration: drop the "activated" (bool) field from collection pbc_3385864241
// (the focus_points collection, per the file name).
migrate((app) => {
  const collection = app.findCollectionByNameOrId("pbc_3385864241")

  // remove field
  collection.fields.removeById("bool806155165")

  return app.save(collection)
}, (app) => {
  // Rollback: re-create the "activated" bool field at its original position.
  const collection = app.findCollectionByNameOrId("pbc_3385864241")

  // add field
  collection.fields.addAt(3, new Field({
    "hidden": false,
    "id": "bool806155165",
    "name": "activated",
    "presentable": false,
    "required": false,
    "system": false,
    "type": "bool"
  }))

  return app.save(collection)
})
|
78
pb/pb_migrations/1737360603_created_task.js
Normal file
78
pb/pb_migrations/1737360603_created_task.js
Normal file
@ -0,0 +1,78 @@
|
||||
/// <reference path="../pb_data/types.d.ts" />
// Migration: create the "task" collection (id pbc_1970519189) with an
// activation flag, a per-hour scheduling number, and created/updated autodates.
migrate((app) => {
  const collection = new Collection({
    "createRule": null,
    "deleteRule": null,
    "fields": [
      {
        // Standard PocketBase primary-key text id (15 lowercase alphanumerics).
        "autogeneratePattern": "[a-z0-9]{15}",
        "hidden": false,
        "id": "text3208210256",
        "max": 15,
        "min": 15,
        "name": "id",
        "pattern": "^[a-z0-9]+$",
        "presentable": false,
        "primaryKey": true,
        "required": true,
        "system": true,
        "type": "text"
      },
      {
        // Whether the task is enabled.
        "hidden": false,
        "id": "bool806155165",
        "name": "activated",
        "presentable": false,
        "required": false,
        "system": false,
        "type": "bool"
      },
      {
        // Required scheduling frequency; presumably runs-per-hour — confirm
        // against the scheduler that consumes it.
        "hidden": false,
        "id": "number3171882809",
        "max": null,
        "min": null,
        "name": "per_hour",
        "onlyInt": false,
        "presentable": false,
        "required": true,
        "system": false,
        "type": "number"
      },
      {
        // Set once at record creation.
        "hidden": false,
        "id": "autodate2990389176",
        "name": "created",
        "onCreate": true,
        "onUpdate": false,
        "presentable": false,
        "system": false,
        "type": "autodate"
      },
      {
        // Refreshed on every update.
        "hidden": false,
        "id": "autodate3332085495",
        "name": "updated",
        "onCreate": true,
        "onUpdate": true,
        "presentable": false,
        "system": false,
        "type": "autodate"
      }
    ],
    "id": "pbc_1970519189",
    "indexes": [],
    "listRule": null,
    "name": "task",
    "system": false,
    "type": "base",
    "updateRule": null,
    "viewRule": null
  });

  return app.save(collection);
}, (app) => {
  // Rollback: delete the task collection entirely.
  const collection = app.findCollectionByNameOrId("pbc_1970519189");

  return app.delete(collection);
})
|
65
pb/pb_migrations/1737360712_updated_task.js
Normal file
65
pb/pb_migrations/1737360712_updated_task.js
Normal file
@ -0,0 +1,65 @@
|
||||
/// <reference path="../pb_data/types.d.ts" />
// Migration: extend the "task" collection (pbc_1970519189) with relations to
// focus_points (pbc_3385864241) and sites (pbc_2001081480), plus a free-text
// "search_engine_keywords" field.
migrate((app) => {
  const collection = app.findCollectionByNameOrId("pbc_1970519189")

  // add field
  collection.fields.addAt(3, new Field({
    "cascadeDelete": false,
    "collectionId": "pbc_3385864241",
    "hidden": false,
    "id": "relation2655548471",
    "maxSelect": 999,
    "minSelect": 0,
    "name": "focus_points",
    "presentable": false,
    "required": false,
    "system": false,
    "type": "relation"
  }))

  // add field
  collection.fields.addAt(4, new Field({
    "cascadeDelete": false,
    "collectionId": "pbc_2001081480",
    "hidden": false,
    "id": "relation3154160227",
    "maxSelect": 999,
    "minSelect": 0,
    "name": "sites",
    "presentable": false,
    "required": false,
    "system": false,
    "type": "relation"
  }))

  // add field
  collection.fields.addAt(5, new Field({
    "autogeneratePattern": "",
    "hidden": false,
    "id": "text2870082381",
    "max": 0,
    "min": 0,
    "name": "search_engine_keywords",
    "pattern": "",
    "presentable": false,
    "primaryKey": false,
    "required": false,
    "system": false,
    "type": "text"
  }))

  return app.save(collection)
}, (app) => {
  // Rollback: remove all three fields added above.
  const collection = app.findCollectionByNameOrId("pbc_1970519189")

  // remove field
  collection.fields.removeById("relation2655548471")

  // remove field
  collection.fields.removeById("relation3154160227")

  // remove field
  collection.fields.removeById("text2870082381")

  return app.save(collection)
})
|
@ -10,6 +10,9 @@ sys.path.append(core_path)
|
||||
from scrapers import *
|
||||
from agents.get_info import pre_process
|
||||
|
||||
|
||||
save_dir = 'webpage_samples'
|
||||
|
||||
def check_url_text(text):
|
||||
common_chars = ',.!;:,;:、一二三四五六七八九十#*@% \t\n\r|*-_…>#'
|
||||
print(f"processing: {text}")
|
||||
@ -118,7 +121,7 @@ if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--test_file', '-F', type=str, default='')
|
||||
parser.add_argument('--sample_dir', '-D', type=str, default='')
|
||||
parser.add_argument('--record_folder', '-R', type=str, default='')
|
||||
parser.add_argument('--record_folder', '-R', type=str, default=save_dir)
|
||||
args = parser.parse_args()
|
||||
|
||||
test_file = args.test_file
|
||||
|
Loading…
Reference in New Issue
Block a user