mirror of
https://github.com/TeamWiseFlow/wiseflow.git
synced 2025-02-02 18:28:46 +08:00
rss and search study
This commit is contained in:
parent
7ac3b6f23e
commit
50332b1a09
@ -163,7 +163,7 @@ async def pre_process(raw_markdown: str, base_url: str, used_img: list[str],
|
|||||||
if ratio > 0.05:
|
if ratio > 0.05:
|
||||||
if test_mode:
|
if test_mode:
|
||||||
print('this is a navigation section, will be removed')
|
print('this is a navigation section, will be removed')
|
||||||
print(ratio)
|
print(ratio, '\n')
|
||||||
print(section_remain)
|
print(section_remain)
|
||||||
print('-' * 50)
|
print('-' * 50)
|
||||||
sections = sections[1:]
|
sections = sections[1:]
|
||||||
@ -172,7 +172,7 @@ async def pre_process(raw_markdown: str, base_url: str, used_img: list[str],
|
|||||||
section_remain_len = len(section_remain)
|
section_remain_len = len(section_remain)
|
||||||
if section_remain_len < 198:
|
if section_remain_len < 198:
|
||||||
if test_mode:
|
if test_mode:
|
||||||
print('this is a footer section, will be removed')
|
print('this is a footer section, will be removed\n')
|
||||||
print(section_remain_len)
|
print(section_remain_len)
|
||||||
print(section_remain)
|
print(section_remain)
|
||||||
print('-' * 50)
|
print('-' * 50)
|
||||||
@ -185,14 +185,14 @@ async def pre_process(raw_markdown: str, base_url: str, used_img: list[str],
|
|||||||
if ratio < 70:
|
if ratio < 70:
|
||||||
if test_mode:
|
if test_mode:
|
||||||
print('this is a links part')
|
print('this is a links part')
|
||||||
print(ratio)
|
print(ratio, '\n')
|
||||||
print(text)
|
print(text)
|
||||||
print('-' * 50)
|
print('-' * 50)
|
||||||
links_parts.append(text)
|
links_parts.append(text)
|
||||||
else:
|
else:
|
||||||
if test_mode:
|
if test_mode:
|
||||||
print('this is a content part')
|
print('this is a content part')
|
||||||
print(ratio)
|
print(ratio, '\n')
|
||||||
print(text)
|
print(text)
|
||||||
print('-' * 50)
|
print('-' * 50)
|
||||||
contents.append(text)
|
contents.append(text)
|
||||||
|
54
core/connects/__init__.py
Normal file
54
core/connects/__init__.py
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
from exa_search import search_with_exa
|
||||||
|
import time
|
||||||
|
from pprint import pprint
|
||||||
|
import requests
|
||||||
|
import uuid
|
||||||
|
|
||||||
|
# Credential sent verbatim as the Authorization header by run_v4_sync; it is an empty placeholder in this file.
api_key = ''
|
||||||
|
|
||||||
|
def run_v4_sync(query: str):
    """Run one synchronous query through the BigModel ``web-search-pro`` tool.

    Posts a single user message to the v4 tools endpoint, using the
    module-level ``api_key`` as the Authorization header, and blocks until the
    reply arrives (request timeout: 300 seconds).

    Args:
        query: Text sent as the content of the only user message.

    Returns:
        The ``message`` entry of the first element of ``choices`` in the JSON
        response.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    url = "https://open.bigmodel.cn/api/paas/v4/tools"
    data = {
        "request_id": str(uuid.uuid4()),  # fresh id for every call
        "tool": "web-search-pro",
        "stream": False,
        "messages": [
            {
                "role": "user",
                "content": query,
            }
        ],
    }

    resp = requests.post(
        url,
        json=data,
        headers={'Authorization': api_key},
        timeout=300,
    )
    # Fail loudly on HTTP errors (e.g. a missing or rejected key) instead of
    # letting the error payload surface below as an opaque KeyError.
    resp.raise_for_status()
    result = resp.json()
    return result['choices'][0]['message']
|
||||||
|
|
||||||
|
|
||||||
|
# Sample queries fed to run_v4_sync by the manual timing loop below.
test_list = [
    '广东全省的台风预警——仅限2024年的信息',
    '大模型技术突破与创新——包括新算法与模型,新的研究成果',
    '事件图谱方面的知识',
    '人工智能领军人物介绍',
    '社区治理',
    '新获批的氢能项目——60万吨级别以上',
    '氢能项目招标信息——2024年12月以后',
    '各地住宅网签最新数据——2025年1月6日以后',
]

# This file is a package __init__, so the timing loop must not run on import:
# unguarded, it would send one network request per query whenever the package
# (or any module inside it) is imported. Run the file directly to execute it.
if __name__ == '__main__':
    for query in test_list:
        print(query)
        print('\n')
        print('test bigmodel...')
        start_time = time.time()
        print(run_v4_sync(query))
        end_time = time.time()
        print(f"bigmodel time: {end_time - start_time}")
        print('\n')
        print('*' * 25)
|
31
core/connects/exa_search.py
Normal file
31
core/connects/exa_search.py
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
import httpx
|
||||||
|
|
||||||
|
# Headers passed with the Exa request in search_with_exa; the API key value is blank in this file.
headers = {"x-api-key": "", "Content-Type": "application/json"}
|
||||||
|
|
||||||
|
def search_with_exa(query: str) -> str:
    """POST a news search for *query* to the Exa search endpoint.

    The request asks for 5 results of category "news", restricted to a fixed
    crawl/publish window (2024-12-01 to 2025-01-21), with up to 1000 characters
    of tag-free text per result and live crawling set to "always".

    Args:
        query: Search text placed in the payload's "query" field.

    Returns:
        The response body as text, unparsed; the HTTP status is not inspected.
    """
    window_start = "2024-12-01T00:00:00.000Z"
    window_end = "2025-01-21T00:00:00.000Z"

    text_options = {"maxCharacters": 1000, "includeHtmlTags": False}
    contents = {"text": text_options, "livecrawl": "always"}

    # dict(...) keeps keyword order, so the JSON body matches a literal dict.
    payload = dict(
        query=query,
        useAutoprompt=True,
        type="auto",
        category="news",
        numResults=5,
        startCrawlDate=window_start,
        endCrawlDate=window_end,
        startPublishedDate=window_start,
        endPublishedDate=window_end,
        contents=contents,
    )

    return httpx.post(
        "https://api.exa.ai/search",
        json=payload,
        headers=headers,
        timeout=30,
    ).text
|
21
core/connects/rss_connect.py
Normal file
21
core/connects/rss_connect.py
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
import feedparser
|
||||||
|
from loguru import logger
|
||||||
|
import os, sys
|
||||||
|
import json
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
core_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')
|
||||||
|
sys.path.append(core_path)
|
||||||
|
|
||||||
|
from utils.general_utils import isURL
|
||||||
|
|
||||||
|
|
||||||
|
def get_links_from_rss(rss_url: str, existing_urls: set, _logger: logger = None) -> list[str]:
    """Collect the entry links of an RSS feed that are not already known.

    Args:
        rss_url: Location of the feed, handed to ``feedparser.parse``.
        existing_urls: Links to leave out of the result.
        _logger: Optional logger; when given, feed problems are reported
            through its ``warning`` method.

    Returns:
        The non-empty ``link`` of every feed entry that is not in
        ``existing_urls``, in feed order. An empty list when parsing raised.
    """
    try:
        feed = feedparser.parse(rss_url)
    except Exception as e:
        if _logger:
            _logger.warning(f"RSS feed is not valid: {e}")
        return []

    # feedparser normally signals a malformed or unreachable feed through the
    # ``bozo`` flag instead of raising, so report that case as well.
    if _logger and feed.get('bozo') and not feed.entries:
        _logger.warning(f"RSS feed is not valid: {feed.get('bozo_exception')}")

    # ``entry.get`` rather than ``entry.link``: attribute access raises
    # AttributeError on entries that carry no link at all.
    return [
        link
        for entry in feed.entries
        if (link := entry.get('link')) and link not in existing_urls
    ]
|
@ -6,3 +6,4 @@ pydantic
|
|||||||
beautifulsoup4
|
beautifulsoup4
|
||||||
requests
|
requests
|
||||||
crawl4ai==0.4.247
|
crawl4ai==0.4.247
|
||||||
|
feedparser==6.0.11
|
41
pb/pb_migrations/1737360418_updated_sites.js
Normal file
41
pb/pb_migrations/1737360418_updated_sites.js
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
/// <reference path="../pb_data/types.d.ts" />
|
||||||
|
migrate((app) => {
  // Up: drop two fields from collection pbc_2001081480, addressed by field id.
  const sites = app.findCollectionByNameOrId("pbc_2001081480")

  const droppedFieldIds = ["number1152796692", "bool806155165"]
  droppedFieldIds.forEach((fieldId) => {
    sites.fields.removeById(fieldId)
  })

  return app.save(sites)
}, (app) => {
  // Down: put the "per_hours" number field back at index 2 and the
  // "activated" bool field back at index 3.
  const sites = app.findCollectionByNameOrId("pbc_2001081480")

  const perHours = new Field({
    "hidden": false,
    "id": "number1152796692",
    "max": null,
    "min": null,
    "name": "per_hours",
    "onlyInt": false,
    "presentable": false,
    "required": false,
    "system": false,
    "type": "number"
  })
  sites.fields.addAt(2, perHours)

  const activated = new Field({
    "hidden": false,
    "id": "bool806155165",
    "name": "activated",
    "presentable": false,
    "required": false,
    "system": false,
    "type": "bool"
  })
  sites.fields.addAt(3, activated)

  return app.save(sites)
})
|
29
pb/pb_migrations/1737360470_updated_sites.js
Normal file
29
pb/pb_migrations/1737360470_updated_sites.js
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
/// <reference path="../pb_data/types.d.ts" />
|
||||||
|
migrate((app) => {
  // Up: insert a single-choice "type" select field ("web" or "rss") at index 2
  // of collection pbc_2001081480.
  const sites = app.findCollectionByNameOrId("pbc_2001081480")

  const typeField = new Field({
    "hidden": false,
    "id": "select2363381545",
    "maxSelect": 1,
    "name": "type",
    "presentable": false,
    "required": false,
    "system": false,
    "type": "select",
    "values": ["web", "rss"]
  })
  sites.fields.addAt(2, typeField)

  return app.save(sites)
}, (app) => {
  // Down: remove the select field again, addressed by its id.
  const sites = app.findCollectionByNameOrId("pbc_2001081480")
  sites.fields.removeById("select2363381545")
  return app.save(sites)
})
|
24
pb/pb_migrations/1737360517_updated_focus_points.js
Normal file
24
pb/pb_migrations/1737360517_updated_focus_points.js
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
/// <reference path="../pb_data/types.d.ts" />
|
||||||
|
migrate((app) => {
  // Up: drop the bool field with id bool806155165 from collection pbc_3385864241.
  const focusPoints = app.findCollectionByNameOrId("pbc_3385864241")
  focusPoints.fields.removeById("bool806155165")
  return app.save(focusPoints)
}, (app) => {
  // Down: put the "activated" bool field back at index 3.
  const focusPoints = app.findCollectionByNameOrId("pbc_3385864241")

  const activated = new Field({
    "hidden": false,
    "id": "bool806155165",
    "name": "activated",
    "presentable": false,
    "required": false,
    "system": false,
    "type": "bool"
  })
  focusPoints.fields.addAt(3, activated)

  return app.save(focusPoints)
})
|
78
pb/pb_migrations/1737360603_created_task.js
Normal file
78
pb/pb_migrations/1737360603_created_task.js
Normal file
@ -0,0 +1,78 @@
|
|||||||
|
/// <reference path="../pb_data/types.d.ts" />
|
||||||
|
migrate((app) => {
  // Up: create the base collection "task" (id pbc_1970519189) with all access
  // rules set to null and no indexes.

  // Field order here is the field order of the new collection.
  const taskFields = [
    {
      "autogeneratePattern": "[a-z0-9]{15}",
      "hidden": false,
      "id": "text3208210256",
      "max": 15,
      "min": 15,
      "name": "id",
      "pattern": "^[a-z0-9]+$",
      "presentable": false,
      "primaryKey": true,
      "required": true,
      "system": true,
      "type": "text"
    },
    {
      "hidden": false,
      "id": "bool806155165",
      "name": "activated",
      "presentable": false,
      "required": false,
      "system": false,
      "type": "bool"
    },
    {
      "hidden": false,
      "id": "number3171882809",
      "max": null,
      "min": null,
      "name": "per_hour",
      "onlyInt": false,
      "presentable": false,
      "required": true,
      "system": false,
      "type": "number"
    },
    {
      "hidden": false,
      "id": "autodate2990389176",
      "name": "created",
      "onCreate": true,
      "onUpdate": false,
      "presentable": false,
      "system": false,
      "type": "autodate"
    },
    {
      "hidden": false,
      "id": "autodate3332085495",
      "name": "updated",
      "onCreate": true,
      "onUpdate": true,
      "presentable": false,
      "system": false,
      "type": "autodate"
    }
  ];

  const task = new Collection({
    "createRule": null,
    "deleteRule": null,
    "fields": taskFields,
    "id": "pbc_1970519189",
    "indexes": [],
    "listRule": null,
    "name": "task",
    "system": false,
    "type": "base",
    "updateRule": null,
    "viewRule": null
  });

  return app.save(task);
}, (app) => {
  // Down: look the collection up by id and delete it.
  const task = app.findCollectionByNameOrId("pbc_1970519189");
  return app.delete(task);
})
|
65
pb/pb_migrations/1737360712_updated_task.js
Normal file
65
pb/pb_migrations/1737360712_updated_task.js
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
/// <reference path="../pb_data/types.d.ts" />
|
||||||
|
migrate((app) => {
  // Up: add two relation fields and one text field to collection
  // pbc_1970519189, at indexes 3, 4 and 5.
  const task = app.findCollectionByNameOrId("pbc_1970519189")

  // The two relation fields differ only in id, name and target collection.
  const relationField = (id, name, collectionId) => new Field({
    "cascadeDelete": false,
    "collectionId": collectionId,
    "hidden": false,
    "id": id,
    "maxSelect": 999,
    "minSelect": 0,
    "name": name,
    "presentable": false,
    "required": false,
    "system": false,
    "type": "relation"
  })

  task.fields.addAt(3, relationField("relation2655548471", "focus_points", "pbc_3385864241"))
  task.fields.addAt(4, relationField("relation3154160227", "sites", "pbc_2001081480"))

  const searchKeywords = new Field({
    "autogeneratePattern": "",
    "hidden": false,
    "id": "text2870082381",
    "max": 0,
    "min": 0,
    "name": "search_engine_keywords",
    "pattern": "",
    "presentable": false,
    "primaryKey": false,
    "required": false,
    "system": false,
    "type": "text"
  })
  task.fields.addAt(5, searchKeywords)

  return app.save(task)
}, (app) => {
  // Down: remove the three fields again, in the order they were added.
  const task = app.findCollectionByNameOrId("pbc_1970519189")

  const addedFieldIds = ["relation2655548471", "relation3154160227", "text2870082381"]
  addedFieldIds.forEach((fieldId) => {
    task.fields.removeById(fieldId)
  })

  return app.save(task)
})
|
@ -10,6 +10,9 @@ sys.path.append(core_path)
|
|||||||
from scrapers import *
|
from scrapers import *
|
||||||
from agents.get_info import pre_process
|
from agents.get_info import pre_process
|
||||||
|
|
||||||
|
|
||||||
|
save_dir = 'webpage_samples'
|
||||||
|
|
||||||
def check_url_text(text):
|
def check_url_text(text):
|
||||||
common_chars = ',.!;:,;:、一二三四五六七八九十#*@% \t\n\r|*-_…>#'
|
common_chars = ',.!;:,;:、一二三四五六七八九十#*@% \t\n\r|*-_…>#'
|
||||||
print(f"processing: {text}")
|
print(f"processing: {text}")
|
||||||
@ -118,7 +121,7 @@ if __name__ == '__main__':
|
|||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument('--test_file', '-F', type=str, default='')
|
parser.add_argument('--test_file', '-F', type=str, default='')
|
||||||
parser.add_argument('--sample_dir', '-D', type=str, default='')
|
parser.add_argument('--sample_dir', '-D', type=str, default='')
|
||||||
parser.add_argument('--record_folder', '-R', type=str, default='')
|
parser.add_argument('--record_folder', '-R', type=str, default=save_dir)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
test_file = args.test_file
|
test_file = args.test_file
|
||||||
|
Loading…
Reference in New Issue
Block a user