rss and search study

bigbrother666sh 2025-01-21 22:46:45 +08:00
parent 7ac3b6f23e
commit 50332b1a09
11 changed files with 353 additions and 6 deletions

View File

@@ -163,7 +163,7 @@ async def pre_process(raw_markdown: str, base_url: str, used_img: list[str],
     if ratio > 0.05:
         if test_mode:
             print('this is a navigation section, will be removed')
-            print(ratio)
+            print(ratio, '\n')
             print(section_remain)
             print('-' * 50)
         sections = sections[1:]
@@ -172,7 +172,7 @@ async def pre_process(raw_markdown: str, base_url: str, used_img: list[str],
     section_remain_len = len(section_remain)
     if section_remain_len < 198:
         if test_mode:
-            print('this is a footer section, will be removed')
+            print('this is a footer section, will be removed\n')
             print(section_remain_len)
             print(section_remain)
             print('-' * 50)
@@ -185,14 +185,14 @@ async def pre_process(raw_markdown: str, base_url: str, used_img: list[str],
     if ratio < 70:
         if test_mode:
             print('this is a links part')
-            print(ratio)
+            print(ratio, '\n')
             print(text)
             print('-' * 50)
         links_parts.append(text)
     else:
         if test_mode:
             print('this is a content part')
-            print(ratio)
+            print(ratio, '\n')
             print(text)
             print('-' * 50)
         contents.append(text)

core/connects/__init__.py (new file, 54 lines)

@@ -0,0 +1,54 @@
from exa_search import search_with_exa
import time
from pprint import pprint
import requests
import uuid

# NOTE: fill in a valid BigModel (Zhipu) API key before running this study.
api_key = ''


def run_v4_sync(query: str):
    """Query Zhipu's web-search-pro tool endpoint synchronously."""
    msg = [
        {
            "role": "user",
            "content": query
        }
    ]
    tool = "web-search-pro"
    url = "https://open.bigmodel.cn/api/paas/v4/tools"
    request_id = str(uuid.uuid4())
    data = {
        "request_id": request_id,
        "tool": tool,
        "stream": False,
        "messages": msg
    }
    resp = requests.post(
        url,
        json=data,
        headers={'Authorization': api_key},
        timeout=300
    )
    resp.raise_for_status()  # fail fast on auth or quota errors
    result = resp.json()
    return result['choices'][0]['message']


test_list = ['广东全省的台风预警——仅限2024年的信息',  # typhoon warnings across Guangdong Province, 2024 only
             '大模型技术突破与创新——包括新算法与模型,新的研究成果',  # LLM breakthroughs and innovation: new algorithms, models, research results
             '事件图谱方面的知识',  # knowledge about event graphs
             '人工智能领军人物介绍',  # profiles of leading AI figures
             '社区治理',  # community governance
             '新获批的氢能项目——60万吨级别以上',  # newly approved hydrogen projects, 600,000-ton scale and above
             '氢能项目招标信息——2024年12月以后',  # hydrogen project tender information after December 2024
             '各地住宅网签最新数据——2025年1月6日以后']  # latest regional residential online-signing data after 2025-01-06

for query in test_list:
    print(query)
    print('\n')
    print('test bigmodel...')
    start_time = time.time()
    print(run_v4_sync(query))
    end_time = time.time()
    print(f"bigmodel time: {end_time - start_time}")
    print('\n')
    print('*' * 25)
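
The search_with_exa import above goes unused in the loop; the natural next step for this study is timing both search backends on the same queries. A minimal side-by-side sketch, assuming valid API keys are configured for both modules:

for query in test_list:
    for name, fn in (('bigmodel', run_v4_sync), ('exa', search_with_exa)):
        start = time.time()
        try:
            result = fn(query)
        except Exception as e:  # a network or auth failure should not abort the whole study
            result = f"error: {e}"
        # truncate so long result bodies don't flood the console
        print(f"{name} ({time.time() - start:.2f}s): {str(result)[:200]}")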

View File

@@ -0,0 +1,31 @@
import httpx

# NOTE: fill in a valid Exa API key before running.
headers = {
    "x-api-key": "",
    "Content-Type": "application/json"
}


def search_with_exa(query: str) -> str:
    """POST a news search to Exa and return the raw JSON body as a string."""
    url = "https://api.exa.ai/search"
    payload = {
        "query": query,
        "useAutoprompt": True,
        "type": "auto",
        "category": "news",
        "numResults": 5,
        "startCrawlDate": "2024-12-01T00:00:00.000Z",
        "endCrawlDate": "2025-01-21T00:00:00.000Z",
        "startPublishedDate": "2024-12-01T00:00:00.000Z",
        "endPublishedDate": "2025-01-21T00:00:00.000Z",
        "contents": {
            "text": {
                "maxCharacters": 1000,
                "includeHtmlTags": False
            },
            "livecrawl": "always"
        }
    }
    response = httpx.post(url, json=payload, headers=headers, timeout=30)
    response.raise_for_status()  # surface auth/quota errors instead of returning an error body
    return response.text
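
search_with_exa returns the raw response body as a string, so callers must decode it themselves. A minimal parsing sketch; the results/title/url/text field names follow Exa's documented response shape and should be treated as an assumption here:

import json

def parse_exa_results(raw: str) -> list[dict]:
    # pull out only the fields this study cares about from the raw JSON string
    data = json.loads(raw)
    return [{'title': r.get('title'), 'url': r.get('url'), 'text': r.get('text')}
            for r in data.get('results', [])]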

View File

@@ -0,0 +1,21 @@
import os
import sys

import feedparser

core_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')
sys.path.append(core_path)

from utils.general_utils import isURL


def get_links_from_rss(rss_url: str, existing_urls: set, _logger=None) -> list[str]:
    """Parse an RSS feed and return entry links not already in existing_urls."""
    try:
        feed = feedparser.parse(rss_url)
    except Exception as e:
        if _logger:
            _logger.warning(f"RSS feed is not valid: {e}")
        return []
    # keep only entries that carry a valid, previously unseen URL
    return [entry.link for entry in feed.entries
            if entry.get('link') and isURL(entry.link) and entry.link not in existing_urls]
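
Callers are expected to keep their own deduplication set across polls. A minimal usage sketch (the feed URL is a placeholder):

seen = set()
new_links = get_links_from_rss('https://example.com/feed.xml', seen)
seen.update(new_links)
print(f'{len(new_links)} new links to fetch')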

View File

@@ -5,4 +5,5 @@ pydantic
 #json_repair==0.*
 beautifulsoup4
 requests
-crawl4ai==0.4.247
+crawl4ai==0.4.247
+feedparser==6.0.11

View File

@@ -0,0 +1,41 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((app) => {
  const collection = app.findCollectionByNameOrId("pbc_2001081480")

  // remove field
  collection.fields.removeById("number1152796692")

  // remove field
  collection.fields.removeById("bool806155165")

  return app.save(collection)
}, (app) => {
  const collection = app.findCollectionByNameOrId("pbc_2001081480")

  // add field
  collection.fields.addAt(2, new Field({
    "hidden": false,
    "id": "number1152796692",
    "max": null,
    "min": null,
    "name": "per_hours",
    "onlyInt": false,
    "presentable": false,
    "required": false,
    "system": false,
    "type": "number"
  }))

  // add field
  collection.fields.addAt(3, new Field({
    "hidden": false,
    "id": "bool806155165",
    "name": "activated",
    "presentable": false,
    "required": false,
    "system": false,
    "type": "bool"
  }))

  return app.save(collection)
})

View File

@@ -0,0 +1,29 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((app) => {
  const collection = app.findCollectionByNameOrId("pbc_2001081480")

  // add field
  collection.fields.addAt(2, new Field({
    "hidden": false,
    "id": "select2363381545",
    "maxSelect": 1,
    "name": "type",
    "presentable": false,
    "required": false,
    "system": false,
    "type": "select",
    "values": [
      "web",
      "rss"
    ]
  }))

  return app.save(collection)
}, (app) => {
  const collection = app.findCollectionByNameOrId("pbc_2001081480")

  // remove field
  collection.fields.removeById("select2363381545")

  return app.save(collection)
})

View File

@@ -0,0 +1,24 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((app) => {
  const collection = app.findCollectionByNameOrId("pbc_3385864241")

  // remove field
  collection.fields.removeById("bool806155165")

  return app.save(collection)
}, (app) => {
  const collection = app.findCollectionByNameOrId("pbc_3385864241")

  // add field
  collection.fields.addAt(3, new Field({
    "hidden": false,
    "id": "bool806155165",
    "name": "activated",
    "presentable": false,
    "required": false,
    "system": false,
    "type": "bool"
  }))

  return app.save(collection)
})

View File

@@ -0,0 +1,78 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((app) => {
  const collection = new Collection({
    "createRule": null,
    "deleteRule": null,
    "fields": [
      {
        "autogeneratePattern": "[a-z0-9]{15}",
        "hidden": false,
        "id": "text3208210256",
        "max": 15,
        "min": 15,
        "name": "id",
        "pattern": "^[a-z0-9]+$",
        "presentable": false,
        "primaryKey": true,
        "required": true,
        "system": true,
        "type": "text"
      },
      {
        "hidden": false,
        "id": "bool806155165",
        "name": "activated",
        "presentable": false,
        "required": false,
        "system": false,
        "type": "bool"
      },
      {
        "hidden": false,
        "id": "number3171882809",
        "max": null,
        "min": null,
        "name": "per_hour",
        "onlyInt": false,
        "presentable": false,
        "required": true,
        "system": false,
        "type": "number"
      },
      {
        "hidden": false,
        "id": "autodate2990389176",
        "name": "created",
        "onCreate": true,
        "onUpdate": false,
        "presentable": false,
        "system": false,
        "type": "autodate"
      },
      {
        "hidden": false,
        "id": "autodate3332085495",
        "name": "updated",
        "onCreate": true,
        "onUpdate": true,
        "presentable": false,
        "system": false,
        "type": "autodate"
      }
    ],
    "id": "pbc_1970519189",
    "indexes": [],
    "listRule": null,
    "name": "task",
    "system": false,
    "type": "base",
    "updateRule": null,
    "viewRule": null
  });

  return app.save(collection);
}, (app) => {
  const collection = app.findCollectionByNameOrId("pbc_1970519189");

  return app.delete(collection);
})

View File

@@ -0,0 +1,65 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((app) => {
  const collection = app.findCollectionByNameOrId("pbc_1970519189")

  // add field
  collection.fields.addAt(3, new Field({
    "cascadeDelete": false,
    "collectionId": "pbc_3385864241",
    "hidden": false,
    "id": "relation2655548471",
    "maxSelect": 999,
    "minSelect": 0,
    "name": "focus_points",
    "presentable": false,
    "required": false,
    "system": false,
    "type": "relation"
  }))

  // add field
  collection.fields.addAt(4, new Field({
    "cascadeDelete": false,
    "collectionId": "pbc_2001081480",
    "hidden": false,
    "id": "relation3154160227",
    "maxSelect": 999,
    "minSelect": 0,
    "name": "sites",
    "presentable": false,
    "required": false,
    "system": false,
    "type": "relation"
  }))

  // add field
  collection.fields.addAt(5, new Field({
    "autogeneratePattern": "",
    "hidden": false,
    "id": "text2870082381",
    "max": 0,
    "min": 0,
    "name": "search_engine_keywords",
    "pattern": "",
    "presentable": false,
    "primaryKey": false,
    "required": false,
    "system": false,
    "type": "text"
  }))

  return app.save(collection)
}, (app) => {
  const collection = app.findCollectionByNameOrId("pbc_1970519189")

  // remove field
  collection.fields.removeById("relation2655548471")

  // remove field
  collection.fields.removeById("relation3154160227")

  // remove field
  collection.fields.removeById("text2870082381")

  return app.save(collection)
})
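
Taken together, the migrations above leave the new task collection with an activated flag, a per_hour frequency, relations to the focus-point (pbc_3385864241) and site (pbc_2001081480) collections, and a free-text search_engine_keywords field. A hedged sketch of how a scheduler might read active tasks through PocketBase's records API; the base URL is a placeholder and authentication is omitted:

import requests

PB_BASE = 'http://127.0.0.1:8090'  # placeholder PocketBase address

def fetch_active_tasks() -> list[dict]:
    # list task records, expanding the relation fields defined in the migrations
    resp = requests.get(
        f'{PB_BASE}/api/collections/task/records',
        params={'filter': '(activated=true)', 'expand': 'focus_points,sites'},
        timeout=10,
    )
    resp.raise_for_status()
    return resp.json().get('items', [])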

View File

@@ -10,6 +10,9 @@ sys.path.append(core_path)
 from scrapers import *
 from agents.get_info import pre_process
+save_dir = 'webpage_samples'

 def check_url_text(text):
     common_chars = ',.!;:,;:、一二三四五六七八九十#*@% \t\n\r|*-_…>#'
     print(f"processing: {text}")
@@ -118,7 +121,7 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('--test_file', '-F', type=str, default='')
     parser.add_argument('--sample_dir', '-D', type=str, default='')
-    parser.add_argument('--record_folder', '-R', type=str, default='')
+    parser.add_argument('--record_folder', '-R', type=str, default=save_dir)
     args = parser.parse_args()
     test_file = args.test_file