From 50332b1a093036c3cffafad0da7990237a942cd4 Mon Sep 17 00:00:00 2001 From: bigbrother666sh Date: Tue, 21 Jan 2025 22:46:45 +0800 Subject: [PATCH] rss and search study --- core/agents/get_info.py | 8 +- core/connects/__init__.py | 54 +++++++++++++ core/connects/exa_search.py | 31 ++++++++ core/connects/rss_connect.py | 21 +++++ core/requirements.txt | 3 +- pb/pb_migrations/1737360418_updated_sites.js | 41 ++++++++++ pb/pb_migrations/1737360470_updated_sites.js | 29 +++++++ .../1737360517_updated_focus_points.js | 24 ++++++ pb/pb_migrations/1737360603_created_task.js | 78 +++++++++++++++++++ pb/pb_migrations/1737360712_updated_task.js | 65 ++++++++++++++++ test/pre_process_test.py | 5 +- 11 files changed, 353 insertions(+), 6 deletions(-) create mode 100644 core/connects/__init__.py create mode 100644 core/connects/exa_search.py create mode 100644 core/connects/rss_connect.py create mode 100644 pb/pb_migrations/1737360418_updated_sites.js create mode 100644 pb/pb_migrations/1737360470_updated_sites.js create mode 100644 pb/pb_migrations/1737360517_updated_focus_points.js create mode 100644 pb/pb_migrations/1737360603_created_task.js create mode 100644 pb/pb_migrations/1737360712_updated_task.js diff --git a/core/agents/get_info.py b/core/agents/get_info.py index 1f5e033..53498b6 100644 --- a/core/agents/get_info.py +++ b/core/agents/get_info.py @@ -163,7 +163,7 @@ async def pre_process(raw_markdown: str, base_url: str, used_img: list[str], if ratio > 0.05: if test_mode: print('this is a navigation section, will be removed') - print(ratio) + print(ratio, '\n') print(section_remain) print('-' * 50) sections = sections[1:] @@ -172,7 +172,7 @@ async def pre_process(raw_markdown: str, base_url: str, used_img: list[str], section_remain_len = len(section_remain) if section_remain_len < 198: if test_mode: - print('this is a footer section, will be removed') + print('this is a footer section, will be removed\n') print(section_remain_len) print(section_remain) print('-' * 50) @@ -185,14 +185,14 @@ async def pre_process(raw_markdown: str, base_url: str, used_img: list[str], if ratio < 70: if test_mode: print('this is a links part') - print(ratio) + print(ratio, '\n') print(text) print('-' * 50) links_parts.append(text) else: if test_mode: print('this is a content part') - print(ratio) + print(ratio, '\n') print(text) print('-' * 50) contents.append(text) diff --git a/core/connects/__init__.py b/core/connects/__init__.py new file mode 100644 index 0000000..01b2024 --- /dev/null +++ b/core/connects/__init__.py @@ -0,0 +1,54 @@ +from exa_search import search_with_exa +import time +from pprint import pprint +import requests +import uuid + +api_key = '' + +def run_v4_sync(query: str): + msg = [ + { + "role": "user", + "content": query + } + ] + tool = "web-search-pro" + url = "https://open.bigmodel.cn/api/paas/v4/tools" + request_id = str(uuid.uuid4()) + data = { + "request_id": request_id, + "tool": tool, + "stream": False, + "messages": msg + } + + resp = requests.post( + url, + json=data, + headers={'Authorization': api_key}, + timeout=300 + ) + result = resp.json() + return result['choices'][0]['message'] + + +test_list = ['广东全省的台风预警——仅限2024年的信息', + '大模型技术突破与创新——包括新算法与模型,新的研究成果', + '事件图谱方面的知识', + '人工智能领军人物介绍', + '社区治理', + '新获批的氢能项目——60万吨级别以上', + '氢能项目招标信息——2024年12月以后', + '各地住宅网签最新数据——2025年1月6日以后'] + +for query in test_list: + print(query) + print('\n') + print('test bigmodel...') + start_time = time.time() + print(run_v4_sync(query)) + end_time = time.time() + print(f"bigmodel time: {end_time - start_time}") + print('\n') + print('*' * 25) \ No newline at end of file diff --git a/core/connects/exa_search.py b/core/connects/exa_search.py new file mode 100644 index 0000000..b2c218d --- /dev/null +++ b/core/connects/exa_search.py @@ -0,0 +1,31 @@ +import httpx + +headers = { + "x-api-key": "", + "Content-Type": "application/json" +} + +def search_with_exa(query: str) -> str: + url = "https://api.exa.ai/search" + + payload = { + "query": query, + "useAutoprompt": True, + "type": "auto", + "category": "news", + "numResults": 5, + "startCrawlDate": "2024-12-01T00:00:00.000Z", + "endCrawlDate": "2025-01-21T00:00:00.000Z", + "startPublishedDate": "2024-12-01T00:00:00.000Z", + "endPublishedDate": "2025-01-21T00:00:00.000Z", + "contents": { + "text": { + "maxCharacters": 1000, + "includeHtmlTags": False + }, + "livecrawl": "always", + } + } + + response = httpx.post(url, json=payload, headers=headers, timeout=30) + return response.text \ No newline at end of file diff --git a/core/connects/rss_connect.py b/core/connects/rss_connect.py new file mode 100644 index 0000000..ea6ab6c --- /dev/null +++ b/core/connects/rss_connect.py @@ -0,0 +1,21 @@ +import feedparser +from loguru import logger +import os, sys +import json +from urllib.parse import urlparse + +core_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..') +sys.path.append(core_path) + +from utils.general_utils import isURL + + +def get_links_from_rss(rss_url: str, existing_urls: set, _logger: logger = None) -> [str]: + try: + feed = feedparser.parse(rss_url) + except Exception as e: + if _logger: + _logger.warning(f"RSS feed is not valid: {e}") + return [] + + return [entry.link for entry in feed.entries if entry.link and entry.link not in existing_urls] diff --git a/core/requirements.txt b/core/requirements.txt index 9523478..2a2b933 100644 --- a/core/requirements.txt +++ b/core/requirements.txt @@ -5,4 +5,5 @@ pydantic #json_repair==0.* beautifulsoup4 requests -crawl4ai==0.4.247 \ No newline at end of file +crawl4ai==0.4.247 +feedparser==6.0.11 \ No newline at end of file diff --git a/pb/pb_migrations/1737360418_updated_sites.js b/pb/pb_migrations/1737360418_updated_sites.js new file mode 100644 index 0000000..8bbaa6f --- /dev/null +++ b/pb/pb_migrations/1737360418_updated_sites.js @@ -0,0 +1,41 @@ +/// +migrate((app) => { + const collection = app.findCollectionByNameOrId("pbc_2001081480") + + // remove field + collection.fields.removeById("number1152796692") + + // remove field + collection.fields.removeById("bool806155165") + + return app.save(collection) +}, (app) => { + const collection = app.findCollectionByNameOrId("pbc_2001081480") + + // add field + collection.fields.addAt(2, new Field({ + "hidden": false, + "id": "number1152796692", + "max": null, + "min": null, + "name": "per_hours", + "onlyInt": false, + "presentable": false, + "required": false, + "system": false, + "type": "number" + })) + + // add field + collection.fields.addAt(3, new Field({ + "hidden": false, + "id": "bool806155165", + "name": "activated", + "presentable": false, + "required": false, + "system": false, + "type": "bool" + })) + + return app.save(collection) +}) diff --git a/pb/pb_migrations/1737360470_updated_sites.js b/pb/pb_migrations/1737360470_updated_sites.js new file mode 100644 index 0000000..3a6707e --- /dev/null +++ b/pb/pb_migrations/1737360470_updated_sites.js @@ -0,0 +1,29 @@ +/// +migrate((app) => { + const collection = app.findCollectionByNameOrId("pbc_2001081480") + + // add field + collection.fields.addAt(2, new Field({ + "hidden": false, + "id": "select2363381545", + "maxSelect": 1, + "name": "type", + "presentable": false, + "required": false, + "system": false, + "type": "select", + "values": [ + "web", + "rss" + ] + })) + + return app.save(collection) +}, (app) => { + const collection = app.findCollectionByNameOrId("pbc_2001081480") + + // remove field + collection.fields.removeById("select2363381545") + + return app.save(collection) +}) diff --git a/pb/pb_migrations/1737360517_updated_focus_points.js b/pb/pb_migrations/1737360517_updated_focus_points.js new file mode 100644 index 0000000..d7581a1 --- /dev/null +++ b/pb/pb_migrations/1737360517_updated_focus_points.js @@ -0,0 +1,24 @@ +/// +migrate((app) => { + const collection = app.findCollectionByNameOrId("pbc_3385864241") + + // remove field + collection.fields.removeById("bool806155165") + + return app.save(collection) +}, (app) => { + const collection = app.findCollectionByNameOrId("pbc_3385864241") + + // add field + collection.fields.addAt(3, new Field({ + "hidden": false, + "id": "bool806155165", + "name": "activated", + "presentable": false, + "required": false, + "system": false, + "type": "bool" + })) + + return app.save(collection) +}) diff --git a/pb/pb_migrations/1737360603_created_task.js b/pb/pb_migrations/1737360603_created_task.js new file mode 100644 index 0000000..0f27d7d --- /dev/null +++ b/pb/pb_migrations/1737360603_created_task.js @@ -0,0 +1,78 @@ +/// +migrate((app) => { + const collection = new Collection({ + "createRule": null, + "deleteRule": null, + "fields": [ + { + "autogeneratePattern": "[a-z0-9]{15}", + "hidden": false, + "id": "text3208210256", + "max": 15, + "min": 15, + "name": "id", + "pattern": "^[a-z0-9]+$", + "presentable": false, + "primaryKey": true, + "required": true, + "system": true, + "type": "text" + }, + { + "hidden": false, + "id": "bool806155165", + "name": "activated", + "presentable": false, + "required": false, + "system": false, + "type": "bool" + }, + { + "hidden": false, + "id": "number3171882809", + "max": null, + "min": null, + "name": "per_hour", + "onlyInt": false, + "presentable": false, + "required": true, + "system": false, + "type": "number" + }, + { + "hidden": false, + "id": "autodate2990389176", + "name": "created", + "onCreate": true, + "onUpdate": false, + "presentable": false, + "system": false, + "type": "autodate" + }, + { + "hidden": false, + "id": "autodate3332085495", + "name": "updated", + "onCreate": true, + "onUpdate": true, + "presentable": false, + "system": false, + "type": "autodate" + } + ], + "id": "pbc_1970519189", + "indexes": [], + "listRule": null, + "name": "task", + "system": false, + "type": "base", + "updateRule": null, + "viewRule": null + }); + + return app.save(collection); +}, (app) => { + const collection = app.findCollectionByNameOrId("pbc_1970519189"); + + return app.delete(collection); +}) diff --git a/pb/pb_migrations/1737360712_updated_task.js b/pb/pb_migrations/1737360712_updated_task.js new file mode 100644 index 0000000..084c86e --- /dev/null +++ b/pb/pb_migrations/1737360712_updated_task.js @@ -0,0 +1,65 @@ +/// +migrate((app) => { + const collection = app.findCollectionByNameOrId("pbc_1970519189") + + // add field + collection.fields.addAt(3, new Field({ + "cascadeDelete": false, + "collectionId": "pbc_3385864241", + "hidden": false, + "id": "relation2655548471", + "maxSelect": 999, + "minSelect": 0, + "name": "focus_points", + "presentable": false, + "required": false, + "system": false, + "type": "relation" + })) + + // add field + collection.fields.addAt(4, new Field({ + "cascadeDelete": false, + "collectionId": "pbc_2001081480", + "hidden": false, + "id": "relation3154160227", + "maxSelect": 999, + "minSelect": 0, + "name": "sites", + "presentable": false, + "required": false, + "system": false, + "type": "relation" + })) + + // add field + collection.fields.addAt(5, new Field({ + "autogeneratePattern": "", + "hidden": false, + "id": "text2870082381", + "max": 0, + "min": 0, + "name": "search_engine_keywords", + "pattern": "", + "presentable": false, + "primaryKey": false, + "required": false, + "system": false, + "type": "text" + })) + + return app.save(collection) +}, (app) => { + const collection = app.findCollectionByNameOrId("pbc_1970519189") + + // remove field + collection.fields.removeById("relation2655548471") + + // remove field + collection.fields.removeById("relation3154160227") + + // remove field + collection.fields.removeById("text2870082381") + + return app.save(collection) +}) diff --git a/test/pre_process_test.py b/test/pre_process_test.py index bf969f8..881a4b8 100644 --- a/test/pre_process_test.py +++ b/test/pre_process_test.py @@ -10,6 +10,9 @@ sys.path.append(core_path) from scrapers import * from agents.get_info import pre_process + +save_dir = 'webpage_samples' + def check_url_text(text): common_chars = ',.!;:,;:、一二三四五六七八九十#*@% \t\n\r|*-_…>#' print(f"processing: {text}") @@ -118,7 +121,7 @@ if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--test_file', '-F', type=str, default='') parser.add_argument('--sample_dir', '-D', type=str, default='') - parser.add_argument('--record_folder', '-R', type=str, default='') + parser.add_argument('--record_folder', '-R', type=str, default=save_dir) args = parser.parse_args() test_file = args.test_file