diff --git a/core/agents/get_info.py b/core/agents/get_info.py
index 1f5e033..53498b6 100644
--- a/core/agents/get_info.py
+++ b/core/agents/get_info.py
@@ -163,7 +163,7 @@ async def pre_process(raw_markdown: str, base_url: str, used_img: list[str],
if ratio > 0.05:
if test_mode:
print('this is a navigation section, will be removed')
- print(ratio)
+ print(ratio, '\n')
print(section_remain)
print('-' * 50)
sections = sections[1:]
@@ -172,7 +172,7 @@ async def pre_process(raw_markdown: str, base_url: str, used_img: list[str],
section_remain_len = len(section_remain)
if section_remain_len < 198:
if test_mode:
- print('this is a footer section, will be removed')
+ print('this is a footer section, will be removed\n')
print(section_remain_len)
print(section_remain)
print('-' * 50)
@@ -185,14 +185,14 @@ async def pre_process(raw_markdown: str, base_url: str, used_img: list[str],
if ratio < 70:
if test_mode:
print('this is a links part')
- print(ratio)
+ print(ratio, '\n')
print(text)
print('-' * 50)
links_parts.append(text)
else:
if test_mode:
print('this is a content part')
- print(ratio)
+ print(ratio, '\n')
print(text)
print('-' * 50)
contents.append(text)
diff --git a/core/connects/__init__.py b/core/connects/__init__.py
new file mode 100644
index 0000000..01b2024
--- /dev/null
+++ b/core/connects/__init__.py
@@ -0,0 +1,54 @@
import time
import uuid
from pprint import pprint

import requests

from exa_search import search_with_exa
+
+api_key = ''
+
+def run_v4_sync(query: str):
+ msg = [
+ {
+ "role": "user",
+ "content": query
+ }
+ ]
+ tool = "web-search-pro"
+ url = "https://open.bigmodel.cn/api/paas/v4/tools"
+ request_id = str(uuid.uuid4())
+ data = {
+ "request_id": request_id,
+ "tool": tool,
+ "stream": False,
+ "messages": msg
+ }
+
+ resp = requests.post(
+ url,
+ json=data,
+ headers={'Authorization': api_key},
+ timeout=300
+ )
+ result = resp.json()
+ return result['choices'][0]['message']
+
+
+test_list = ['广东全省的台风预警——仅限2024年的信息',
+ '大模型技术突破与创新——包括新算法与模型,新的研究成果',
+ '事件图谱方面的知识',
+ '人工智能领军人物介绍',
+ '社区治理',
+ '新获批的氢能项目——60万吨级别以上',
+ '氢能项目招标信息——2024年12月以后',
+ '各地住宅网签最新数据——2025年1月6日以后']
+
+for query in test_list:
+ print(query)
+ print('\n')
+ print('test bigmodel...')
+ start_time = time.time()
+ print(run_v4_sync(query))
+ end_time = time.time()
+ print(f"bigmodel time: {end_time - start_time}")
+ print('\n')
+ print('*' * 25)
\ No newline at end of file
diff --git a/core/connects/exa_search.py b/core/connects/exa_search.py
new file mode 100644
index 0000000..b2c218d
--- /dev/null
+++ b/core/connects/exa_search.py
@@ -0,0 +1,31 @@
+import httpx
+
+headers = {
+ "x-api-key": "",
+ "Content-Type": "application/json"
+}
+
+def search_with_exa(query: str) -> str:
+ url = "https://api.exa.ai/search"
+
+ payload = {
+ "query": query,
+ "useAutoprompt": True,
+ "type": "auto",
+ "category": "news",
+ "numResults": 5,
+ "startCrawlDate": "2024-12-01T00:00:00.000Z",
+ "endCrawlDate": "2025-01-21T00:00:00.000Z",
+ "startPublishedDate": "2024-12-01T00:00:00.000Z",
+ "endPublishedDate": "2025-01-21T00:00:00.000Z",
+ "contents": {
+ "text": {
+ "maxCharacters": 1000,
+ "includeHtmlTags": False
+ },
+ "livecrawl": "always",
+ }
+ }
+
+ response = httpx.post(url, json=payload, headers=headers, timeout=30)
+ return response.text
\ No newline at end of file
diff --git a/core/connects/rss_connect.py b/core/connects/rss_connect.py
new file mode 100644
index 0000000..ea6ab6c
--- /dev/null
+++ b/core/connects/rss_connect.py
@@ -0,0 +1,21 @@
+import feedparser
+from loguru import logger
+import os, sys
+import json
+from urllib.parse import urlparse
+
+core_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')
+sys.path.append(core_path)
+
+from utils.general_utils import isURL
+
+
+def get_links_from_rss(rss_url: str, existing_urls: set, _logger: logger = None) -> [str]:
+ try:
+ feed = feedparser.parse(rss_url)
+ except Exception as e:
+ if _logger:
+ _logger.warning(f"RSS feed is not valid: {e}")
+ return []
+
+ return [entry.link for entry in feed.entries if entry.link and entry.link not in existing_urls]
diff --git a/core/requirements.txt b/core/requirements.txt
index 9523478..2a2b933 100644
--- a/core/requirements.txt
+++ b/core/requirements.txt
@@ -5,4 +5,5 @@ pydantic
#json_repair==0.*
beautifulsoup4
requests
-crawl4ai==0.4.247
\ No newline at end of file
+crawl4ai==0.4.247
+feedparser==6.0.11
\ No newline at end of file
diff --git a/pb/pb_migrations/1737360418_updated_sites.js b/pb/pb_migrations/1737360418_updated_sites.js
new file mode 100644
index 0000000..8bbaa6f
--- /dev/null
+++ b/pb/pb_migrations/1737360418_updated_sites.js
@@ -0,0 +1,41 @@
+///
+migrate((app) => {
+ const collection = app.findCollectionByNameOrId("pbc_2001081480")
+
+ // remove field
+ collection.fields.removeById("number1152796692")
+
+ // remove field
+ collection.fields.removeById("bool806155165")
+
+ return app.save(collection)
+}, (app) => {
+ const collection = app.findCollectionByNameOrId("pbc_2001081480")
+
+ // add field
+ collection.fields.addAt(2, new Field({
+ "hidden": false,
+ "id": "number1152796692",
+ "max": null,
+ "min": null,
+ "name": "per_hours",
+ "onlyInt": false,
+ "presentable": false,
+ "required": false,
+ "system": false,
+ "type": "number"
+ }))
+
+ // add field
+ collection.fields.addAt(3, new Field({
+ "hidden": false,
+ "id": "bool806155165",
+ "name": "activated",
+ "presentable": false,
+ "required": false,
+ "system": false,
+ "type": "bool"
+ }))
+
+ return app.save(collection)
+})
diff --git a/pb/pb_migrations/1737360470_updated_sites.js b/pb/pb_migrations/1737360470_updated_sites.js
new file mode 100644
index 0000000..3a6707e
--- /dev/null
+++ b/pb/pb_migrations/1737360470_updated_sites.js
@@ -0,0 +1,29 @@
+///
+migrate((app) => {
+ const collection = app.findCollectionByNameOrId("pbc_2001081480")
+
+ // add field
+ collection.fields.addAt(2, new Field({
+ "hidden": false,
+ "id": "select2363381545",
+ "maxSelect": 1,
+ "name": "type",
+ "presentable": false,
+ "required": false,
+ "system": false,
+ "type": "select",
+ "values": [
+ "web",
+ "rss"
+ ]
+ }))
+
+ return app.save(collection)
+}, (app) => {
+ const collection = app.findCollectionByNameOrId("pbc_2001081480")
+
+ // remove field
+ collection.fields.removeById("select2363381545")
+
+ return app.save(collection)
+})
diff --git a/pb/pb_migrations/1737360517_updated_focus_points.js b/pb/pb_migrations/1737360517_updated_focus_points.js
new file mode 100644
index 0000000..d7581a1
--- /dev/null
+++ b/pb/pb_migrations/1737360517_updated_focus_points.js
@@ -0,0 +1,24 @@
+///
+migrate((app) => {
+ const collection = app.findCollectionByNameOrId("pbc_3385864241")
+
+ // remove field
+ collection.fields.removeById("bool806155165")
+
+ return app.save(collection)
+}, (app) => {
+ const collection = app.findCollectionByNameOrId("pbc_3385864241")
+
+ // add field
+ collection.fields.addAt(3, new Field({
+ "hidden": false,
+ "id": "bool806155165",
+ "name": "activated",
+ "presentable": false,
+ "required": false,
+ "system": false,
+ "type": "bool"
+ }))
+
+ return app.save(collection)
+})
diff --git a/pb/pb_migrations/1737360603_created_task.js b/pb/pb_migrations/1737360603_created_task.js
new file mode 100644
index 0000000..0f27d7d
--- /dev/null
+++ b/pb/pb_migrations/1737360603_created_task.js
@@ -0,0 +1,78 @@
+///
+migrate((app) => {
+ const collection = new Collection({
+ "createRule": null,
+ "deleteRule": null,
+ "fields": [
+ {
+ "autogeneratePattern": "[a-z0-9]{15}",
+ "hidden": false,
+ "id": "text3208210256",
+ "max": 15,
+ "min": 15,
+ "name": "id",
+ "pattern": "^[a-z0-9]+$",
+ "presentable": false,
+ "primaryKey": true,
+ "required": true,
+ "system": true,
+ "type": "text"
+ },
+ {
+ "hidden": false,
+ "id": "bool806155165",
+ "name": "activated",
+ "presentable": false,
+ "required": false,
+ "system": false,
+ "type": "bool"
+ },
+ {
+ "hidden": false,
+ "id": "number3171882809",
+ "max": null,
+ "min": null,
+ "name": "per_hour",
+ "onlyInt": false,
+ "presentable": false,
+ "required": true,
+ "system": false,
+ "type": "number"
+ },
+ {
+ "hidden": false,
+ "id": "autodate2990389176",
+ "name": "created",
+ "onCreate": true,
+ "onUpdate": false,
+ "presentable": false,
+ "system": false,
+ "type": "autodate"
+ },
+ {
+ "hidden": false,
+ "id": "autodate3332085495",
+ "name": "updated",
+ "onCreate": true,
+ "onUpdate": true,
+ "presentable": false,
+ "system": false,
+ "type": "autodate"
+ }
+ ],
+ "id": "pbc_1970519189",
+ "indexes": [],
+ "listRule": null,
+ "name": "task",
+ "system": false,
+ "type": "base",
+ "updateRule": null,
+ "viewRule": null
+ });
+
+ return app.save(collection);
+}, (app) => {
+ const collection = app.findCollectionByNameOrId("pbc_1970519189");
+
+ return app.delete(collection);
+})
diff --git a/pb/pb_migrations/1737360712_updated_task.js b/pb/pb_migrations/1737360712_updated_task.js
new file mode 100644
index 0000000..084c86e
--- /dev/null
+++ b/pb/pb_migrations/1737360712_updated_task.js
@@ -0,0 +1,65 @@
+///
+migrate((app) => {
+ const collection = app.findCollectionByNameOrId("pbc_1970519189")
+
+ // add field
+ collection.fields.addAt(3, new Field({
+ "cascadeDelete": false,
+ "collectionId": "pbc_3385864241",
+ "hidden": false,
+ "id": "relation2655548471",
+ "maxSelect": 999,
+ "minSelect": 0,
+ "name": "focus_points",
+ "presentable": false,
+ "required": false,
+ "system": false,
+ "type": "relation"
+ }))
+
+ // add field
+ collection.fields.addAt(4, new Field({
+ "cascadeDelete": false,
+ "collectionId": "pbc_2001081480",
+ "hidden": false,
+ "id": "relation3154160227",
+ "maxSelect": 999,
+ "minSelect": 0,
+ "name": "sites",
+ "presentable": false,
+ "required": false,
+ "system": false,
+ "type": "relation"
+ }))
+
+ // add field
+ collection.fields.addAt(5, new Field({
+ "autogeneratePattern": "",
+ "hidden": false,
+ "id": "text2870082381",
+ "max": 0,
+ "min": 0,
+ "name": "search_engine_keywords",
+ "pattern": "",
+ "presentable": false,
+ "primaryKey": false,
+ "required": false,
+ "system": false,
+ "type": "text"
+ }))
+
+ return app.save(collection)
+}, (app) => {
+ const collection = app.findCollectionByNameOrId("pbc_1970519189")
+
+ // remove field
+ collection.fields.removeById("relation2655548471")
+
+ // remove field
+ collection.fields.removeById("relation3154160227")
+
+ // remove field
+ collection.fields.removeById("text2870082381")
+
+ return app.save(collection)
+})
diff --git a/test/pre_process_test.py b/test/pre_process_test.py
index bf969f8..881a4b8 100644
--- a/test/pre_process_test.py
+++ b/test/pre_process_test.py
@@ -10,6 +10,9 @@ sys.path.append(core_path)
from scrapers import *
from agents.get_info import pre_process
+
+save_dir = 'webpage_samples'
+
def check_url_text(text):
common_chars = ',.!;:,;:、一二三四五六七八九十#*@% \t\n\r|*-_…>#'
print(f"processing: {text}")
@@ -118,7 +121,7 @@ if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--test_file', '-F', type=str, default='')
parser.add_argument('--sample_dir', '-D', type=str, default='')
- parser.add_argument('--record_folder', '-R', type=str, default='')
+ parser.add_argument('--record_folder', '-R', type=str, default=save_dir)
args = parser.parse_args()
test_file = args.test_file