rss and search study

bigbrother666sh 2025-01-21 22:46:45 +08:00
parent 7ac3b6f23e
commit 50332b1a09
11 changed files with 353 additions and 6 deletions

View File

@@ -163,7 +163,7 @@ async def pre_process(raw_markdown: str, base_url: str, used_img: list[str],
     if ratio > 0.05:
         if test_mode:
             print('this is a navigation section, will be removed')
-            print(ratio)
+            print(ratio, '\n')
             print(section_remain)
             print('-' * 50)
         sections = sections[1:]
@@ -172,7 +172,7 @@ async def pre_process(raw_markdown: str, base_url: str, used_img: list[str],
     section_remain_len = len(section_remain)
     if section_remain_len < 198:
         if test_mode:
-            print('this is a footer section, will be removed')
+            print('this is a footer section, will be removed\n')
             print(section_remain_len)
             print(section_remain)
             print('-' * 50)
@@ -185,14 +185,14 @@ async def pre_process(raw_markdown: str, base_url: str, used_img: list[str],
     if ratio < 70:
         if test_mode:
             print('this is a links part')
-            print(ratio)
+            print(ratio, '\n')
             print(text)
             print('-' * 50)
         links_parts.append(text)
     else:
         if test_mode:
             print('this is a content part')
-            print(ratio)
+            print(ratio, '\n')
             print(text)
             print('-' * 50)
         contents.append(text)

core/connects/__init__.py (new file, 54 lines)

@@ -0,0 +1,54 @@
from exa_search import search_with_exa
import time
from pprint import pprint
import requests
import uuid

# NOTE: fill in a valid BigModel (Zhipu) API key before running this study.
api_key = ''


def run_v4_sync(query: str):
    """Query Zhipu's web-search-pro tool endpoint synchronously."""
    msg = [
        {
            "role": "user",
            "content": query
        }
    ]
    tool = "web-search-pro"
    url = "https://open.bigmodel.cn/api/paas/v4/tools"
    request_id = str(uuid.uuid4())
    data = {
        "request_id": request_id,
        "tool": tool,
        "stream": False,
        "messages": msg
    }
    resp = requests.post(
        url,
        json=data,
        headers={'Authorization': api_key},
        timeout=300
    )
    resp.raise_for_status()  # fail fast on auth or quota errors
    result = resp.json()
    return result['choices'][0]['message']


test_list = ['广东全省的台风预警——仅限2024年的信息',  # typhoon warnings across Guangdong Province, 2024 only
             '大模型技术突破与创新——包括新算法与模型,新的研究成果',  # LLM breakthroughs and innovation: new algorithms, models, research results
             '事件图谱方面的知识',  # knowledge about event graphs
             '人工智能领军人物介绍',  # profiles of leading AI figures
             '社区治理',  # community governance
             '新获批的氢能项目——60万吨级别以上',  # newly approved hydrogen projects, 600,000-ton scale and above
             '氢能项目招标信息——2024年12月以后',  # hydrogen project tender information after December 2024
             '各地住宅网签最新数据——2025年1月6日以后']  # latest regional residential online-signing data after 2025-01-06

for query in test_list:
    print(query)
    print('\n')
    print('test bigmodel...')
    start_time = time.time()
    print(run_v4_sync(query))
    end_time = time.time()
    print(f"bigmodel time: {end_time - start_time}")
    print('\n')
    print('*' * 25)
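
The search_with_exa import above goes unused in the loop; the natural next step for this study is timing both search backends on the same queries. A minimal side-by-side sketch, assuming valid API keys are configured for both modules:

for query in test_list:
    for name, fn in (('bigmodel', run_v4_sync), ('exa', search_with_exa)):
        start = time.time()
        try:
            result = fn(query)
        except Exception as e:  # a network or auth failure should not abort the whole study
            result = f"error: {e}"
        # truncate so long result bodies don't flood the console
        print(f"{name} ({time.time() - start:.2f}s): {str(result)[:200]}")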

View File

@@ -0,0 +1,31 @@
import httpx

# NOTE: fill in a valid Exa API key before running.
headers = {
    "x-api-key": "",
    "Content-Type": "application/json"
}


def search_with_exa(query: str) -> str:
    """POST a news search to Exa and return the raw JSON body as a string."""
    url = "https://api.exa.ai/search"
    payload = {
        "query": query,
        "useAutoprompt": True,
        "type": "auto",
        "category": "news",
        "numResults": 5,
        "startCrawlDate": "2024-12-01T00:00:00.000Z",
        "endCrawlDate": "2025-01-21T00:00:00.000Z",
        "startPublishedDate": "2024-12-01T00:00:00.000Z",
        "endPublishedDate": "2025-01-21T00:00:00.000Z",
        "contents": {
            "text": {
                "maxCharacters": 1000,
                "includeHtmlTags": False
            },
            "livecrawl": "always"
        }
    }
    response = httpx.post(url, json=payload, headers=headers, timeout=30)
    response.raise_for_status()  # surface auth/quota errors instead of returning an error body
    return response.text
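
search_with_exa returns the raw response body as a string, so callers must decode it themselves. A minimal parsing sketch; the results/title/url/text field names follow Exa's documented response shape and should be treated as an assumption here:

import json

def parse_exa_results(raw: str) -> list[dict]:
    # pull out only the fields this study cares about from the raw JSON string
    data = json.loads(raw)
    return [{'title': r.get('title'), 'url': r.get('url'), 'text': r.get('text')}
            for r in data.get('results', [])]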

View File

@@ -0,0 +1,21 @@
import os
import sys

import feedparser

core_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')
sys.path.append(core_path)

from utils.general_utils import isURL


def get_links_from_rss(rss_url: str, existing_urls: set, _logger=None) -> list[str]:
    """Parse an RSS feed and return entry links not already in existing_urls."""
    try:
        feed = feedparser.parse(rss_url)
    except Exception as e:
        if _logger:
            _logger.warning(f"RSS feed is not valid: {e}")
        return []
    # keep only entries that carry a valid, previously unseen URL
    return [entry.link for entry in feed.entries
            if entry.get('link') and isURL(entry.link) and entry.link not in existing_urls]
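
Callers are expected to keep their own deduplication set across polls. A minimal usage sketch (the feed URL is a placeholder):

seen = set()
new_links = get_links_from_rss('https://example.com/feed.xml', seen)
seen.update(new_links)
print(f'{len(new_links)} new links to fetch')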

View File

@@ -5,4 +5,5 @@ pydantic
 #json_repair==0.*
 beautifulsoup4
 requests
-crawl4ai==0.4.247
+crawl4ai==0.4.247
+feedparser==6.0.11

View File

@@ -0,0 +1,41 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((app) => {
  const collection = app.findCollectionByNameOrId("pbc_2001081480")

  // remove field
  collection.fields.removeById("number1152796692")

  // remove field
  collection.fields.removeById("bool806155165")

  return app.save(collection)
}, (app) => {
  const collection = app.findCollectionByNameOrId("pbc_2001081480")

  // add field
  collection.fields.addAt(2, new Field({
    "hidden": false,
    "id": "number1152796692",
    "max": null,
    "min": null,
    "name": "per_hours",
    "onlyInt": false,
    "presentable": false,
    "required": false,
    "system": false,
    "type": "number"
  }))

  // add field
  collection.fields.addAt(3, new Field({
    "hidden": false,
    "id": "bool806155165",
    "name": "activated",
    "presentable": false,
    "required": false,
    "system": false,
    "type": "bool"
  }))

  return app.save(collection)
})

View File

@@ -0,0 +1,29 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((app) => {
  const collection = app.findCollectionByNameOrId("pbc_2001081480")

  // add field
  collection.fields.addAt(2, new Field({
    "hidden": false,
    "id": "select2363381545",
    "maxSelect": 1,
    "name": "type",
    "presentable": false,
    "required": false,
    "system": false,
    "type": "select",
    "values": [
      "web",
      "rss"
    ]
  }))

  return app.save(collection)
}, (app) => {
  const collection = app.findCollectionByNameOrId("pbc_2001081480")

  // remove field
  collection.fields.removeById("select2363381545")

  return app.save(collection)
})

View File

@@ -0,0 +1,24 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((app) => {
  const collection = app.findCollectionByNameOrId("pbc_3385864241")

  // remove field
  collection.fields.removeById("bool806155165")

  return app.save(collection)
}, (app) => {
  const collection = app.findCollectionByNameOrId("pbc_3385864241")

  // add field
  collection.fields.addAt(3, new Field({
    "hidden": false,
    "id": "bool806155165",
    "name": "activated",
    "presentable": false,
    "required": false,
    "system": false,
    "type": "bool"
  }))

  return app.save(collection)
})

View File

@@ -0,0 +1,78 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((app) => {
  const collection = new Collection({
    "createRule": null,
    "deleteRule": null,
    "fields": [
      {
        "autogeneratePattern": "[a-z0-9]{15}",
        "hidden": false,
        "id": "text3208210256",
        "max": 15,
        "min": 15,
        "name": "id",
        "pattern": "^[a-z0-9]+$",
        "presentable": false,
        "primaryKey": true,
        "required": true,
        "system": true,
        "type": "text"
      },
      {
        "hidden": false,
        "id": "bool806155165",
        "name": "activated",
        "presentable": false,
        "required": false,
        "system": false,
        "type": "bool"
      },
      {
        "hidden": false,
        "id": "number3171882809",
        "max": null,
        "min": null,
        "name": "per_hour",
        "onlyInt": false,
        "presentable": false,
        "required": true,
        "system": false,
        "type": "number"
      },
      {
        "hidden": false,
        "id": "autodate2990389176",
        "name": "created",
        "onCreate": true,
        "onUpdate": false,
        "presentable": false,
        "system": false,
        "type": "autodate"
      },
      {
        "hidden": false,
        "id": "autodate3332085495",
        "name": "updated",
        "onCreate": true,
        "onUpdate": true,
        "presentable": false,
        "system": false,
        "type": "autodate"
      }
    ],
    "id": "pbc_1970519189",
    "indexes": [],
    "listRule": null,
    "name": "task",
    "system": false,
    "type": "base",
    "updateRule": null,
    "viewRule": null
  });

  return app.save(collection);
}, (app) => {
  const collection = app.findCollectionByNameOrId("pbc_1970519189");

  return app.delete(collection);
})

View File

@@ -0,0 +1,65 @@
/// <reference path="../pb_data/types.d.ts" />
migrate((app) => {
  const collection = app.findCollectionByNameOrId("pbc_1970519189")

  // add field
  collection.fields.addAt(3, new Field({
    "cascadeDelete": false,
    "collectionId": "pbc_3385864241",
    "hidden": false,
    "id": "relation2655548471",
    "maxSelect": 999,
    "minSelect": 0,
    "name": "focus_points",
    "presentable": false,
    "required": false,
    "system": false,
    "type": "relation"
  }))

  // add field
  collection.fields.addAt(4, new Field({
    "cascadeDelete": false,
    "collectionId": "pbc_2001081480",
    "hidden": false,
    "id": "relation3154160227",
    "maxSelect": 999,
    "minSelect": 0,
    "name": "sites",
    "presentable": false,
    "required": false,
    "system": false,
    "type": "relation"
  }))

  // add field
  collection.fields.addAt(5, new Field({
    "autogeneratePattern": "",
    "hidden": false,
    "id": "text2870082381",
    "max": 0,
    "min": 0,
    "name": "search_engine_keywords",
    "pattern": "",
    "presentable": false,
    "primaryKey": false,
    "required": false,
    "system": false,
    "type": "text"
  }))

  return app.save(collection)
}, (app) => {
  const collection = app.findCollectionByNameOrId("pbc_1970519189")

  // remove field
  collection.fields.removeById("relation2655548471")

  // remove field
  collection.fields.removeById("relation3154160227")

  // remove field
  collection.fields.removeById("text2870082381")

  return app.save(collection)
})
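
Taken together, the migrations above leave the new task collection with an activated flag, a per_hour frequency, relations to the focus-point (pbc_3385864241) and site (pbc_2001081480) collections, and a free-text search_engine_keywords field. A hedged sketch of how a scheduler might read active tasks through PocketBase's records API; the base URL is a placeholder and authentication is omitted:

import requests

PB_BASE = 'http://127.0.0.1:8090'  # placeholder PocketBase address

def fetch_active_tasks() -> list[dict]:
    # list task records, expanding the relation fields defined in the migrations
    resp = requests.get(
        f'{PB_BASE}/api/collections/task/records',
        params={'filter': '(activated=true)', 'expand': 'focus_points,sites'},
        timeout=10,
    )
    resp.raise_for_status()
    return resp.json().get('items', [])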

View File

@@ -10,6 +10,9 @@ sys.path.append(core_path)
 from scrapers import *
 from agents.get_info import pre_process
+save_dir = 'webpage_samples'

 def check_url_text(text):
     common_chars = ',.!;:,;:、一二三四五六七八九十#*@% \t\n\r|*-_…>#'
     print(f"processing: {text}")
@@ -118,7 +121,7 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('--test_file', '-F', type=str, default='')
     parser.add_argument('--sample_dir', '-D', type=str, default='')
-    parser.add_argument('--record_folder', '-R', type=str, default='')
+    parser.add_argument('--record_folder', '-R', type=str, default=save_dir)
     args = parser.parse_args()
     test_file = args.test_file