mirror of https://github.com/TeamWiseFlow/wiseflow.git
synced 2025-02-02 18:28:46 +08:00

Commit cf709d7f05: update manual

asset/wiseflow_arch.png (new binary file, 64 KiB; binary content not shown)
@@ -11,4 +11,4 @@ docker-compose.yaml
 Dockerfile
 README.md
 backend/__pycache__
-backend/AWtest
+backend/WStest
client/.gitignore (vendored, 2 lines changed)
@@ -1,4 +1,4 @@
 .env
 .venv/
 pb/pb_data/
-backend/AWtest/
+backend/WStest/
@@ -18,16 +18,21 @@
 - character: the persona the LLM should take when mining leads (this determines its focus and standpoint)
 - focus: which aspects of leads to pay attention to
 - focus_type: the type of leads
-- good_samples: the lead-description pattern you want the LLM to follow (give two samples)
+- good_samples1: the lead-description pattern you want the LLM to follow (give two samples)
+- good_samples2: the lead-description pattern you want the LLM to follow (give two samples)
 - bad_samples: lead-description patterns to avoid
 - report_type: the report type

-- [sites]: list your sources under this section, one URL per line.
+### 4. Edit the sites.txt file
+
+This file specifies the sources to be monitored locally, one URL per line. It can be changed at any time; the latest version is read before each task run.
+
 If you only crawl sources that already have dedicated scrapers configured, you can edit scraper_map in scrapers/__init__.py directly and leave this file empty.

 See backend/scrapers/README.md for details on dedicated scrapers.

+**Note: although the wiseflow client ships with a general-purpose crawler that can crawl and parse static news pages reasonably well, we strongly recommend using our data subscription service or writing a dedicated scraper yourself.**
+
 ## Reference: registration addresses for each service

 - Alibaba DashScope (Lingji) LLM API: https://dashscope.aliyun.com/
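The hunk above tells users to register dedicated scrapers in scraper_map inside scrapers/__init__.py, or to leave it empty. As a rough sketch only (the actual contents of scrapers/__init__.py are not part of this commit, and the function and URL below are made-up placeholders), the mapping presumably looks like this:

# scrapers/__init__.py, hypothetical sketch, not from this commit

def example_site_scraper(expiration, existings):
    # Placeholder for a dedicated scraper you write yourself; see backend/scrapers/README.md.
    return []

# Map a source URL to its dedicated scraper; leave the dict empty to rely on the general crawler.
scraper_map = {
    "https://example-news-site.com/": example_site_scraper,
}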
@@ -4,22 +4,13 @@
 import schedule
 import time
 from work_process import ServiceProcesser
-import configparser
-
-config = configparser.ConfigParser()
-config.read('../config.ini')
-
-if config.has_section('sites'):
-    web_pages = config['sites']
-    urls = [value for key, value in web_pages.items()]
-else:
-    urls = []

 sp = ServiceProcesser()
-sp(sites=urls)
-
-'''
+
 def task():
+    with open('../sites.txt', 'r', encoding='utf-8') as f:
+        urls = [line.strip() for line in f.readlines() if line.strip()]
     sp(sites=urls)

@@ -29,6 +20,3 @@ schedule.every().day.at("01:17").do(task)
 while True:
     schedule.run_pending()
     time.sleep(60)
-site1 = https://www.welivesecurity.com/en/
-site2 = https://www.scmagazine.com/
-'''
@@ -125,6 +125,25 @@ def is_chinese(string):
     return (non_chinese_count/len(string)) < 0.68


+def extract_and_convert_dates(input_string):
+    # regular expressions for the supported date formats
+    patterns = [
+        r'(\d{4})-(\d{2})-(\d{2})',    # YYYY-MM-DD
+        r'(\d{4})/(\d{2})/(\d{2})',    # YYYY/MM/DD
+        r'(\d{4})\.(\d{2})\.(\d{2})',  # YYYY.MM.DD
+        r'(\d{4})\\(\d{2})\\(\d{2})',  # YYYY\MM\DD
+        r'(\d{4})(\d{2})(\d{2})'       # YYYYMMDD
+    ]
+
+    matches = []
+    for pattern in patterns:
+        matches = re.findall(pattern, input_string)
+        if matches:
+            break
+    if matches:
+        return ''.join(matches[0])
+    return None
+
 """
 # from InternLM/huixiangdou
 # another awesome work
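A quick usage sketch of the helper added above (the sample strings are made up; the results follow from the patterns as written, which normalize any recognized date to YYYYMMDD and otherwise return None):

from general_utils import extract_and_convert_dates  # import path as used elsewhere in this commit

print(extract_and_convert_dates("published 2024-03-01 10:00"))  # -> '20240301'
print(extract_and_convert_dates("updated 2024/03/01"))          # -> '20240301'
print(extract_and_convert_dates("no date here"))                # -> None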
@@ -85,7 +85,7 @@ def search_insight(keyword: str, exist_urls: list[Union[str, Path]], knowledge:
         if url in exist_urls:
             continue
         exist_urls.append(url)
-        flag, value = simple_crawler(url, logger)
+        flag, value = simple_crawler(url)
         if flag != 11:
             continue
         from_site = urlparse(url).netloc
@@ -15,7 +15,6 @@
 Input:
 - expiration: a datetime date.date() object; the scraper should only fetch articles published on or after this date
 - existings: [str], a list of article URLs already in the database; the scraper should ignore URLs in this list
-- logger: the logger object of the main process; if the scraper needs its own logger, it can ignore this one

 Output:
 - [dict], a list of results, one dict per article, in the following format:
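To make the contract above concrete, here is a minimal sketch of a dedicated scraper conforming to it. This is an illustration only: the function name, the hard-coded sample item, and the exact result-dict keys (url, title, abstract, content, publish_time) are assumptions inferred from the crawlers elsewhere in this commit, not part of the README itself.

from datetime import date

def example_site_scraper(expiration: date, existings: list[str]) -> list[dict]:
    # Stand-in for the site-specific fetching and parsing a real scraper would do.
    raw_items = [
        {'url': 'https://example.com/post-1', 'title': 'Sample article',
         'content': 'Full article text ...', 'publish_date': date(2024, 3, 1)},
    ]
    results = []
    for item in raw_items:
        if item['url'] in existings:           # ignore URLs already in the database
            continue
        if item['publish_date'] < expiration:  # only keep articles on or after the expiration date
            continue
        results.append({
            'url': item['url'],
            'title': item['title'],
            'abstract': item['content'][:100],
            'content': item['content'],
            'publish_time': item['publish_date'].strftime('%Y%m%d'),
        })
    return results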
@@ -2,18 +2,22 @@ from pathlib import Path
 from urllib.parse import urlparse
 import re
 from .simple_crawler import simple_crawler
-import json
-import requests
+import httpx
 from bs4 import BeautifulSoup
 from bs4.element import Comment
 from llms.dashscope_wrapper import dashscope_llm
 from datetime import datetime, date
 from requests.compat import urljoin
 import chardet
+from general_utils import extract_and_convert_dates
+from get_logger import get_logger
+import os


 header = {
     'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}
+
+project_dir = os.environ.get("PROJECT_DIR", "")
+logger = get_logger(name='general_scraper', file=os.path.join(project_dir, f'general_scraper.log'))


 def tag_visible(element: Comment) -> bool:
@@ -35,83 +39,56 @@ def text_from_soup(soup: BeautifulSoup) -> str:


 def parse_html_content(out: str) -> dict:
-    # the llm sometimes emits \n or \t outside the quotes of keys or values; strip them all to be safe, since later analysis ignores line breaks in the content anyway
+    dct = {'title': '', 'abstract': '', 'content': '', 'publish_time': ''}
     pattern = re.compile(r'\"\"\"(.*?)\"\"\"', re.DOTALL)
     result = pattern.findall(out)
-    out = result[0]
-    dict_str = out.strip("```").strip("python").strip("json").strip()
-    dict_str = dict_str.replace("\n", "").replace("\t", "")
-    # first pull out the content between {} with a regex
-    dict_str = re.findall(r'{(.*?)}', dict_str)
-    # dict_str = dict_str[0].replace("'", '"')  # too aggressive, can corrupt the content
-    # json.loads requires double quotes, and \n etc. must be escaped
-    dct = json.loads('{' + dict_str[0] + '}')
-    date_str = re.findall(r"\d{4}-\d{2}-\d{2}", dct['publish_time'])
-    if date_str:
-        dct['publish_time'] = date_str[0].replace("-", "")
-    else:
-        date_str = re.findall(r"\d{4}\.\d{2}\.\d{2}", dct['publish_time'])
-        if date_str:
-            dct['publish_time'] = date_str[0].replace(".", "")
-        else:
-            date_str = re.findall(r"\d{4}\d{2}\d{2}", dct['publish_time'])
-            if date_str:
-                dct['publish_time'] = date_str[0]
-            else:
-                dct['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")
+    result = result[0].strip()
+    dict_strs = result.split('||')
+    if not dict_strs:
+        dict_strs = result.split('|||')
+        if not dict_strs:
+            return dct
+    if len(dict_strs) == 3:
+        dct['title'] = dict_strs[0].strip()
+        dct['content'] = dict_strs[1].strip()
+    elif len(dict_strs) == 4:
+        dct['title'] = dict_strs[0].strip()
+        dct['content'] = dict_strs[2].strip()
+        dct['abstract'] = dict_strs[1].strip()
+    else:
+        return dct
+    date_str = extract_and_convert_dates(dict_strs[-1])
+    if date_str:
+        dct['publish_time'] = date_str
+    else:
+        dct['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")
     return dct


-sys_info = '''You are an HTML page parser. You will receive text extracted from an HTML page; parse out its title, abstract, content and publish date.
-The publish date must be in the format XXXX-XX-XX, or empty if none is found. The content must not include the title, author or publish date.
-You must output a Python dict with keys and values wrapped in double quotes; the keys are title, abstract, content and publish_time. Wrap the whole output in triple quotes, like this:
+# qwen1.5-72b is too error-prone when asked to produce JSON: web pages vary too much, e.g. they often contain plain English double quotes, which then breaks json.loads...
+sys_info = '''You are an HTML page parser. You will receive text extracted from an HTML page; parse out its title, abstract, content and publish date, with the publish date in the format YYYY-MM-DD.
+Return the result in the following format (wrap the whole output in triple quotes):
 """
-{"title": "parsed title", "abstract": "parsed abstract", "content": "parsed content", "publish_time": "parsed publish date XXXX-XX-XX"}
-"""'''
+title||abstract||content||publish date XXXX-XX-XX
+"""
+'''


-def llm_crawler(url: str | Path, logger=None) -> (int, dict):
+def llm_crawler(url: str | Path) -> (int, dict):
     """
     Returns the article-info dict and a flag: negative means error, 0 means no result, 11 means success.
     Reference: https://mp.weixin.qq.com/s/4J-kofsfFDiV1FxGlTJLfA
-    Test URLs:
-    url = "https://so.html5.qq.com/page/real/search_news?docid=70000021_40665eb6afe80152"
-    url = "https://mp.weixin.qq.com/s?src=11&timestamp=1709999167&ver=5128&signature=e0Tssc4COc*p-RkKaPwUMrGePUxko8N621VxARnI8uKDg*l5C7Z8gBC6RDUAnyGqvmzJ5WEzvaO-T7GvMRw9LwNaJS3Hh2tyaITdmsaVtY9JsSmsidX6u4SqxirGsRdo&new=1"
     """
     # send an HTTP request to fetch the page content
     try:
-        response = requests.get(url, timeout=60)
-    except:
-        if logger:
-            logger.error(f"cannot connect {url}")
-        else:
-            print(f"cannot connect {url}")
-        return -7, {}
-
-    if response.status_code != 200:
-        if logger:
-            logger.error(f"cannot connect {url}")
-        else:
-            print(f"cannot connect {url}")
-        return -7, {}
-
-    rawdata = response.content
-    encoding = chardet.detect(rawdata)['encoding']
-    if encoding is not None and encoding.lower() == 'utf-8':
-        try:
-            text = rawdata.decode(encoding)
-        except:
-            if logger:
-                logger.error(f"{url} decode error, aborting")
-            else:
-                print(f"{url} decode error, aborting")
-            return 0, {}
-    else:
-        if logger:
-            logger.error(f"{url} undetected coding, aborting")
-        else:
-            print(f"{url} undetected coding, aborting")
-        return 0, {}
+        with httpx.Client() as client:
+            response = client.get(url, headers=header, timeout=30)
+        rawdata = response.content
+        encoding = chardet.detect(rawdata)['encoding']
+        text = rawdata.decode(encoding)
+    except Exception as e:
+        logger.error(e)
+        return -7, {}

     # parse the HTML content with BeautifulSoup
     soup = BeautifulSoup(text, "html.parser")
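The hunk above switches parse_html_content from JSON parsing to a '||'-delimited format. A small usage sketch, assuming parse_html_content from the hunk above is in scope; the input string is a made-up example of what the LLM is now prompted to return:

llm_output = '"""Sample article title||A one-line abstract||The full article text ...||2024-03-01"""'
print(parse_html_content(llm_output))
# -> {'title': 'Sample article title', 'abstract': 'A one-line abstract',
#     'content': 'The full article text ...', 'publish_time': '20240301'}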
@@ -120,17 +97,12 @@ def llm_crawler(url: str | Path, logger=None) -> (int, dict):
     html_lines = [line.strip() for line in html_lines if line.strip()]
     html_text = "\n".join(html_lines)
     if len(html_text) > 29999:
-        if logger:
-            logger.warning(f"{url} content too long for llm parsing")
-        else:
-            print(f"{url} content too long for llm parsing")
+        logger.warning(f"{url} content too long for llm parsing")
         return 0, {}

-    if not html_text or html_text.startswith('服务器错误') or html_text.startswith('您访问的页面') or html_text.startswith('403'):
-        if logger:
-            logger.warning(f"can not get {url} from the Internet")
-        else:
-            print(f"can not get {url} from the Internet")
+    if not html_text or html_text.startswith('服务器错误') or html_text.startswith('您访问的页面') or html_text.startswith('403')\
+            or html_text.startswith('出错了'):
+        logger.warning(f"can not get {url} from the Internet")
         return -7, {}

     messages = [
@@ -140,19 +112,13 @@ def llm_crawler(url: str | Path, logger=None) -> (int, dict):
     llm_output = dashscope_llm(messages, "qwen1.5-72b-chat", logger=logger)
     try:
         info = parse_html_content(llm_output)
-    except Exception as e:
+    except Exception:
         msg = f"can not parse {llm_output}"
-        if logger:
-            logger.warning(msg)
-        else:
-            print(msg)
+        logger.debug(msg)
         return 0, {}

-    if len(info['title']) < 5 or len(info['content']) < 24:
-        if logger:
-            logger.warning(f"{info} not valid")
-        else:
-            print(f"{info} not valid")
+    if len(info['title']) < 4 or len(info['content']) < 24:
+        logger.debug(f"{info} not valid")
         return 0, {}

     info["url"] = str(url)
@@ -185,21 +151,12 @@ def llm_crawler(url: str | Path, logger=None) -> (int, dict):
     return 11, info


-def general_scraper(site: str, expiration: date, existing: list[str], logger=None) -> list[dict]:
+def general_scraper(site: str, expiration: date, existing: list[str]) -> list[dict]:
     try:
-        response = requests.get(site, header, timeout=60)
-    except:
-        if logger:
-            logger.error(f"cannot connect {site}")
-        else:
-            print(f"cannot connect {site}")
-        return []
-
-    if response.status_code != 200:
-        if logger:
-            logger.error(f"cannot connect {site}")
-        else:
-            print(f"cannot connect {site}")
+        with httpx.Client() as client:
+            response = client.get(site, headers=header, timeout=30)
+    except Exception as e:
+        logger.error(e)
         return []

     page_source = response.text
@@ -209,25 +166,18 @@ def general_scraper(site: str, expiration: date, existing: list[str], logger=None) -> list[dict]:
     base_url = parsed_url.scheme + '://' + parsed_url.netloc
     urls = [urljoin(base_url, link["href"]) for link in soup.find_all("a", href=True)]
     if not urls:
-        if logger:
-            logger.warning(f"can not find any link from {site}, maybe it's an article site...")
+        logger.warning(f"can not find any link from {site}, maybe it's an article site...")
         if site in existing:
-            if logger:
-                logger.warning(f"{site} has been crawled before, skip it")
-            else:
-                print(f"{site} has been crawled before, skip it")
+            logger.debug(f"{site} has been crawled before, skip it")
             return []
-        flag, result = simple_crawler(site, logger=logger)
+        flag, result = simple_crawler(site)
         if flag != 11:
-            flag, result = llm_crawler(site, logger=logger)
+            flag, result = llm_crawler(site)
             if flag != 11:
                 return []
         publish_date = datetime.strptime(result['publish_time'], '%Y%m%d')
         if publish_date.date() < expiration:
-            if logger:
-                logger.warning(f"{site} is too old, skip it")
-            else:
-                print(f"{site} is too old, skip it")
+            logger.debug(f"{site} is too old, skip it")
             return []
         else:
             return [result]
@@ -235,23 +185,17 @@ def general_scraper(site: str, expiration: date, existing: list[str], logger=None) -> list[dict]:
     articles = []
     for url in urls:
         if url in existing:
-            if logger:
-                logger.warning(f"{url} has been crawled before, skip it")
-            else:
-                print(f"{url} has been crawled before, skip it")
+            logger.debug(f"{url} has been crawled before, skip it")
             continue
         existing.append(url)
-        flag, result = simple_crawler(url, logger=logger)
+        flag, result = simple_crawler(url)
         if flag != 11:
-            flag, result = llm_crawler(url, logger=logger)
+            flag, result = llm_crawler(url)
             if flag != 11:
                 continue
         publish_date = datetime.strptime(result['publish_time'], '%Y%m%d')
         if publish_date.date() < expiration:
-            if logger:
-                logger.warning(f"{url} is too old, skip it")
-            else:
-                print(f"{url} is too old, skip it")
+            logger.debug(f"{url} is too old, skip it")
         else:
             articles.append(result)
@@ -1,88 +1,52 @@
 from gne import GeneralNewsExtractor
-import requests
+import httpx
 from bs4 import BeautifulSoup
 from datetime import datetime
 from pathlib import Path
-import re
+from general_utils import extract_and_convert_dates
 import chardet
+from get_logger import get_logger
+import os

 extractor = GeneralNewsExtractor()
 header = {
     'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}

+project_dir = os.environ.get("PROJECT_DIR", "")
+logger = get_logger(name='simple_crawler', file=os.path.join(project_dir, f'simple_crawler.log'))

-def simple_crawler(url: str | Path, logger=None) -> (int, dict):
+
+def simple_crawler(url: str | Path) -> (int, dict):
     """
     Returns the article-info dict and a flag: negative means error, 0 means no result, 11 means success.
     """
     try:
-        response = requests.get(url, header, timeout=60)
-    except:
-        if logger:
-            logger.error(f"cannot connect {url}")
-        else:
-            print(f"cannot connect {url}")
-        return -7, {}
-
-    if response.status_code != 200:
-        if logger:
-            logger.error(f"cannot connect {url}")
-        else:
-            print(f"cannot connect {url}")
-        return -7, {}
-
-    rawdata = response.content
-    encoding = chardet.detect(rawdata)['encoding']
-    if encoding is not None and encoding.lower() == 'utf-8':
-        try:
-            text = rawdata.decode(encoding)
-        except:
-            if logger:
-                logger.error(f"{url} decode error, aborting")
-            else:
-                print(f"{url} decode error, aborting")
-            return 0, {}
-    else:
-        if logger:
-            logger.error(f"{url} undetected coding, aborting")
-        else:
-            print(f"{url} undetected coding, aborting")
-        return 0, {}
+        with httpx.Client() as client:
+            response = client.get(url, headers=header, timeout=30)
+        rawdata = response.content
+        encoding = chardet.detect(rawdata)['encoding']
+        text = rawdata.decode(encoding)
+    except Exception as e:
+        logger.error(e)
+        return -7, {}

     result = extractor.extract(text)
     if not result:
-        if logger:
-            logger.error(f"gne cannot extract {url}")
-        else:
-            print(f"gne cannot extract {url}")
+        logger.error(f"gne cannot extract {url}")
         return 0, {}

-    if len(result['title']) < 5 or len(result['content']) < 24:
-        if logger:
-            logger.warning(f"{result} not valid")
-        else:
-            print(f"{result} not valid")
+    if len(result['title']) < 4 or len(result['content']) < 24:
+        logger.info(f"{result} not valid")
         return 0, {}

-    if result['title'].startswith('服务器错误') or result['title'].startswith('您访问的页面') or result['title'].startswith('403'):
-        if logger:
-            logger.warning(f"can not get {url} from the Internet")
-        else:
-            print(f"can not get {url} from the Internet")
+    if result['title'].startswith('服务器错误') or result['title'].startswith('您访问的页面') or result['title'].startswith('403')\
+            or result['content'].startswith('This website uses cookies') or result['title'].startswith('出错了'):
+        logger.warning(f"can not get {url} from the Internet")
         return -7, {}

-    date_str = re.findall(r"\d{4}-\d{2}-\d{2}", result['publish_time'])
+    date_str = extract_and_convert_dates(result['publish_time'])
     if date_str:
-        result['publish_time'] = date_str[0].replace("-", "")
+        result['publish_time'] = date_str
     else:
-        date_str = re.findall(r"\d{4}\.\d{2}\.\d{2}", result['publish_time'])
-        if date_str:
-            result['publish_time'] = date_str[0].replace(".", "")
-        else:
-            date_str = re.findall(r"\d{4}\d{2}\d{2}", result['publish_time'])
-            if date_str:
-                result['publish_time'] = date_str[0]
-            else:
-                result['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")
+        result['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")

@@ -93,7 +57,7 @@ def simple_crawler(url: str | Path, logger=None) -> (int, dict):
             result['abstract'] = meta_description["content"]
         else:
             result['abstract'] = ''
-    except:
+    except Exception:
         result['abstract'] = ''

     result['url'] = str(url)
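A short usage sketch of the flag contract documented in the docstring above (the URL is a placeholder; flag 11 means success, 0 means no result, negative means an error):

flag, article = simple_crawler("https://example.com/some-post")
if flag == 11:
    print(article['title'], article['publish_time'])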
@@ -59,13 +59,13 @@ class ServiceProcesser:
         # Build the list of sources to scan: default to everything in scraper_map if none are specified; a specified source may also be absent from scraper_map, in which case the general scraper should be used
         sources = sites if sites else list(scraper_map.keys())
         new_articles = []
-        with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
+        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
             futures = []
             for site in sources:
                 if site in scraper_map:
-                    futures.append(executor.submit(scraper_map[site], expiration, existings, self.logger))
+                    futures.append(executor.submit(scraper_map[site], expiration, existings))
                 else:
-                    futures.append(executor.submit(general_scraper, site, expiration, existings, self.logger))
+                    futures.append(executor.submit(general_scraper, site, expiration, existings))
             concurrent.futures.wait(futures)
             for future in futures:
                 try:
|
@ -7,8 +7,3 @@ good_sample1 = 黑客组织Rhysida声称已入侵中国国有能源公司
|
|||||||
good_sample2 = 差不多一百万份包含未成年人数据(包括家庭地址和照片)的文件对互联网上的任何人都开放,对孩子构成威胁
|
good_sample2 = 差不多一百万份包含未成年人数据(包括家庭地址和照片)的文件对互联网上的任何人都开放,对孩子构成威胁
|
||||||
bad_sample = 黑客组织活动最近频发
|
bad_sample = 黑客组织活动最近频发
|
||||||
report_type = 网络安全情报
|
report_type = 网络安全情报
|
||||||
|
|
||||||
[sites]
|
|
||||||
site3 = https://www.hackread.com/
|
|
||||||
site2 = http://sh.people.com.cn/
|
|
||||||
site1 = https://www.xuexi.cn/
|
|
client/sites.txt (new file, 5 lines)
@@ -0,0 +1,5 @@
+https://www.hackread.com/
+http://sh.people.com.cn/
+https://www.xuexi.cn/
+https://www.defensenews.com/
+https://www.meritalk.com
@@ -1 +1 @@
-v0.2.0
+v0.2.1