stricter crawler filter

bigbrother666 2024-04-08 17:58:29 +08:00
parent 53fc3d0646
commit 25abb316b3
7 changed files with 101 additions and 30 deletions

View File

@@ -178,16 +178,16 @@ class BackendService:
return self.build_out(-2, 'insight not found')
article_ids = insight[0]['articles']
if not article_ids:
self.logger.error(f'insight {insight_id} has no articles')
return self.build_out(-2, 'can not find articles for insight')
if article_ids:
article_list = [self.pb.read('articles', fields=['url'], filter=f'id="{_id}"') for _id in article_ids]
url_list = [_article[0]['url'] for _article in article_list if _article]
else:
url_list = []
article_list = [self.pb.read('articles', fields=['url'], filter=f'id="{_id}"') for _id in article_ids]
url_list = [_article[0]['url'] for _article in article_list if _article]
flag, search_result = search_insight(insight[0]['content'], url_list, logger=self.logger)
if flag <= 0:
self.logger.debug('no search result, nothing happen')
return self.build_out(flag, '')
return self.build_out(flag, 'search engine error or no result')
for item in search_result:
new_article_id = self.pb.add(collection_name='articles', body=item)
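
For reference, the tightened lookup flow in this hunk can be sketched on its own as below. The pb object stands in for the project's PocketBase-style client; only the read(collection, fields, filter) call visible in the diff is assumed, and collect_urls is an illustrative name, not a function in the repository.

def collect_urls(pb, insight: dict, logger=None) -> list:
    # Bail out early when the insight has no linked articles, otherwise resolve
    # every article id to its stored url (read() returns a list of matching records).
    article_ids = insight.get('articles', [])
    if not article_ids:
        if logger:
            logger.error('insight has no articles')
        return []
    article_list = [pb.read('articles', fields=['url'], filter=f'id="{_id}"') for _id in article_ids]
    return [_article[0]['url'] for _article in article_list if _article]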

View File

@@ -16,8 +16,9 @@ else:
urls = []
sp = ServiceProcesser()
sp(sites=urls)
'''
def task():
sp(sites=urls)
@@ -28,3 +29,6 @@ schedule.every().day.at("01:17").do(task)
while True:
schedule.run_pending()
time.sleep(60)
site1 = https://www.welivesecurity.com/en/
site2 = https://www.scmagazine.com/
'''
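
For context, the scheduling pattern used in this file boils down to the short standalone example below; the 01:17 run time mirrors the diff, while the task body is a placeholder for sp(sites=urls).

import time
import schedule

def task():
    print("running scheduled crawl")  # placeholder for sp(sites=urls)

schedule.every().day.at("01:17").do(task)  # run once per day at 01:17

while True:
    schedule.run_pending()
    time.sleep(60)  # poll the scheduler once a minute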

View File

@ -1,7 +1,7 @@
# Wrapper around the Aliyun DashScope API
# Non-streaming interface
# For compatibility, both input and output use the message format, consistent with the OpenAI SDK
import time
from http import HTTPStatus
import dashscope
import random
@@ -62,7 +62,6 @@ def dashscope_llm(messages: list,
if __name__ == '__main__':
import time
from pprint import pprint
# logging.basicConfig(level=logging.DEBUG)
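
As the header comments describe, this wrapper makes non-streaming DashScope calls with OpenAI-style message input and output. A minimal sketch of such a call is shown below; the model name is illustrative and a DASHSCOPE_API_KEY environment variable is assumed to be set.

from http import HTTPStatus
import dashscope

messages = [{'role': 'user', 'content': 'hello'}]
response = dashscope.Generation.call(model='qwen-turbo',       # illustrative model name
                                     messages=messages,
                                     result_format='message')  # OpenAI-compatible output shape
if response.status_code == HTTPStatus.OK:
    print(response.output.choices[0]['message']['content'])
else:
    print(f"request failed: {response.code} {response.message}")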

View File

@@ -9,6 +9,7 @@ from bs4.element import Comment
from llms.dashscope_wrapper import dashscope_llm
from datetime import datetime, date
from requests.compat import urljoin
import chardet
header = {
@@ -35,6 +36,9 @@ def text_from_soup(soup: BeautifulSoup) -> str:
def parse_html_content(out: str) -> dict:
# The LLM output sometimes has \n or \t outside the quotes around keys or values; to be safe, strip them all, since the later LLM analysis does not look at line breaks in the content anyway
pattern = re.compile(r'\"\"\"(.*?)\"\"\"', re.DOTALL)
result = pattern.findall(out)
out = result[0]
dict_str = out.strip("```").strip("python").strip("json").strip()
dict_str = dict_str.replace("\n", "").replace("\t", "")
# First use a regex to pull out the content inside the {}
@@ -46,19 +50,24 @@ def parse_html_content(out: str) -> dict:
if date_str:
dct['publish_time'] = date_str[0].replace("-", "")
else:
date_str = re.findall(r"\d{4}\d{2}\d{2}", dct['publish_time'])
date_str = re.findall(r"\d{4}\.\d{2}\.\d{2}", dct['publish_time'])
if date_str:
dct['publish_time'] = date_str[0]
dct['publish_time'] = date_str[0].replace(".", "")
else:
dct['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")
date_str = re.findall(r"\d{4}\d{2}\d{2}", dct['publish_time'])
if date_str:
dct['publish_time'] = date_str[0]
else:
dct['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")
return dct
sys_info = """给你一段从网页html文件中提取的所有文本请尝试输出其标题、摘要、内容和发布日期。
sys_info = '''你是一个html网页解析器你将接收一段用户从网页html文件中提取的文本请解析出其标题、摘要、内容和发布日期。
发布日期的格式为XXXX-XX-XX如果找不到则为空内容不要包含标题作者和发布日期
输出格式为Python字典key分别为titleabstractcontent和publish_time
如果你识别出给定文本大部分既不是中文也不是英文那么请输出无法解析
请务必按照Python字典的格式输出key和value使用双引号包裹key分别为titleabstractcontent和publish_time输出结果请整体用三引号包裹如下所示
"""
{"title": "解析出的标题", "abstract": "解析出的摘要", "content": "解析出的内容", "publish_time": "解析出的发布日期XXXX-XX-XX"}
"""'''
def llm_crawler(url: str | Path, logger=None) -> (int, dict):
@@ -85,13 +94,38 @@ def llm_crawler(url: str | Path, logger=None) -> (int, dict):
else:
print(f"cannot connect {url}")
return -7, {}
# todo: both llm_crawler and simple_crawler need a check here to filter out pages that are garbled or contain very little text
rawdata = response.content
encoding = chardet.detect(rawdata)['encoding']
if encoding is not None and encoding.lower() == 'utf-8':
try:
text = rawdata.decode(encoding)
except:
if logger:
logger.error(f"{url} decode error, aborting")
else:
print(f"{url} decode error, aborting")
return 0, {}
else:
if logger:
logger.error(f"{url} undetected coding, aborting")
else:
print(f"{url} undetected coding, aborting")
return 0, {}
# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(response.text, "html.parser")
soup = BeautifulSoup(text, "html.parser")
html_text = text_from_soup(soup)
html_lines = html_text.split('\n')
html_lines = [line.strip() for line in html_lines if line.strip()]
html_text = "\n".join(html_lines)
if len(html_text) > 29999:
if logger:
logger.warning(f"{url} content too long for llm parsing")
else:
print(f"{url} content too long for llm parsing")
return 0, {}
if not html_text or html_text.startswith('服务器错误') or html_text.startswith('您访问的页面') or html_text.startswith('403'):
if logger:
logger.warning(f"can not get {url} from the Internet")
@@ -114,10 +148,14 @@ def llm_crawler(url: str | Path, logger=None) -> (int, dict):
print(msg)
return 0, {}
info["url"] = str(url)
if not info["title"] or not info["content"]:
if len(info['title']) < 5 or len(info['content']) < 24:
if logger:
logger.warning(f"{info} not valid")
else:
print(f"{info} not valid")
return 0, {}
info["url"] = str(url)
# Extract image links; leave empty if none can be found
image_links = []
images = soup.find_all("img")
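
Taken on its own, the new pre-filter in llm_crawler (charset detection, UTF-8-only decoding, and a length cap before LLM parsing) looks roughly like the sketch below. fetch_utf8_text is an illustrative name, soup.get_text stands in for the project's text_from_soup helper, and the timeout value is an assumption.

import chardet
import requests
from bs4 import BeautifulSoup

def fetch_utf8_text(url: str) -> str | None:
    # Returns cleaned page text, or None when the page should be skipped.
    response = requests.get(url, timeout=30)
    rawdata = response.content
    encoding = chardet.detect(rawdata)['encoding']
    if encoding is None or encoding.lower() != 'utf-8':
        return None  # undetected or non-UTF-8 encoding: abort, as in the diff
    try:
        text = rawdata.decode(encoding)
    except UnicodeDecodeError:
        return None  # decode error: abort
    soup = BeautifulSoup(text, "html.parser")
    lines = [line.strip() for line in soup.get_text("\n").split("\n") if line.strip()]
    cleaned = "\n".join(lines)
    return None if len(cleaned) > 29999 else cleaned  # too long for LLM parsing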

View File

@@ -4,6 +4,7 @@ from bs4 import BeautifulSoup
from datetime import datetime
from pathlib import Path
import re
import chardet
extractor = GeneralNewsExtractor()
@@ -31,15 +32,39 @@ def simple_crawler(url: str | Path, logger=None) -> (int, dict):
print(f"cannot connect {url}")
return -7, {}
text = response.text
rawdata = response.content
encoding = chardet.detect(rawdata)['encoding']
if encoding is not None and encoding.lower() == 'utf-8':
try:
text = rawdata.decode(encoding)
except:
if logger:
logger.error(f"{url} decode error, aborting")
else:
print(f"{url} decode error, aborting")
return 0, {}
else:
if logger:
logger.error(f"{url} undetected coding, aborting")
else:
print(f"{url} undetected coding, aborting")
return 0, {}
result = extractor.extract(text)
if not result or not result['title'] or not result['content']:
if not result:
if logger:
logger.error(f"gne cannot extract {url}")
else:
print(f"gne cannot extract {url}")
return 0, {}
if len(result['title']) < 5 or len(result['content']) < 24:
if logger:
logger.warning(f"{result} not valid")
else:
print(f"{result} not valid")
return 0, {}
if result['title'].startswith('服务器错误') or result['title'].startswith('您访问的页面') or result['title'].startswith('403'):
if logger:
logger.warning(f"can not get {url} from the Internet")
@@ -51,11 +76,15 @@ def simple_crawler(url: str | Path, logger=None) -> (int, dict):
if date_str:
result['publish_time'] = date_str[0].replace("-", "")
else:
date_str = re.findall(r"\d{4}\d{2}\d{2}", result['publish_time'])
date_str = re.findall(r"\d{4}\.\d{2}\.\d{2}", result['publish_time'])
if date_str:
result['publish_time'] = date_str[0]
result['publish_time'] = date_str[0].replace(".", "")
else:
result['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")
date_str = re.findall(r"\d{4}\d{2}\d{2}", result['publish_time'])
if date_str:
result['publish_time'] = date_str[0]
else:
result['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")
soup = BeautifulSoup(text, "html.parser")
try:
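
The same publish_time cascade appears in both llm_crawler and simple_crawler: try YYYY-MM-DD, then YYYY.MM.DD, then a bare eight-digit date, and finally fall back to today. In isolation it can be written as the sketch below (the hyphenated first pattern is inferred from the surrounding context; normalize_publish_time is an illustrative name, not a function in the repository).

import re
from datetime import datetime

def normalize_publish_time(raw: str) -> str:
    # Normalize whatever date string the extractor produced to YYYYMMDD.
    for pattern, sep in ((r"\d{4}-\d{2}-\d{2}", "-"),
                         (r"\d{4}\.\d{2}\.\d{2}", "."),
                         (r"\d{4}\d{2}\d{2}", "")):
        found = re.findall(pattern, raw)
        if found:
            return found[0].replace(sep, "") if sep else found[0]
    return datetime.strftime(datetime.today(), "%Y%m%d")

# e.g. normalize_publish_time("posted on 2024.04.08") returns "20240408"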

View File

@@ -76,8 +76,6 @@ class ServiceProcesser:
for value in new_articles:
if not value:
continue
if not value['title'] or not value['content']:
continue
from_site = urlparse(value['url']).netloc
from_site = from_site.replace('www.', '')
from_site = from_site.split('.')[0]
@@ -137,7 +135,10 @@ class ServiceProcesser:
self.logger.info('generate_insights-warning: no insights and no more than 25 articles so use article title as insights')
for key, value in cache.items():
if value['title']:
text_for_insight = text_translate([value['title']], logger=self.logger)
if is_chinese(value['title']):
text_for_insight = value['title']
else:
text_for_insight = text_translate([value['title']], logger=self.logger)
if text_for_insight:
insight_id = self.pb.add(collection_name='insights',
body={'content': text_for_insight[0], 'articles': [key]})
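
The new branch above skips translation when the article title is already Chinese. The is_chinese helper itself is not part of this diff; a plausible minimal heuristic for such a check might look like the following (an assumption, not the project's implementation).

def is_chinese(text: str) -> bool:
    # Assumed heuristic: call the string Chinese when more than half of its
    # non-space characters fall in the CJK Unified Ideographs block.
    chars = [ch for ch in text if not ch.isspace()]
    if not chars:
        return False
    han = sum(1 for ch in chars if '\u4e00' <= ch <= '\u9fff')
    return han / len(chars) > 0.5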

View File

@@ -9,6 +9,6 @@ bad_sample = 黑客组织活动最近频发
report_type = 网络安全情报
[sites]
site1 = https://www.welivesecurity.com/en/
site2 = https://www.scmagazine.com/
site3 = https://business.sohu.com/
site3 = https://www.hackread.com/
site2 = http://sh.people.com.cn/
site1 = https://www.xuexi.cn/
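
The [sites] keys above are plain INI entries whose values are the crawler's start URLs. Assuming the project loads them with Python's standard configparser (an assumption about the consuming code; the filename is illustrative), collecting the URLs would look roughly like this:

from configparser import ConfigParser

config = ConfigParser()
config.read('config.ini', encoding='utf-8')
# every site1/site2/... key under [sites] maps to one start URL
urls = [url for _key, url in config.items('sites')] if config.has_section('sites') else []
print(urls)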