from gne import GeneralNewsExtractor
import httpx
from bs4 import BeautifulSoup
from datetime import datetime
from pathlib import Path
from general_utils import extract_and_convert_dates
import chardet
from get_logger import get_logger
import os


extractor = GeneralNewsExtractor()
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}

project_dir = os.environ.get("PROJECT_DIR", "")
if project_dir:
    # only create the directory when PROJECT_DIR is set; os.makedirs('') raises
    os.makedirs(project_dir, exist_ok=True)
logger = get_logger(name='simple_crawler', file=os.path.join(project_dir, 'simple_crawler.log'))


def simple_crawler(url: str | Path) -> tuple[int, dict]:
    """
    Fetch one URL and extract the article with gne.

    Returns a (flag, article_dict) tuple: a negative flag means an error,
    0 means no usable result, and 11 means success.
    """
    try:
        with httpx.Client() as client:
            # url may be a Path per the annotation; httpx expects a str
            response = client.get(str(url), headers=header, timeout=30)
            rawdata = response.content
            # chardet may fail to detect an encoding; fall back to utf-8
            encoding = chardet.detect(rawdata)['encoding'] or 'utf-8'
            text = rawdata.decode(encoding)
            result = extractor.extract(text)
    except Exception as e:
        logger.warning(f"cannot get content from {url}\n{e}")
        return -7, {}

    if not result:
        logger.error(f"gne cannot extract {url}")
        return 0, {}

    if len(result['title']) < 4 or len(result['content']) < 24:
        logger.info(f"{result} not valid")
        return 0, {}

    # drop common error pages; the Chinese literals match titles such as
    # "server error", "the page you visited", and "an error occurred"
    if result['title'].startswith('服务器错误') or result['title'].startswith('您访问的页面') or result['title'].startswith('403') \
            or result['content'].startswith('This website uses cookies') or result['title'].startswith('出错了'):
        logger.warning(f"cannot get {url} from the Internet")
        return -7, {}

    date_str = extract_and_convert_dates(result['publish_time'])
    if date_str:
        result['publish_time'] = date_str
    else:
        # no parsable publish date found; default to today in YYYYMMDD form
        result['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")

    soup = BeautifulSoup(text, "html.parser")
    try:
        meta_description = soup.find("meta", {"name": "description"})
        if meta_description:
            result['abstract'] = meta_description["content"]
        else:
            result['abstract'] = ''
    except Exception:
        # the meta tag may lack a content attribute; treat it as no abstract
        result['abstract'] = ''

    result['url'] = str(url)
    return 11, result
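

if __name__ == '__main__':
    # Minimal usage sketch, not part of the original module: calls simple_crawler
    # on a URL passed via argv. The fallback URL below is a placeholder assumption.
    import sys

    test_url = sys.argv[1] if len(sys.argv) > 1 else 'https://example.com/news/1.html'
    flag, article = simple_crawler(test_url)
    # flag semantics per the docstring: negative = error, 0 = no result, 11 = success
    if flag == 11:
        print(article['title'], article['publish_time'], article['url'])
    else:
        print(f'crawl failed with flag {flag}')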