# wiseflow/dashboard/simple_crawler.py

from gne import GeneralNewsExtractor
import httpx
from bs4 import BeautifulSoup
from datetime import datetime
from pathlib import Path
from utils.general_utils import extract_and_convert_dates
import chardet


# Single GNE extractor instance created at import time and shared by every crawl.
extractor = GeneralNewsExtractor()

# Request headers sent with every fetch; only a browser-style User-Agent is set.
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0',
}


def simple_crawler(url: str | Path, logger) -> (int, dict):
    """
    Return article information dict and flag, negative number is error, 0 is no result, 11 is success
    """
    try:
        with httpx.Client() as client:
            response = client.get(url, headers=header, timeout=30)
            rawdata = response.content
            encoding = chardet.detect(rawdata)['encoding']
            text = rawdata.decode(encoding)
        result = extractor.extract(text)
    except Exception as e:
        logger.warning(f"cannot get content from {url}\n{e}")
        return -7, {}

    if not result:
        logger.error(f"gne cannot extract {url}")
        return 0, {}

    if len(result['title']) < 4 or len(result['content']) < 24:
        logger.info(f"{result} not valid")
        return 0, {}

    if result['title'].startswith('服务器错误') or result['title'].startswith('您访问的页面') or result['title'].startswith('403')\
            or result['content'].startswith('This website uses cookies') or result['title'].startswith('出错了'):
        logger.warning(f"can not get {url} from the Internet")
        return -7, {}

    date_str = extract_and_convert_dates(result['publish_time'])
    if date_str:
        result['publish_time'] = date_str
    else:
        result['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")

    soup = BeautifulSoup(text, "html.parser")
    try:
        meta_description = soup.find("meta", {"name": "description"})
        if meta_description:
            result['abstract'] = meta_description["content"].strip()
        else:
            result['abstract'] = ''
    except Exception:
        result['abstract'] = ''

    result['url'] = str(url)
    return 11, result
initial commit 2024-04-07 09:37:47 +08:00			`from gne import GeneralNewsExtractor`
improve the crawler 2024-04-09 11:38:51 +08:00			`import httpx`
initial commit 2024-04-07 09:37:47 +08:00			`from bs4 import BeautifulSoup`
			`from datetime import datetime`
			`from pathlib import Path`
web dashboard 2024-06-13 21:08:58 +08:00			`from utils.general_utils import extract_and_convert_dates`
more strictly crawler filter 2024-04-08 17:58:29 +08:00			`import chardet`
use new logger 2024-04-29 23:06:17 +08:00
initial commit 2024-04-07 09:37:47 +08:00
			`extractor = GeneralNewsExtractor()`
			`header = {`
			`'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}`


use new logger 2024-04-29 23:06:17 +08:00			`def simple_crawler(url: str \| Path, logger) -> (int, dict):`
initial commit 2024-04-07 09:37:47 +08:00			`"""`
web dashboard 2024-06-13 21:08:58 +08:00			`Return article information dict and flag, negative number is error, 0 is no result, 11 is success`
initial commit 2024-04-07 09:37:47 +08:00			`"""`
			`try:`
improve the crawler 2024-04-09 11:38:51 +08:00			`with httpx.Client() as client:`
			`response = client.get(url, headers=header, timeout=30)`
			`rawdata = response.content`
			`encoding = chardet.detect(rawdata)['encoding']`
more strictly crawler filter 2024-04-08 17:58:29 +08:00			`text = rawdata.decode(encoding)`
use pb for config 2024-04-17 14:02:25 +08:00			`result = extractor.extract(text)`
improve the crawler 2024-04-09 11:38:51 +08:00			`except Exception as e:`
use pb for config 2024-04-17 14:02:25 +08:00			`logger.warning(f"cannot get content from {url}\n{e}")`
improve the crawler 2024-04-09 11:38:51 +08:00			`return -7, {}`
more strictly crawler filter 2024-04-08 17:58:29 +08:00
			`if not result:`
improve the crawler 2024-04-09 11:38:51 +08:00			`logger.error(f"gne cannot extract {url}")`
initial commit 2024-04-07 09:37:47 +08:00			`return 0, {}`

improve the crawler 2024-04-09 11:38:51 +08:00			`if len(result['title']) < 4 or len(result['content']) < 24:`
			`logger.info(f"{result} not valid")`
more strictly crawler filter 2024-04-08 17:58:29 +08:00			`return 0, {}`

improve the crawler 2024-04-09 11:38:51 +08:00			`if result['title'].startswith('服务器错误') or result['title'].startswith('您访问的页面') or result['title'].startswith('403')\`
			`or result['content'].startswith('This website uses cookies') or result['title'].startswith('出错了'):`
			`logger.warning(f"can not get {url} from the Internet")`
initial commit 2024-04-07 09:37:47 +08:00			`return -7, {}`

improve the crawler 2024-04-09 11:38:51 +08:00			`date_str = extract_and_convert_dates(result['publish_time'])`
initial commit 2024-04-07 09:37:47 +08:00			`if date_str:`
improve the crawler 2024-04-09 11:38:51 +08:00			`result['publish_time'] = date_str`
initial commit 2024-04-07 09:37:47 +08:00			`else:`
improve the crawler 2024-04-09 11:38:51 +08:00			`result['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")`
initial commit 2024-04-07 09:37:47 +08:00
			`soup = BeautifulSoup(text, "html.parser")`
			`try:`
			`meta_description = soup.find("meta", {"name": "description"})`
			`if meta_description:`
improve sth 2024-04-17 18:28:10 +08:00			`result['abstract'] = meta_description["content"].strip()`
initial commit 2024-04-07 09:37:47 +08:00			`else:`
			`result['abstract'] = ''`
improve the crawler 2024-04-09 11:38:51 +08:00			`except Exception:`
initial commit 2024-04-07 09:37:47 +08:00			`result['abstract'] = ''`

			`result['url'] = str(url)`
			`return 11, result`