wiseflow/dashboard/simple_crawler.py

61 lines
2.1 KiB
Python
Raw Normal View History

2024-04-07 09:37:47 +08:00
from gne import GeneralNewsExtractor
2024-04-09 11:38:51 +08:00
import httpx
2024-04-07 09:37:47 +08:00
from bs4 import BeautifulSoup
from datetime import datetime
from pathlib import Path
2024-06-13 21:08:58 +08:00
from utils.general_utils import extract_and_convert_dates
2024-04-08 17:58:29 +08:00
import chardet
2024-04-29 23:06:17 +08:00
2024-04-07 09:37:47 +08:00
# One GNE extractor instance shared by every call to simple_crawler().
extractor = GeneralNewsExtractor()

# Request headers passed to httpx on each fetch; only a browser-style
# User-Agent string is set.
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0',
}
2024-04-29 23:06:17 +08:00
def simple_crawler(url: str | Path, logger) -> (int, dict):
2024-04-07 09:37:47 +08:00
"""
2024-06-13 21:08:58 +08:00
Return article information dict and flag, negative number is error, 0 is no result, 11 is success
2024-04-07 09:37:47 +08:00
"""
try:
2024-04-09 11:38:51 +08:00
with httpx.Client() as client:
response = client.get(url, headers=header, timeout=30)
rawdata = response.content
encoding = chardet.detect(rawdata)['encoding']
2024-04-08 17:58:29 +08:00
text = rawdata.decode(encoding)
2024-04-17 14:02:25 +08:00
result = extractor.extract(text)
2024-04-09 11:38:51 +08:00
except Exception as e:
2024-04-17 14:02:25 +08:00
logger.warning(f"cannot get content from {url}\n{e}")
2024-04-09 11:38:51 +08:00
return -7, {}
2024-04-08 17:58:29 +08:00
if not result:
2024-04-09 11:38:51 +08:00
logger.error(f"gne cannot extract {url}")
2024-04-07 09:37:47 +08:00
return 0, {}
2024-04-09 11:38:51 +08:00
if len(result['title']) < 4 or len(result['content']) < 24:
logger.info(f"{result} not valid")
2024-04-08 17:58:29 +08:00
return 0, {}
2024-04-09 11:38:51 +08:00
if result['title'].startswith('服务器错误') or result['title'].startswith('您访问的页面') or result['title'].startswith('403')\
or result['content'].startswith('This website uses cookies') or result['title'].startswith('出错了'):
logger.warning(f"can not get {url} from the Internet")
2024-04-07 09:37:47 +08:00
return -7, {}
2024-04-09 11:38:51 +08:00
date_str = extract_and_convert_dates(result['publish_time'])
2024-04-07 09:37:47 +08:00
if date_str:
2024-04-09 11:38:51 +08:00
result['publish_time'] = date_str
2024-04-07 09:37:47 +08:00
else:
2024-04-09 11:38:51 +08:00
result['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")
2024-04-07 09:37:47 +08:00
soup = BeautifulSoup(text, "html.parser")
try:
meta_description = soup.find("meta", {"name": "description"})
if meta_description:
2024-04-17 18:28:10 +08:00
result['abstract'] = meta_description["content"].strip()
2024-04-07 09:37:47 +08:00
else:
result['abstract'] = ''
2024-04-09 11:38:51 +08:00
except Exception:
2024-04-07 09:37:47 +08:00
result['abstract'] = ''
result['url'] = str(url)
return 11, result