from gne import GeneralNewsExtractor
import httpx
from bs4 import BeautifulSoup
from datetime import datetime
from pathlib import Path
from general_utils import extract_and_convert_dates
import chardet
from get_logger import get_logger
import os


extractor = GeneralNewsExtractor()
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}

project_dir = os.environ.get("PROJECT_DIR", "")
if project_dir:
    # only create the directory when PROJECT_DIR is set; os.makedirs('') raises
    os.makedirs(project_dir, exist_ok=True)
logger = get_logger(name='simple_crawler', file=os.path.join(project_dir, 'simple_crawler.log'))


def simple_crawler(url: str | Path) -> tuple[int, dict]:
    """
    Fetch one URL and extract the article with gne.

    Returns a (flag, article_dict) tuple: a negative flag means an error,
    0 means no usable result, and 11 means success.
    """
    try:
        with httpx.Client() as client:
            # url may be a Path per the annotation; httpx expects a str
            response = client.get(str(url), headers=header, timeout=30)
            rawdata = response.content
            # chardet may fail to detect an encoding; fall back to utf-8
            encoding = chardet.detect(rawdata)['encoding'] or 'utf-8'
            text = rawdata.decode(encoding)
            result = extractor.extract(text)
    except Exception as e:
        logger.warning(f"cannot get content from {url}\n{e}")
        return -7, {}

    if not result:
        logger.error(f"gne cannot extract {url}")
        return 0, {}

    if len(result['title']) < 4 or len(result['content']) < 24:
        logger.info(f"{result} not valid")
        return 0, {}

    # drop common error pages; the Chinese literals match titles such as
    # "server error", "the page you visited", and "an error occurred"
    if result['title'].startswith('服务器错误') or result['title'].startswith('您访问的页面') or result['title'].startswith('403') \
            or result['content'].startswith('This website uses cookies') or result['title'].startswith('出错了'):
        logger.warning(f"cannot get {url} from the Internet")
        return -7, {}

    date_str = extract_and_convert_dates(result['publish_time'])
    if date_str:
        result['publish_time'] = date_str
    else:
        # no parsable publish date found; default to today in YYYYMMDD form
        result['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")

    soup = BeautifulSoup(text, "html.parser")
    try:
        meta_description = soup.find("meta", {"name": "description"})
        if meta_description:
            result['abstract'] = meta_description["content"]
        else:
            result['abstract'] = ''
    except Exception:
        # the meta tag may lack a content attribute; treat it as no abstract
        result['abstract'] = ''

    result['url'] = str(url)
    return 11, result
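

if __name__ == '__main__':
    # Minimal usage sketch, not part of the original module: calls simple_crawler
    # on a URL passed via argv. The fallback URL below is a placeholder assumption.
    import sys

    test_url = sys.argv[1] if len(sys.argv) > 1 else 'https://example.com/news/1.html'
    flag, article = simple_crawler(test_url)
    # flag semantics per the docstring: negative = error, 0 = no result, 11 = success
    if flag == 11:
        print(article['title'], article['publish_time'], article['url'])
    else:
        print(f'crawl failed with flag {flag}')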