# wiseflow/client/backend/scrapers/simple_crawler.py
from gne import GeneralNewsExtractor
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from pathlib import Path
import re
import chardet

# Shared gne extractor instance: pulls title/content/publish_time out of raw HTML.
extractor = GeneralNewsExtractor()
# Desktop-browser User-Agent so news sites serve the full HTML page.
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}
def simple_crawler(url: str | Path, logger=None) -> tuple[int, dict]:
    """Fetch *url* and extract an article from it with gne.

    Returns ``(flag, article)`` where the flag keeps the original contract:
      * negative (-7): connection/HTTP failure, or the page is a server
        error page masquerading as content
      * 0: fetched but unusable (undetected/failed decoding, gne extraction
        failure, or title/content too short to be a real article)
      * 11: success; ``article`` contains title, content, publish_time
        (normalized to YYYYMMDD), abstract and url

    ``logger`` is optional; without it, diagnostics fall back to print().
    """
    try:
        # BUG FIX: the second positional argument of requests.get() is
        # `params`, not `headers` — the User-Agent dict was never sent.
        response = requests.get(url, headers=header, timeout=60)
    except requests.RequestException:
        if logger:
            logger.error(f"cannot connect {url}")
        else:
            print(f"cannot connect {url}")
        return -7, {}

    if response.status_code != 200:
        if logger:
            logger.error(f"cannot connect {url}")
        else:
            print(f"cannot connect {url}")
        return -7, {}

    rawdata = response.content
    encoding = chardet.detect(rawdata)['encoding']
    if encoding is None:
        if logger:
            logger.error(f"{url} undetected coding, aborting")
        else:
            print(f"{url} undetected coding, aborting")
        return 0, {}
    try:
        # Decode with whatever chardet detected. (The original accepted only
        # utf-8, which rejected decodable gbk/gb2312 Chinese news pages.)
        text = rawdata.decode(encoding)
    except (UnicodeDecodeError, LookupError):
        if logger:
            logger.error(f"{url} decode error, aborting")
        else:
            print(f"{url} decode error, aborting")
        return 0, {}

    result = extractor.extract(text)
    if not result:
        if logger:
            logger.error(f"gne cannot extract {url}")
        else:
            print(f"gne cannot extract {url}")
        return 0, {}

    # Heuristic sanity check: a very short title/content is almost certainly
    # boilerplate rather than a real article.
    if len(result['title']) < 5 or len(result['content']) < 24:
        if logger:
            logger.warning(f"{result} not valid")
        else:
            print(f"{result} not valid")
        return 0, {}

    # Error pages often extract "successfully" with a server-error title.
    if result['title'].startswith(('服务器错误', '您访问的页面', '403')):
        if logger:
            logger.warning(f"can not get {url} from the Internet")
        else:
            print(f"can not get {url} from the Internet")
        return -7, {}

    # Normalize publish_time to YYYYMMDD, trying the formats gne emits in
    # order: YYYY-MM-DD, YYYY.MM.DD, bare YYYYMMDD; fall back to today.
    date_str = re.findall(r"\d{4}-\d{2}-\d{2}", result['publish_time'])
    if date_str:
        result['publish_time'] = date_str[0].replace("-", "")
    else:
        date_str = re.findall(r"\d{4}\.\d{2}\.\d{2}", result['publish_time'])
        if date_str:
            result['publish_time'] = date_str[0].replace(".", "")
        else:
            date_str = re.findall(r"\d{8}", result['publish_time'])
            if date_str:
                result['publish_time'] = date_str[0]
            else:
                result['publish_time'] = datetime.today().strftime("%Y%m%d")

    # Prefer the page's own <meta name="description"> as the abstract.
    soup = BeautifulSoup(text, "html.parser")
    meta_description = soup.find("meta", {"name": "description"})
    try:
        result['abstract'] = meta_description["content"] if meta_description else ''
    except (KeyError, TypeError):
        # Tag present but without a content attribute, or an unexpected node.
        result['abstract'] = ''

    result['url'] = str(url)
    return 11, result