wiseflow/client/backend/scrapers/simple_crawler.py
2024-04-07 21:40:26 +08:00

72 lines
2.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from gne import GeneralNewsExtractor
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from pathlib import Path
import re
# Single extractor instance created at import time; simple_crawler calls
# its extract() method on each fetched page.
extractor = GeneralNewsExtractor()

# Desktop-browser User-Agent that simple_crawler hands to requests.get.
header = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0",
}
def _report(logger, level: str, message: str) -> None:
    """Emit *message* via ``logger.<level>`` when a logger is given, else print it."""
    if logger:
        getattr(logger, level)(message)
    else:
        print(message)


def simple_crawler(url: str | Path, logger=None) -> tuple[int, dict]:
    """Fetch *url* and extract the article it contains.

    Args:
        url: Address of the page to download.
        logger: Optional logger-like object exposing ``error`` and
            ``warning``; when omitted, messages are printed instead.

    Returns:
        A ``(flag, article)`` tuple.

        * ``-7, {}`` -- the request failed, the status code was not 200, or
          the extracted title looks like a server error page.
        * ``0, {}``  -- the extractor produced no title or no content.
        * ``11, article`` -- success. ``article`` is the extractor's result
          dict with ``publish_time`` normalised to ``YYYYMMDD`` (today's date
          when none can be parsed), plus ``abstract`` (the page's
          ``<meta name="description">`` content, or ``''``) and ``url``.
    """
    try:
        # `headers` must be passed by keyword: the second positional argument
        # of requests.get is `params`, which would put the User-Agent into
        # the query string instead of the request headers.
        response = requests.get(url, headers=header, timeout=60)
    except Exception:
        # Best-effort crawler boundary: any fetch failure is reported as -7,
        # but KeyboardInterrupt/SystemExit are no longer swallowed.
        _report(logger, "error", f"cannot connect {url}")
        return -7, {}

    if response.status_code != 200:
        _report(logger, "error", f"cannot connect {url}")
        return -7, {}

    text = response.text
    result = extractor.extract(text)
    if not result or not result.get('title') or not result.get('content'):
        _report(logger, "error", f"gne cannot extract {url}")
        return 0, {}

    # Some sites answer 200 with an error page; detect it from the title.
    if result['title'].startswith(('服务器错误', '您访问的页面', '403')):
        _report(logger, "warning", f"can not get {url} from the Internet")
        return -7, {}

    # Normalise the publish time to YYYYMMDD: prefer a dashed date, then a
    # bare 8-digit run, and fall back to today's date.
    raw_time = result.get('publish_time') or ''
    dashed = re.search(r"\d{4}-\d{2}-\d{2}", raw_time)
    if dashed:
        result['publish_time'] = dashed.group(0).replace("-", "")
    else:
        compact = re.search(r"\d{4}\d{2}\d{2}", raw_time)
        if compact:
            result['publish_time'] = compact.group(0)
        else:
            result['publish_time'] = datetime.today().strftime("%Y%m%d")

    # Use the page's meta description as the abstract; a missing tag or a
    # tag without a `content` attribute yields an empty string.
    meta_description = BeautifulSoup(text, "html.parser").find("meta", {"name": "description"})
    if meta_description is not None:
        result['abstract'] = meta_description.get("content", "")
    else:
        result['abstract'] = ''

    result['url'] = str(url)
    return 11, result