from gne import GeneralNewsExtractor
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from pathlib import Path
import re


extractor = GeneralNewsExtractor()
# desktop-browser User-Agent so sites are less likely to serve a bot-blocked page
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}


def simple_crawler(url: str | Path, logger=None) -> tuple[int, dict]:
    """
    Return a flag and the article-info dict.
    Flag: negative means an error occurred, 0 means no result, 11 means success.
    """
    try:
        response = requests.get(url, headers=header, timeout=60)
    except Exception:
        if logger:
            logger.error(f"cannot connect {url}")
        else:
            print(f"cannot connect {url}")
        return -7, {}

    if response.status_code != 200:
        if logger:
            logger.error(f"cannot connect {url}")
        else:
            print(f"cannot connect {url}")
        return -7, {}

    text = response.text
    # gne extracts title, content and publish_time from the raw HTML
    result = extractor.extract(text)
    if not result or not result['title'] or not result['content']:
        if logger:
            logger.error(f"gne cannot extract {url}")
        else:
            print(f"gne cannot extract {url}")
        return 0, {}

    # titles like '服务器错误' ("server error") or '您访问的页面' ("the page you
    # visited...") mean we extracted a Chinese error page, not an article
    if result['title'].startswith('服务器错误') or result['title'].startswith('您访问的页面') or result['title'].startswith('403'):
        if logger:
            logger.warning(f"cannot get {url} from the Internet")
        else:
            print(f"cannot get {url} from the Internet")
        return -7, {}

    # normalize publish_time to YYYYMMDD, falling back to today's date
    date_str = re.findall(r"\d{4}-\d{2}-\d{2}", result['publish_time'])
    if date_str:
        result['publish_time'] = date_str[0].replace("-", "")
    else:
        date_str = re.findall(r"\d{8}", result['publish_time'])
        if date_str:
            result['publish_time'] = date_str[0]
        else:
            result['publish_time'] = datetime.today().strftime("%Y%m%d")

    # use the page's <meta name="description"> as the abstract when present
    soup = BeautifulSoup(text, "html.parser")
    try:
        meta_description = soup.find("meta", {"name": "description"})
        if meta_description:
            result['abstract'] = meta_description["content"]
        else:
            result['abstract'] = ''
    except Exception:
        result['abstract'] = ''

    result['url'] = str(url)
    return 11, result
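

# Minimal usage sketch: crawl a single URL from the command line and print the
# key fields on success. The fallback URL is a placeholder, not from the repo;
# requires the gne, requests and beautifulsoup4 packages.
if __name__ == '__main__':
    import sys

    test_url = sys.argv[1] if len(sys.argv) > 1 else 'https://example.com/news/article'
    flag, article = simple_crawler(test_url)
    if flag == 11:
        print(article['title'], article['publish_time'], article['url'])
    else:
        print(f'crawl failed with flag {flag}')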