From c309cf7afef93ce95dff591f651b52bebe98cf78 Mon Sep 17 00:00:00 2001 From: madizm Date: Mon, 2 Sep 2024 10:03:14 +0800 Subject: [PATCH] =?UTF-8?q?feat=20=E8=A7=A3=E6=9E=90=E5=BE=AE=E4=BF=A1?= =?UTF-8?q?=E6=96=87=E7=AB=A0=E7=9B=AE=E5=BD=95=20(#55)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat 解析微信文章目录 * fix mp_crawler should return https url --- core/scrapers/mp_crawler.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/core/scrapers/mp_crawler.py b/core/scrapers/mp_crawler.py index e8a0d13..c4b5b6c 100644 --- a/core/scrapers/mp_crawler.py +++ b/core/scrapers/mp_crawler.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- +from typing import Union import httpx from bs4 import BeautifulSoup from datetime import datetime @@ -11,7 +12,7 @@ header = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'} -async def mp_crawler(url: str, logger) -> (int, dict): +async def mp_crawler(url: str, logger) -> tuple[int, Union[set, dict]]: if not url.startswith('https://mp.weixin.qq.com') and not url.startswith('http://mp.weixin.qq.com'): logger.warning(f'{url} is not a mp url, you should not use this function') return -5, {} @@ -34,6 +35,11 @@ async def mp_crawler(url: str, logger) -> (int, dict): soup = BeautifulSoup(response.text, 'html.parser') + if url.startswith('https://mp.weixin.qq.com/mp/appmsgalbum'): + # 文章目录 + urls = {li.attrs['data-link'].replace("http://", "https://", 1) for li in soup.find_all('li', class_='album__list-item')} + return 1, set(urls) + # Get the original release date first pattern = r"var createTime = '(\d{4}-\d{2}-\d{2}) \d{2}:\d{2}'" match = re.search(pattern, response.text)