feat: 解析微信文章目录 (parse WeChat article album/table of contents) (#55)

* feat: 解析微信文章目录 (parse WeChat article album/table of contents)

* fix: mp_crawler should return https URLs
This commit is contained in:
madizm 2024-09-02 10:03:14 +08:00 committed by GitHub
parent bdc3cbf85e
commit c309cf7afe
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
from typing import Union
import httpx
from bs4 import BeautifulSoup
from datetime import datetime
@ -11,7 +12,7 @@ header = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}
async def mp_crawler(url: str, logger) -> (int, dict):
async def mp_crawler(url: str, logger) -> tuple[int, Union[set, dict]]:
if not url.startswith('https://mp.weixin.qq.com') and not url.startswith('http://mp.weixin.qq.com'):
logger.warning(f'{url} is not a mp url, you should not use this function')
return -5, {}
@ -34,6 +35,11 @@ async def mp_crawler(url: str, logger) -> (int, dict):
soup = BeautifulSoup(response.text, 'html.parser')
if url.startswith('https://mp.weixin.qq.com/mp/appmsgalbum'):
# 文章目录
urls = {li.attrs['data-link'].replace("http://", "https://", 1) for li in soup.find_all('li', class_='album__list-item')}
return 1, set(urls)
# Get the original release date first
pattern = r"var createTime = '(\d{4}-\d{2}-\d{2}) \d{2}:\d{2}'"
match = re.search(pattern, response.text)