mirror of
https://github.com/TeamWiseFlow/wiseflow.git
synced 2025-02-02 18:28:46 +08:00
feat 解析微信文章目录 (#55)
* feat: 解析微信文章目录 (parse WeChat article album/directory pages)
* fix: mp_crawler should return https URLs
This commit is contained in:
parent
bdc3cbf85e
commit
c309cf7afe
@ -1,5 +1,6 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from typing import Union
|
||||||
import httpx
|
import httpx
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
@ -11,7 +12,7 @@ header = {
|
|||||||
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}
|
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}
|
||||||
|
|
||||||
|
|
||||||
async def mp_crawler(url: str, logger) -> (int, dict):
|
async def mp_crawler(url: str, logger) -> tuple[int, Union[set, dict]]:
|
||||||
if not url.startswith('https://mp.weixin.qq.com') and not url.startswith('http://mp.weixin.qq.com'):
|
if not url.startswith('https://mp.weixin.qq.com') and not url.startswith('http://mp.weixin.qq.com'):
|
||||||
logger.warning(f'{url} is not a mp url, you should not use this function')
|
logger.warning(f'{url} is not a mp url, you should not use this function')
|
||||||
return -5, {}
|
return -5, {}
|
||||||
@ -34,6 +35,11 @@ async def mp_crawler(url: str, logger) -> (int, dict):
|
|||||||
|
|
||||||
soup = BeautifulSoup(response.text, 'html.parser')
|
soup = BeautifulSoup(response.text, 'html.parser')
|
||||||
|
|
||||||
|
if url.startswith('https://mp.weixin.qq.com/mp/appmsgalbum'):
|
||||||
|
# 文章目录
|
||||||
|
urls = {li.attrs['data-link'].replace("http://", "https://", 1) for li in soup.find_all('li', class_='album__list-item')}
|
||||||
|
return 1, set(urls)
|
||||||
|
|
||||||
# Get the original release date first
|
# Get the original release date first
|
||||||
pattern = r"var createTime = '(\d{4}-\d{2}-\d{2}) \d{2}:\d{2}'"
|
pattern = r"var createTime = '(\d{4}-\d{2}-\d{2}) \d{2}:\d{2}'"
|
||||||
match = re.search(pattern, response.text)
|
match = re.search(pattern, response.text)
|
||||||
|
Loading…
Reference in New Issue
Block a user