# wiseflow/dashboard/mp_crawler.py
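"""Crawler for WeChat Official Account (mp.weixin.qq.com) articles.

Fetches a single article page, extracts the publish date, title, author
nickname, summary, body text and image links, and returns them as a dict
for the wiseflow dashboard.
"""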
import httpx
from bs4 import BeautifulSoup
from datetime import datetime
import re

header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'
}

def mp_crawler(url: str, logger) -> tuple[int, dict]:
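    """Crawl a single WeChat Official Account (mp.weixin.qq.com) article.

    Returns a (flag, result) tuple:
      -5  url is not an mp.weixin.qq.com link
      -7  the page could not be fetched or does not look like an mp article
       0  the article body (div#js_content) could not be located
      11  success; result holds title, author, publish_time (YYYYMMDD),
          abstract, content, images and url
    """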
    if not url.startswith('https://mp.weixin.qq.com') and not url.startswith('http://mp.weixin.qq.com'):
        logger.warning(f'{url} is not an mp url, you should not use this function')
        return -5, {}

    url = url.replace("http://", "https://", 1)
    try:
        with httpx.Client() as client:
            response = client.get(url, headers=header, timeout=30)
    except Exception as e:
        logger.warning(f"cannot get content from {url}\n{e}")
        return -7, {}
    soup = BeautifulSoup(response.text, 'html.parser')

    # Get the original release date first
    pattern = r"var createTime = '(\d{4}-\d{2}-\d{2}) \d{2}:\d{2}'"
    match = re.search(pattern, response.text)
    if match:
        date_only = match.group(1)
        publish_time = date_only.replace('-', '')
    else:
        publish_time = datetime.strftime(datetime.today(), "%Y%m%d")
    # Get description content from the <meta> tag
    try:
        meta_description = soup.find('meta', attrs={'name': 'description'})
        summary = meta_description['content'].strip() if meta_description else ''
        card_info = soup.find('div', id='img-content')
        # Parse the required content from the <div> tag
        rich_media_title = soup.find('h1', id='activity-name').text.strip() \
            if soup.find('h1', id='activity-name') \
            else soup.find('h1', class_='rich_media_title').text.strip()
        profile_nickname = card_info.find('strong', class_='profile_nickname').text.strip() \
            if card_info \
            else soup.find('div', class_='wx_follow_nickname').text.strip()
    except Exception as e:
        logger.warning(f"not mp format: {url}\n{e}")
        return -7, {}
    if not rich_media_title or not profile_nickname:
        logger.warning(f"failed to analyze {url}: no title or profile_nickname")
        # If mp_crawler cannot parse an mp.weixin.qq.com page, the other two crawlers most likely cannot either
        return -7, {}
    # Parse text and image links within the content area
    # TODO: this scheme handles picture-sharing mp articles, but their content images cannot be
    #  extracted, because that part of the page has a completely different structure and needs a
    #  separate parsing scheme (the proportion of such articles is low, though).
    texts = []
    images = set()
    content_area = soup.find('div', id='js_content')
    if content_area:
        # Extract text from the top-level <section> and <p> nodes
        for section in content_area.find_all(['section', 'p'], recursive=False):
            text = section.get_text(separator=' ', strip=True)
            if text and text not in texts:
                texts.append(text)
        # Collect image links; prefer the data-src attribute (used for lazy loading) over src
        for img in content_area.find_all('img', class_='rich_pages wxw-img'):
            img_src = img.get('data-src') or img.get('src')
            if img_src:
                images.add(img_src)
        cleaned_texts = [t for t in texts if t.strip()]
        content = '\n'.join(cleaned_texts)
    else:
        logger.warning(f"failed to analyze contents of {url}")
        return 0, {}
    # Prefix the content with the account nickname ("文章" means "article")
    if content:
        content = f"({profile_nickname} 文章){content}"
    else:
        # If there is no body content but there is a summary, this is a picture-sharing type of
        # mp article; in that case use the summary as the content.
        content = f"({profile_nickname} 文章){summary}"
    # Get image links from the <meta property="og:image"> and <meta property="twitter:image"> tags
    og_image = soup.find('meta', property='og:image')
    twitter_image = soup.find('meta', property='twitter:image')
    if og_image:
        images.add(og_image['content'])
    if twitter_image:
        images.add(twitter_image['content'])
    if rich_media_title == summary or not summary:
        abstract = ''
    else:
        abstract = f"({profile_nickname} 文章){rich_media_title}——{summary}"

    return 11, {
        'title': rich_media_title,
        'author': profile_nickname,
        'publish_time': publish_time,
        'abstract': abstract,
        'content': content,
        'images': list(images),
        'url': url,
    }
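

if __name__ == '__main__':
    # Minimal usage sketch: pass an mp.weixin.qq.com article URL on the command line,
    # crawl it with a stdlib logger, and print the status flag and parsed fields.
    import json
    import logging
    import sys

    logging.basicConfig(level=logging.INFO)
    test_logger = logging.getLogger('mp_crawler')

    if len(sys.argv) < 2:
        print('usage: python mp_crawler.py <mp.weixin.qq.com article url>')
        sys.exit(1)

    flag, article = mp_crawler(sys.argv[1], test_logger)
    print(f'flag: {flag}')
    print(json.dumps(article, ensure_ascii=False, indent=2))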