# wiseflow/dashboard/mp_crawler.py
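"""Crawler for WeChat Official Account (mp.weixin.qq.com) articles.

Fetches a single article page, extracts the publish date, title, author
nickname, summary, body text and image links, and returns them as a dict
for the wiseflow dashboard.
"""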
import httpx
from bs4 import BeautifulSoup
from datetime import datetime
import re

header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'
}

def mp_crawler(url: str, logger) -> tuple[int, dict]:
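    """Crawl a single WeChat Official Account (mp.weixin.qq.com) article.

    Returns a (flag, result) tuple:
      -5  url is not an mp.weixin.qq.com link
      -7  the page could not be fetched or does not look like an mp article
       0  the article body (div#js_content) could not be located
      11  success; result holds title, author, publish_time (YYYYMMDD),
          abstract, content, images and url
    """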
    if not url.startswith('https://mp.weixin.qq.com') and not url.startswith('http://mp.weixin.qq.com'):
        logger.warning(f'{url} is not an mp url, you should not use this function')
        return -5, {}

    url = url.replace("http://", "https://", 1)
    try:
        with httpx.Client() as client:
            response = client.get(url, headers=header, timeout=30)
    except Exception as e:
        logger.warning(f"cannot get content from {url}\n{e}")
        return -7, {}
    soup = BeautifulSoup(response.text, 'html.parser')

    # Get the original release date first
    pattern = r"var createTime = '(\d{4}-\d{2}-\d{2}) \d{2}:\d{2}'"
    match = re.search(pattern, response.text)
    if match:
        date_only = match.group(1)
        publish_time = date_only.replace('-', '')
    else:
        publish_time = datetime.strftime(datetime.today(), "%Y%m%d")
    # Get description content from the <meta> tag
    try:
        meta_description = soup.find('meta', attrs={'name': 'description'})
        summary = meta_description['content'].strip() if meta_description else ''
        card_info = soup.find('div', id='img-content')
        # Parse the required content from the <div> tag
        rich_media_title = soup.find('h1', id='activity-name').text.strip() \
            if soup.find('h1', id='activity-name') \
            else soup.find('h1', class_='rich_media_title').text.strip()
        profile_nickname = card_info.find('strong', class_='profile_nickname').text.strip() \
            if card_info \
            else soup.find('div', class_='wx_follow_nickname').text.strip()
    except Exception as e:
        logger.warning(f"not mp format: {url}\n{e}")
        return -7, {}
    if not rich_media_title or not profile_nickname:
        logger.warning(f"failed to analyze {url}: no title or profile_nickname")
        # If mp_crawler cannot parse an mp.weixin.qq.com page, the other two crawlers most likely cannot either
        return -7, {}
    # Parse text and image links within the content area
    # TODO: this scheme handles picture-sharing mp articles, but their content images cannot be
    #  extracted, because that part of the page has a completely different structure and needs a
    #  separate parsing scheme (the proportion of such articles is low, though).
    texts = []
    images = set()
    content_area = soup.find('div', id='js_content')
    if content_area:
        # Extract text from the top-level <section> and <p> nodes
        for section in content_area.find_all(['section', 'p'], recursive=False):
            text = section.get_text(separator=' ', strip=True)
            if text and text not in texts:
                texts.append(text)
        # Collect image links; prefer the data-src attribute (used for lazy loading) over src
        for img in content_area.find_all('img', class_='rich_pages wxw-img'):
            img_src = img.get('data-src') or img.get('src')
            if img_src:
                images.add(img_src)
        cleaned_texts = [t for t in texts if t.strip()]
        content = '\n'.join(cleaned_texts)
    else:
        logger.warning(f"failed to analyze contents of {url}")
        return 0, {}
    # Prefix the content with the account nickname ("文章" means "article")
    if content:
        content = f"({profile_nickname} 文章){content}"
    else:
        # If there is no body content but there is a summary, this is a picture-sharing type of
        # mp article; in that case use the summary as the content.
        content = f"({profile_nickname} 文章){summary}"
    # Get image links from the <meta property="og:image"> and <meta property="twitter:image"> tags
    og_image = soup.find('meta', property='og:image')
    twitter_image = soup.find('meta', property='twitter:image')
    if og_image:
        images.add(og_image['content'])
    if twitter_image:
        images.add(twitter_image['content'])
    if rich_media_title == summary or not summary:
        abstract = ''
    else:
        abstract = f"({profile_nickname} 文章){rich_media_title}——{summary}"

    return 11, {
        'title': rich_media_title,
        'author': profile_nickname,
        'publish_time': publish_time,
        'abstract': abstract,
        'content': content,
        'images': list(images),
        'url': url,
    }
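

if __name__ == '__main__':
    # Minimal usage sketch: pass an mp.weixin.qq.com article URL on the command line,
    # crawl it with a stdlib logger, and print the status flag and parsed fields.
    import json
    import logging
    import sys

    logging.basicConfig(level=logging.INFO)
    test_logger = logging.getLogger('mp_crawler')

    if len(sys.argv) < 2:
        print('usage: python mp_crawler.py <mp.weixin.qq.com article url>')
        sys.exit(1)

    flag, article = mp_crawler(sys.argv[1], test_logger)
    print(f'flag: {flag}')
    print(json.dumps(article, ensure_ascii=False, indent=2))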