mirror of
https://github.com/TeamWiseFlow/wiseflow.git
synced 2025-01-23 02:20:20 +08:00
110 lines
4.4 KiB
Python
110 lines
4.4 KiB
Python
import httpx
|
|
from bs4 import BeautifulSoup
|
|
from datetime import datetime
|
|
import re
|
|
|
|
|
|
header = {
|
|
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}
|
|
|
|
|
|
def mp_crawler(url: str, logger) -> (int, dict):
|
|
if not url.startswith('https://mp.weixin.qq.com') and not url.startswith('http://mp.weixin.qq.com'):
|
|
logger.warning(f'{url} is not a mp url, you should not use this function')
|
|
return -5, {}
|
|
|
|
url = url.replace("http://", "https://", 1)
|
|
|
|
try:
|
|
with httpx.Client() as client:
|
|
response = client.get(url, headers=header, timeout=30)
|
|
except Exception as e:
|
|
logger.warning(f"cannot get content from {url}\n{e}")
|
|
return -7, {}
|
|
|
|
soup = BeautifulSoup(response.text, 'html.parser')
|
|
|
|
# Get the original release date first
|
|
pattern = r"var createTime = '(\d{4}-\d{2}-\d{2}) \d{2}:\d{2}'"
|
|
match = re.search(pattern, response.text)
|
|
|
|
if match:
|
|
date_only = match.group(1)
|
|
publish_time = date_only.replace('-', '')
|
|
else:
|
|
publish_time = datetime.strftime(datetime.today(), "%Y%m%d")
|
|
|
|
# Get description content from < meta > tag
|
|
try:
|
|
meta_description = soup.find('meta', attrs={'name': 'description'})
|
|
summary = meta_description['content'].strip() if meta_description else ''
|
|
card_info = soup.find('div', id='img-content')
|
|
# Parse the required content from the < div > tag
|
|
rich_media_title = soup.find('h1', id='activity-name').text.strip() \
|
|
if soup.find('h1', id='activity-name') \
|
|
else soup.find('h1', class_='rich_media_title').text.strip()
|
|
profile_nickname = card_info.find('strong', class_='profile_nickname').text.strip() \
|
|
if card_info \
|
|
else soup.find('div', class_='wx_follow_nickname').text.strip()
|
|
except Exception as e:
|
|
logger.warning(f"not mp format: {url}\n{e}")
|
|
return -7, {}
|
|
|
|
if not rich_media_title or not profile_nickname:
|
|
logger.warning(f"failed to analysis {url}, no title or profile_nickname")
|
|
# For mp.weixin.qq.com types, mp_crawler won't work, and most likely neither will the other two
|
|
return -7, {}
|
|
|
|
# Parse text and image links within the content interval
|
|
# Todo This scheme is compatible with picture sharing MP articles, but the pictures of the content cannot be obtained,
|
|
# because the structure of this part is completely different, and a separate analysis scheme needs to be written
|
|
# (but the proportion of this type of article is not high).
|
|
texts = []
|
|
images = set()
|
|
content_area = soup.find('div', id='js_content')
|
|
if content_area:
|
|
# 提取文本
|
|
for section in content_area.find_all(['section', 'p'], recursive=False): # 遍历顶级section
|
|
text = section.get_text(separator=' ', strip=True)
|
|
if text and text not in texts:
|
|
texts.append(text)
|
|
|
|
for img in content_area.find_all('img', class_='rich_pages wxw-img'):
|
|
img_src = img.get('data-src') or img.get('src')
|
|
if img_src:
|
|
images.add(img_src)
|
|
cleaned_texts = [t for t in texts if t.strip()]
|
|
content = '\n'.join(cleaned_texts)
|
|
else:
|
|
logger.warning(f"failed to analysis contents {url}")
|
|
return 0, {}
|
|
if content:
|
|
content = f"({profile_nickname} 文章){content}"
|
|
else:
|
|
# If the content does not have it, but the summary has it, it means that it is an mp of the picture sharing type.
|
|
# At this time, you can use the summary as the content.
|
|
content = f"({profile_nickname} 文章){summary}"
|
|
|
|
# Get links to images in meta property = "og: image" and meta property = "twitter: image"
|
|
og_image = soup.find('meta', property='og:image')
|
|
twitter_image = soup.find('meta', property='twitter:image')
|
|
if og_image:
|
|
images.add(og_image['content'])
|
|
if twitter_image:
|
|
images.add(twitter_image['content'])
|
|
|
|
if rich_media_title == summary or not summary:
|
|
abstract = ''
|
|
else:
|
|
abstract = f"({profile_nickname} 文章){rich_media_title}——{summary}"
|
|
|
|
return 11, {
|
|
'title': rich_media_title,
|
|
'author': profile_nickname,
|
|
'publish_time': publish_time,
|
|
'abstract': abstract,
|
|
'content': content,
|
|
'images': list(images),
|
|
'url': url,
|
|
}
|