# Mirror of https://github.com/TeamWiseFlow/wiseflow.git
# Synced 2025-01-23 02:20:20 +08:00 (55 lines, 2.0 KiB, Python)
import os
|
|
import json
|
|
import re
|
|
import sys
|
|
|
|
# Append the parent of this script's directory to sys.path so that modules
# living under the project root can be imported by the statements that follow.
_script_path = os.path.abspath(__file__)
current_dir = os.path.dirname(_script_path)
project_root = os.path.dirname(current_dir)  # parent directory of current_dir
sys.path.append(project_root)
|
|
|
|
from core.scrapers.mp_scraper import mp_scraper
|
|
|
|
def read_markdown_from_json_files(directory_path):
    """Write a simplified text dump of the markdown held in each JSON sample.

    Every ``*.json`` file in ``directory_path`` is loaded in sorted filename
    order. If the record's ``url`` starts with ``https://mp.weixin.qq.com``
    the whole record is handed to ``mp_scraper`` and ``result.content`` is
    used as the markdown; otherwise the record's ``markdown`` field is used.

    The markdown is then simplified: image tags ``![alt](src)`` become
    ``<img>``, links ``[text](url)`` become ``[url]``, the text is split on
    the literal ``'# '`` and the non-empty, stripped pieces are joined with a
    dashed separator line. The result is written to ``<json filename>.txt``
    in the current working directory. Records without markdown are skipped
    and produce no output file.

    Args:
        directory_path: Directory containing the JSON sample files.
    """
    # Compile once; both patterns are reused for every file.
    img_pattern = re.compile(r'!\[(.*?)\]\((.*?)\)')
    link_pattern = re.compile(r'\[(.*?)\]\((.*?)\)')

    json_files = sorted(f for f in os.listdir(directory_path) if f.endswith('.json'))
    for json_file in json_files:
        file_path = os.path.join(directory_path, json_file)
        print(f"\nProcessing file: {json_file}")
        print("-" * 50)

        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # A record may have no 'url' (or a null one); treat it as a regular
        # page instead of failing on None.startswith().
        url = data.get('url') or ''
        if url.startswith('https://mp.weixin.qq.com'):
            result = mp_scraper(data)
            markdown = result.content
        else:
            markdown = data.get('markdown')

        if not markdown:
            continue

        # Images must be replaced first: the link pattern would otherwise
        # match the "[alt](src)" tail of an image tag.
        markdown = img_pattern.sub('<img>', markdown)
        markdown = link_pattern.sub('[url]', markdown)

        # Splitting on '# ' breaks the text at every heading marker
        # (any run of '#' followed by a space).
        sections = [m.strip() for m in markdown.split('# ') if m.strip()]
        markdown = '\n----------------------------------\n'.join(sections)

        # Context manager guarantees the handle is closed even if write fails.
        with open(f'{json_file}.txt', 'w', encoding='utf-8') as record_file:
            record_file.write(markdown)
|
|
|
|
if __name__ == "__main__":
    # The sample pages are read from the "webpage_samples" directory located
    # next to this script; os.path.join keeps the path separator portable.
    samples_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "webpage_samples")
    read_markdown_from_json_files(samples_dir)
|