wiseflow/test/read_markdown.py

60 lines
2.5 KiB
Python
Raw Normal View History

2025-01-12 16:22:37 +08:00
import os
import json
import re
# Markdown image markup: ![alt](src)
_IMG_PATTERN = re.compile(r'!\[(.*?)\]\((.*?)\)')
# Markdown link markup: [text](url)
_LINK_PATTERN = re.compile(r'\[(.*?)\]\((.*?)\)')
# One or more consecutive newline characters
_NEWLINE_RUN_PATTERN = re.compile(r'\n+')


def _split_sections(markdown):
    """Return ``(longest_run, sections)`` for the given markdown text.

    ``longest_run`` is the length of the longest run of consecutive newlines
    (0 when the text has none). The text is split on exactly that many
    newlines; when the longest run is shorter than 2 the whole text is kept
    as a single section.
    """
    longest_run = max(
        (len(run) for run in _NEWLINE_RUN_PATTERN.findall(markdown)),
        default=0,
    )
    if longest_run < 2:
        return longest_run, [markdown]
    return longest_run, markdown.split('\n' * longest_run)


def _strip_markup(section):
    """Return ``(text, link_count)`` with images rewritten and links removed."""
    # Rewrite image markup ![alt](src) into the §alt||src§ form first, so the
    # link pattern below no longer matches the "[alt](src)" part of an image.
    text = _IMG_PATTERN.sub(r'§\1||\2§', section)
    # Drop every [text](url) link and remember how many were removed.
    text, link_count = _LINK_PATTERN.subn('', text)
    return text, link_count


def read_markdown_from_json_files(directory_path):
    """Print section statistics for the markdown stored in a directory's JSON files.

    Every ``*.json`` entry of ``directory_path`` is processed in sorted name
    order. For each file the value under the ``markdown`` key is split into
    sections on its longest run of consecutive newlines, and for each section
    the raw text, the newline/text ratio and the links/text ratio (both in
    percent, relative to the text left after markup removal) are printed.

    Files whose JSON top level is not an object, or whose ``markdown`` value
    is missing, empty or not a string, only get their header printed.

    Args:
        directory_path: Directory containing the JSON sample files.

    Returns:
        None. All results are written to standard output.

    Raises:
        OSError: Propagated from ``os.listdir`` / ``open`` (for example when
            the directory does not exist).
        json.JSONDecodeError: Propagated from ``json.load`` for a malformed
            file.
    """
    json_files = sorted(
        name for name in os.listdir(directory_path) if name.endswith('.json')
    )

    for json_file in json_files:
        file_path = os.path.join(directory_path, json_file)
        print(f"\nProcessing file: {json_file}")
        print("-" * 50)

        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Only a JSON object can carry a 'markdown' key; lists, numbers, etc.
        # are treated the same way as a file without markdown.
        markdown = data.get('markdown') if isinstance(data, dict) else None
        if not markdown or not isinstance(markdown, str):
            continue

        max_newlines, sections = _split_sections(markdown)
        print(f"Longest consecutive newlines: {max_newlines}")

        for i, section in enumerate(sections):
            print(f"Section {i + 1}:")
            print(section)
            print('\n\n')

            # The patterns never match across a newline, so counting before
            # the markup is stripped yields the same number as counting after.
            newline_count = section.count('\n')
            section, link_count = _strip_markup(section)

            if not section:
                # Nothing but links: avoid dividing by zero below.
                print("no text in section")
                continue

            print(f"newline/text ratio: {newline_count/len(section)*100}")
            print(f"links/section ratio: {link_count/len(section)*100}")
            print("-" * 50)
if __name__ == "__main__":
    # The sample pages are read from the "webpage_samples" directory located
    # next to this script; os.path.join keeps the separator platform-correct.
    samples_dir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "webpage_samples"
    )
    read_markdown_from_json_files(samples_dir)