import os
import json
import re


def read_markdown_from_json_files(directory_path):
    """Read every JSON file in directory_path, split its 'markdown' field into
    sections, and print per-section statistics (newline and link density)."""
    # Get all JSON files in the directory
    json_files = [f for f in os.listdir(directory_path) if f.endswith('.json')]

    img_pattern = r'!\[(.*?)\]\((.*?)\)'
    link_pattern = r'\[(.*?)\]\((.*?)\)'

    # Process each JSON file
    for json_file in sorted(json_files):
        file_path = os.path.join(directory_path, json_file)
        print(f"\nProcessing file: {json_file}")
        print("-" * 50)

        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        markdown = data.get('markdown')
        if not markdown:
            continue

        # Find the longest run of consecutive newlines in the markdown text
        newline_runs = re.findall(r'\n+', markdown)
        max_newlines = max((len(run) for run in newline_runs), default=0)
        print(f"Longest consecutive newlines: {max_newlines}")

        # Split on the longest newline run; if there is no blank line to split
        # on, treat the whole text as a single section.
        if max_newlines < 2:
            sections = [markdown]
        else:
            sections = markdown.split('\n' * max_newlines)

        for i, section in enumerate(sections):
            print(f"Section {i + 1}:")
            print(section)
            print('\n\n')

            newline_count = section.count('\n')

            # Handle image markers ![alt](src):
            # rewrite each one to the new format §alt||src§
            matches = re.findall(img_pattern, section)
            for alt, src in matches:
                section = section.replace(f'![{alt}]({src})', f'§{alt}||{src}§')

            # Handle link markers [text](url):
            # strip every matched link from the section text
            matches = re.findall(link_pattern, section)
            for link_text, link_url in matches:
                section = section.replace(f'[{link_text}]({link_url})', '')

            if len(section) == 0:
                print("no text in section")
                continue

            print(f"newline/text ratio: {newline_count / len(section) * 100}")
            print(f"links/section ratio: {len(matches) / len(section) * 100}")
            print("-" * 50)


if __name__ == "__main__":
    # Path to the webpage_samples directory next to this script
    samples_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "webpage_samples")
    read_markdown_from_json_files(samples_dir)