wiseflow/test/read_markdown.py

import os
import json
import re

# Markdown image markup: ![alt](src)
_IMG_PATTERN = re.compile(r'!\[(.*?)\]\((.*?)\)')
# Markdown link markup: [text](url)
_LINK_PATTERN = re.compile(r'\[(.*?)\]\((.*?)\)')


def _split_sections(markdown):
    """Return ``(max_newlines, sections)`` for *markdown*.

    ``max_newlines`` is the length of the longest run of consecutive newline
    characters (0 when there is none). When that run is at least two newlines
    long, the text is split on exactly that many newlines; otherwise the whole
    text is returned as a single section.
    """
    max_newlines = max(
        (len(run) for run in re.findall(r'\n+', markdown)), default=0
    )
    if max_newlines < 2:
        return max_newlines, [markdown]
    return max_newlines, markdown.split('\n' * max_newlines)


def _strip_images_and_links(section):
    """Return ``(stripped_section, link_count)`` for one section.

    Every ``![alt](src)`` image is rewritten to ``§alt||src§`` first, so that
    the link pass which follows does not treat the image as a link. Every
    remaining ``[text](url)`` link is then removed from the text.
    ``link_count`` is the number of link matches found after the image rewrite.
    """
    for alt, src in _IMG_PATTERN.findall(section):
        section = section.replace(f'![{alt}]({src})', f'§{alt}||{src}§')

    links = _LINK_PATTERN.findall(section)
    for link_text, link_url in links:
        section = section.replace(f'[{link_text}]({link_url})', '')
    return section, len(links)


def read_markdown_from_json_files(directory_path):
    """Print per-section newline and link statistics for JSON markdown samples.

    Every file in *directory_path* whose name ends with ``.json`` is loaded,
    in sorted name order, and its ``'markdown'`` value (if present and
    non-empty) is split into sections on the longest run of newlines. For each
    section the raw text is printed, followed by two percentages:

    * newline/text ratio: newlines counted in the raw section, divided by the
      length of the section after images were rewritten and links removed;
    * links/section ratio: number of links found, divided by that same length.

    A section that is empty after stripping is reported as
    ``"no text in section"`` instead.

    Args:
        directory_path: Directory containing the ``.json`` sample files.

    Returns:
        None. All results are written to standard output.

    Errors raised by ``os.listdir``, ``open`` or ``json.load`` are not caught
    and propagate to the caller.
    """
    json_files = sorted(
        name for name in os.listdir(directory_path) if name.endswith('.json')
    )

    for json_file in json_files:
        file_path = os.path.join(directory_path, json_file)
        print(f"\nProcessing file: {json_file}")
        print("-" * 50)

        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        markdown = data.get('markdown')
        if not markdown:
            # Nothing to analyse for this file.
            continue

        max_newlines, sections = _split_sections(markdown)
        print(f"Longest consecutive newlines: {max_newlines}")

        for i, section in enumerate(sections):
            print(f"Section {i + 1}:")
            print(section)
            print('\n\n')

            # Newlines are counted on the raw section, before any markup is
            # rewritten or removed.
            newline_count = section.count('\n')
            section, link_count = _strip_images_and_links(section)

            if not section:
                print("no text in section")
                continue
            print(f"newline/text ratio: {newline_count/len(section)*100}")
            print(f"links/section ratio: {link_count/len(section)*100}")
            print("-" * 50)


if __name__ == "__main__":
    # Analyse the JSON samples in the "webpage_samples" directory located
    # next to this script; os.path.join picks the platform's path separator.
    samples_dir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "webpage_samples"
    )
    read_markdown_from_json_files(samples_dir)
modify scrapers 2025-01-12 16:22:37 +08:00			`import os`
			`import json`
			`import re`

			`def read_markdown_from_json_files(directory_path):`
			`# Get all JSON files in the directory`
			`json_files = [f for f in os.listdir(directory_path) if f.endswith('.json')]`
			`img_pattern = r'!\[(.*?)\]\((.*?)\)'`
			`link_pattern = r'\[(.*?)\]\((.*?)\)'`

			`# Process each JSON file`
			`for json_file in sorted(json_files):`
			`file_path = os.path.join(directory_path, json_file)`
			`print(f"\nProcessing file: {json_file}")`
			`print("-" * 50)`

			`with open(file_path, 'r', encoding='utf-8') as f:`
			`data = json.load(f)`

			`markdown = data.get('markdown')`
			`# Find the longest consecutive newlines in the markdown text`
			`if markdown:`
			`# Find all sequences of newlines and get their lengths`
			`max_newlines = max(len(match) for match in re.findall(r'\n+', markdown)) if re.search(r'\n+', markdown) else 0`
			`print(f"Longest consecutive newlines: {max_newlines}")`
			`if max_newlines < 2:`
			`sections = [markdown]`
			`else:`
			`sections = markdown.split('\n' * max_newlines)`

			`for i, section in enumerate(sections):`
			`print(f"Section {i + 1}:")`
			`print(section)`
			`print('\n\n')`
			`newline_count = section.count('\n')`
			`# 处理图片标记 ![alt](src)`
			`img_pattern = r'!\[(.*?)\]\((.*?)\)'`
			`matches = re.findall(img_pattern, section)`
			`for alt, src in matches:`
			`# 替换为新格式 §alt||src§`
			`section = section.replace(f'![{alt}]({src})', f'§{alt}||{src}§')`
			`# 处理链接标记 [text](url)`
			`matches = re.findall(link_pattern, section)`
			`# 从text中去掉所有matches部分`
			`for link_text, link_url in matches:`
			`section = section.replace(f'[{link_text}]({link_url})', '')`

			`if len(section) == 0:`
			`print("no text in section")`
			`continue`
			`print(f"newline/text ratio: {newline_count/len(section)*100}")`
			`print(f"links/section ratio: {len(matches)/len(section)*100}")`
			`print("-" * 50)`


			`if __name__ == "__main__":`
			`# Path to the webpage_samples directory`
			`samples_dir = os.path.dirname(os.path.abspath(__file__)) + "/webpage_samples"`
			`read_markdown_from_json_files(samples_dir)`