diff --git a/test/craw4ai_fetching.py b/test/craw4ai_fetching.py
index d266eba..1740315 100644
--- a/test/craw4ai_fetching.py
+++ b/test/craw4ai_fetching.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 import asyncio
-from crawl4ai import AsyncWebCrawler, CacheMode
+from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
 import os
 import hashlib
 import json
@@ -8,14 +9,30 @@ import json
 
 save_dir = 'webpage_samples'
 
 
+md_generator = DefaultMarkdownGenerator(
+    options={
+        "skip_internal_links": True,
+        "escape_html": True
+    }
+)
+
+crawler_config = CrawlerRunConfig(delay_before_return_html=2.0, markdown_generator=md_generator,
+                                  wait_until='commit', magic=True, scan_full_page=True,
+                                  cache_mode=CacheMode.BYPASS)
+
+
 async def main(sites: list):
     async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
         for site in sites:
-            result = await crawler.arun(url=site, delay_before_return_html=2.0, exclude_social_media_links=True,
-                                        magic=True,
-                                        scan_full_page=True,
-                                        remove_overlay_elements=True, cache_mode=CacheMode.BYPASS)
+            result = await crawler.arun(url=site, config=crawler_config)
+            if not result or not result.success:
+                print(f'{site} failed to crawl, skip')
+                continue
+            print('raw_markdown:')
+            print(result.markdown_v2.raw_markdown)
+            print('-' * 24)
+
             record_file = os.path.join(save_dir, f"{hashlib.sha256(site.encode()).hexdigest()[-6:]}.json")
             with open(record_file, 'w', encoding='utf-8') as f:
                 json.dump(result.model_dump(), f, indent=4, ensure_ascii=False)
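
Context for reviewers: the change consolidates the loose per-call keyword arguments to arun() into a single reusable CrawlerRunConfig built at module scope, attaches a DefaultMarkdownGenerator, and skips unsuccessful crawls before the JSON dump. A minimal standalone sketch of the same pattern follows; the URL and the fetch_one helper are illustrative assumptions, not part of the patched file:

# Minimal sketch of the config-reuse pattern introduced by this diff.
# The URL below is a placeholder; fetch_one() is an illustrative helper
# and does not exist in test/craw4ai_fetching.py.
import asyncio

from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

# Build the markdown generator and run config once, then reuse them for
# every crawl instead of passing loose kwargs to each arun() call.
md_generator = DefaultMarkdownGenerator(
    options={"skip_internal_links": True, "escape_html": True}
)
run_config = CrawlerRunConfig(
    delay_before_return_html=2.0,
    markdown_generator=md_generator,
    wait_until='commit',
    magic=True,
    scan_full_page=True,
    cache_mode=CacheMode.BYPASS,
)

async def fetch_one(url: str) -> None:
    async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
        result = await crawler.arun(url=url, config=run_config)
        # Bail out early on failed crawls, as the patched loop now does.
        if not result or not result.success:
            print(f'{url} failed to crawl')
            return
        print(result.markdown_v2.raw_markdown)

if __name__ == '__main__':
    asyncio.run(fetch_one('https://example.com'))

Building the config once at module scope, as the diff does, keeps every arun() call consistent across sites and makes the crawl settings auditable in one place.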