mirror of
https://github.com/TeamWiseFlow/wiseflow.git
synced 2025-01-23 02:20:20 +08:00
Adapt to the new version of crawl4ai
This commit is contained in:
parent
7c880e384a
commit
d99cb4fef9
@ -1,6 +1,7 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
|
||||||
|
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||||
import os
|
import os
|
||||||
import hashlib
|
import hashlib
|
||||||
import json
|
import json
|
||||||
@ -8,14 +9,30 @@ import json
|
|||||||
|
|
||||||
# Directory where each crawl result is persisted as a JSON record.
save_dir = 'webpage_samples'

# Markdown generation: skip links that point back into the same site and
# escape raw HTML so the emitted markdown is plain text.
md_generator = DefaultMarkdownGenerator(
    options={"skip_internal_links": True, "escape_html": True}
)

# Shared run configuration for every crawl:
#   - pause 2s before capturing HTML so late-rendering content settles,
#   - 'commit' means return as soon as navigation is committed,
#   - magic/scan_full_page trigger crawl4ai's anti-bot and full-scroll modes,
#   - BYPASS forces a fresh fetch on every run (no cached pages).
crawler_config = CrawlerRunConfig(
    delay_before_return_html=2.0,
    markdown_generator=md_generator,
    wait_until='commit',
    magic=True,
    scan_full_page=True,
    cache_mode=CacheMode.BYPASS,
)
async def main(sites: list):
    """Crawl every URL in *sites*, print the generated markdown, and save
    the full crawl result as a JSON record under ``save_dir``.

    Each record filename is the last 6 hex digits of the SHA-256 of the
    URL, giving short, stable, URL-derived names.
    """
    # Create the output directory up front; without this, open() below
    # raises FileNotFoundError on a fresh checkout.
    os.makedirs(save_dir, exist_ok=True)
    async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
        for site in sites:
            result = await crawler.arun(url=site, config=crawler_config)
            if not result or not result.success:
                print(f'{site} failed to crawl, skip')
                continue

            print('raw_markdown:')
            print(result.markdown_v2.raw_markdown)
            print('-' * 24)

            # Persist the complete result object (not just the markdown)
            # so samples can be re-inspected later.
            record_file = os.path.join(save_dir, f"{hashlib.sha256(site.encode()).hexdigest()[-6:]}.json")
            with open(record_file, 'w', encoding='utf-8') as f:
                json.dump(result.model_dump(), f, indent=4, ensure_ascii=False)
|
Loading…
Reference in New Issue
Block a user