update deepscraper for crawl4ai bug

bigbrother666sh 2025-01-03 13:17:24 +08:00
parent b4da3cc853
commit c6d05d0210
2 changed files with 56 additions and 63 deletions

View File

@@ -76,8 +76,11 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
         if base_tag and base_tag.get('href'):
             base_url = base_tag['href']
         else:
-            # if no base tag, use the current url as base url
-            base_url = f"{parsed_url.scheme}://{domain}"
+            # if there is no base tag, use the current page's URL path as the base url
+            base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
+            if not base_url.endswith('/'):
+                # if the path does not end with /, drop the last path segment
+                base_url = base_url.rsplit('/', 1)[0] + '/'
 
         html = await context.page.inner_html('body')
         if domain in custom_scrapers:
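The fallback above now derives the base URL from the page's own path instead of just the domain. A minimal standalone sketch of that logic (the helper name and example URL are illustrative, not part of the commit):

```python
from urllib.parse import urlparse

def fallback_base_url(page_url: str) -> str:
    # mirror of the fallback above: use the current page's URL path as the base url
    parsed = urlparse(page_url)
    base_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
    if not base_url.endswith('/'):
        # drop the last path segment so relative links resolve against the directory
        base_url = base_url.rsplit('/', 1)[0] + '/'
    return base_url

print(fallback_base_url("https://example.com/news/2025/item.html"))
# -> https://example.com/news/2025/
```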

View File

@@ -8,7 +8,7 @@
 import os, re
 import json
 import time
-from urllib.parse import urlparse, urljoin, quote
+from urllib.parse import urlparse, urljoin
 
 common_file_exts = [
@@ -27,26 +27,46 @@ common_tlds = [
 common_chars = ',.!;:,;:、一二三四五六七八九十#*@% \t\n\r|*-_…>#'
 
-def normalize_url(url: str) -> str:
-    if url.lower().startswith("www."):
-        url = f"https://{url}"
-    parsed_url = urlparse(url)
-    if not parsed_url.netloc:
+def normalize_url(url: str, base_url: str) -> str:
+    url = url.strip().lower()
+    if url.startswith(("javascript:", "mailto:", "javacript:", "tel:", "sms:", "data:", "file:", "ftp:", "about:", "chrome:", "blob:", "ws:", "wss:", "view-source:")):
         return ''
-    # handle redundant slashes in the path
-    path = quote(re.sub(r'//+', '/', parsed_url.path))
-    # build the query string
-    query = f"?{quote(parsed_url.query)}" if parsed_url.query else ""
-    # build the full URL
-    if not parsed_url.scheme:
-        # just try https
-        return f"https://{parsed_url.netloc}{path}{parsed_url.params}{query}"
+    if "<" in url and url.endswith(">"):
+        if '<javascript:void' in url:
+            print(url)
+        # temporary workaround for a crawl4ai special case
+        part1, part2 = url.split("<")
+        if part2.startswith("http"):
+            url = part2[:-1]
+        else:
+            parsed_base = urlparse(part1)
+            url = f"{parsed_base.scheme}://{parsed_base.netloc}/{part2[:-1]}"
+
+    if url.startswith("www."):
+        _url = f"https://{url}"
+    elif url.startswith("//"):
+        _url = f"https:{url}"
+    elif url.startswith(('http:/', 'https:/')):
+        _url = url
+    elif url.startswith('/'):
+        if base_url.endswith('/'):
+            _url = base_url[:-1] + url
+        else:
+            _url = base_url + url
     else:
-        return f"{parsed_url.scheme}://{parsed_url.netloc}{path}{parsed_url.params}{query}"
+        _url = urljoin(base_url, url)
+    # strip redundant slashes from the path part of the url
+    parsed = urlparse(_url)
+    path = parsed.path
+    # replace consecutive / with a single /
+    normalized_path = re.sub(r'/+', '/', path)
+    # reassemble the url
+    _url = f"{parsed.scheme}://{parsed.netloc}{normalized_path}"
+    if parsed.query:
+        _url = f"{_url}?{parsed.query}"
+    if parsed.fragment:
+        _url = f"{_url}#{parsed.fragment}"
+    return _url
 
 def deep_scraper(raw_markdown: str, base_url: str, used_img: dict[str, str]) -> tuple[dict, tuple[str, dict]]:
     link_dict = {}
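For reference, how the reworked normalize_url is expected to behave on a few representative inputs. This assumes the function from the hunk above is in scope; the example URLs are made up and the outputs were traced by hand from the code:

```python
# scheme-less and protocol-relative urls get https, and duplicate slashes collapse
assert normalize_url("www.Example.com//a/b?q=1", "https://site.com/") == "https://www.example.com/a/b?q=1"
assert normalize_url("//cdn.example.com/img.png", "https://site.com/") == "https://cdn.example.com/img.png"

# non-navigational schemes are dropped entirely
assert normalize_url("javascript:void(0)", "https://site.com/") == ""

# the crawl4ai artifact "text <real-url>" keeps only the real url
assert normalize_url("read more <https://site.com/b>", "https://site.com/") == "https://site.com/b"

# plain relative paths resolve against base_url
assert normalize_url("news/item.html", "https://site.com/section/") == "https://site.com/section/news/item.html"
```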
@@ -110,17 +130,9 @@ def deep_scraper(raw_markdown: str, base_url: str, used_img: dict[str, str]) ->
         _url = re.sub(quote_pattern, '', link_url).strip()
         if not _url or _url.startswith('#'):
             continue
-        if _url.startswith('//'):
-            _url = f"https:{_url}"
-        else:
-            if _url.startswith('/'):
-                _url = _url[1:]
-            _url = urljoin(base_url, _url)
-        _url = normalize_url(_url)
-        if not _url:
+        url = normalize_url(_url, base_url)
+        if not url:
             continue
-        url = _url.lower()
         # check whether the link is a common file type or a bare top-level domain
         has_common_ext = any(url.endswith(ext) for ext in common_file_exts)
         has_common_tld = any(url.endswith(tld) or url.endswith(tld + '/') for tld in common_tlds)
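The two flags computed above classify a normalized link before further handling. A quick illustration with stand-in values for the real common_file_exts / common_tlds lists defined earlier in this file:

```python
common_file_exts = ['pdf', 'zip', 'exe']   # illustrative subset of the real list
common_tlds = ['.com', '.org', '.net']     # illustrative subset of the real list

for url in ("https://example.com/",
            "https://example.com/report.pdf",
            "https://example.com/news/123"):
    has_common_ext = any(url.endswith(ext) for ext in common_file_exts)
    has_common_tld = any(url.endswith(tld) or url.endswith(tld + '/') for tld in common_tlds)
    print(url, has_common_ext, has_common_tld)

# https://example.com/ False True             (bare top-level domain)
# https://example.com/report.pdf True False   (direct file link)
# https://example.com/news/123 False False    (ordinary article link)
```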
@@ -164,21 +176,13 @@ def deep_scraper(raw_markdown: str, base_url: str, used_img: dict[str, str]) ->
         if not img_src or img_src.startswith('#'):
             continue
-        img_src = img_src.lower()
+        img_src = normalize_url(img_src, base_url)
+        if not img_src:
+            continue
         if any(img_src.endswith(tld) or img_src.endswith(tld + '/') for tld in common_tlds):
             continue
         if any(img_src.endswith(ext) for ext in common_file_exts if ext not in ['jpg', 'jpeg', 'png']):
             continue
-        if img_src.startswith('//'):
-            img_src = f"https:{img_src}"
-        else:
-            if img_src.startswith('/'):
-                img_src = img_src[1:]
-            img_src = urljoin(base_url, img_src)
-        img_src = normalize_url(img_src)
-        if not img_src:
-            continue
         link_dict[url] = f"{img_alt}§to_be_recognized_by_visual_llm_{img_src}§"
         return ''
@@ -202,6 +206,10 @@ def deep_scraper(raw_markdown: str, base_url: str, used_img: dict[str, str]) ->
         if not src or src.startswith('#'):
             html_text = html_text.replace(match, alt)
             continue
+        src = normalize_url(src, base_url)
+        if not src:
+            html_text = html_text.replace(match, alt)
+            continue
         if any(src.endswith(tld) or src.endswith(tld + '/') for tld in common_tlds):
             html_text = html_text.replace(match, alt)
@@ -209,17 +217,6 @@ def deep_scraper(raw_markdown: str, base_url: str, used_img: dict[str, str]) ->
         if any(src.endswith(ext) for ext in common_file_exts if ext not in ['jpg', 'jpeg', 'png']):
             html_text = html_text.replace(match, alt)
             continue
-        if src.startswith('//'):
-            src = f"https:{src}"
-        else:
-            if src.startswith('/'):
-                src = src[1:]
-            src = urljoin(base_url, src)
-        src = normalize_url(src)
-        if not src:
-            html_text = html_text.replace(match, alt)
-            continue
         html_text = html_text.replace(match, f" {alt}§to_be_recognized_by_visual_llm_{src[1:]}§") # to avoid conflict with the url pattern
 
     # next, process all the []() text
@@ -239,16 +236,9 @@ def deep_scraper(raw_markdown: str, base_url: str, used_img: dict[str, str]) ->
         _url = re.sub(quote_pattern, '', link_url).strip()
         if not _url or _url.startswith('#'):
             continue
-        if _url.startswith('//'):
-            _url = f"https:{_url}"
-        else:
-            if _url.startswith('/'):
-                _url = _url[1:]
-            _url = urljoin(base_url, _url)
-        _url = normalize_url(_url)
-        if not _url:
+        url = normalize_url(_url, base_url)
+        if not url:
             continue
-        url = _url.lower()
 
         key = f"Ref_{len(text_link_map)+1}"
         text_link_map[key] = url
@@ -258,7 +248,7 @@ def deep_scraper(raw_markdown: str, base_url: str, used_img: dict[str, str]) ->
     url_pattern = r'((?:https?://|www\.)[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[-A-Za-z0-9+&@#/%=~_|])'
     matches = re.findall(url_pattern, html_text)
     for url in matches:
-        url = normalize_url(url)
+        url = normalize_url(url, base_url)
         if not url:
             continue
         key = f"Ref_{len(text_link_map)+1}"
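The final pass scans the remaining text for bare URLs, and each match now also goes through normalize_url(url, base_url). A self-contained sketch of that pass using the same url_pattern regex (the sample text is made up, and the normalization step is only noted in a comment):

```python
import re

url_pattern = r'((?:https?://|www\.)[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[-A-Za-z0-9+&@#/%=~_|])'
html_text = "see https://example.com/a and www.example.org/b for details"

text_link_map = {}
for url in re.findall(url_pattern, html_text):
    # in deep_scraper each match is normalized via normalize_url(url, base_url) first
    key = f"Ref_{len(text_link_map)+1}"
    text_link_map[key] = url

print(text_link_map)
# {'Ref_1': 'https://example.com/a', 'Ref_2': 'www.example.org/b'}
```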