From c6d05d02109604d2c34aa882fcd5f15d91bd5b46 Mon Sep 17 00:00:00 2001
From: bigbrother666sh
Date: Fri, 3 Jan 2025 13:17:24 +0800
Subject: [PATCH] update deepscraper for crawl4ai bug

---
 core/general_process.py    |   7 ++-
 core/utils/deep_scraper.py | 112 +++++++++++++++++--------------------
 2 files changed, 56 insertions(+), 63 deletions(-)

diff --git a/core/general_process.py b/core/general_process.py
index 8581268..2446098 100644
--- a/core/general_process.py
+++ b/core/general_process.py
@@ -76,8 +76,11 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
     if base_tag and base_tag.get('href'):
         base_url = base_tag['href']
     else:
-        # if no base tag, use the current url as base url
-        base_url = f"{parsed_url.scheme}://{domain}"
+        # if there is no base tag, use the current page's URL path as the base url
+        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
+        if not base_url.endswith('/'):
+            # if the path does not end with /, drop the last path segment
+            base_url = base_url.rsplit('/', 1)[0] + '/'
 
     html = await context.page.inner_html('body')
     if domain in custom_scrapers:
diff --git a/core/utils/deep_scraper.py b/core/utils/deep_scraper.py
index d224770..2756b58 100644
--- a/core/utils/deep_scraper.py
+++ b/core/utils/deep_scraper.py
@@ -8,7 +8,7 @@
 import os, re
 import json
 import time
-from urllib.parse import urlparse, urljoin, quote
+from urllib.parse import urlparse, urljoin
 
 
 common_file_exts = [
@@ -27,26 +27,46 @@ common_tlds = [
 common_chars = ',.!;:,;:、一二三四五六七八九十#*@% \t\n\r|*-_…>#'
 
-def normalize_url(url: str) -> str:
-    if url.lower().startswith("www."):
-        url = f"https://{url}"
-
-    parsed_url = urlparse(url)
-    if not parsed_url.netloc:
+def normalize_url(url: str, base_url: str) -> str:
+    url = url.strip().lower()
+    if url.startswith(("javascript:", "mailto:", "javacript:", "tel:", "sms:", "data:", "file:", "ftp:", "about:", "chrome:", "blob:", "ws:", "wss:", "view-source:")):
         return ''
-    # collapse redundant slashes in the path
-    path = quote(re.sub(r'//+', '/', parsed_url.path))
-
-    # build the query string
-    query = f"?{quote(parsed_url.query)}" if parsed_url.query else ""
-
-    # build the full URL
-    if not parsed_url.scheme:
-        # just try https
-        return f"https://{parsed_url.netloc}{path}{parsed_url.params}{query}"
+    if "<" in url and url.endswith(">"):
+        if '
 def deep_scraper(raw_markdown: str, base_url: str, used_img: dict[str, str]) -> tuple[dict, tuple[str, dict]]:
     link_dict = {}
@@ -110,17 +130,9 @@
         _url = re.sub(quote_pattern, '', link_url).strip()
         if not _url or _url.startswith('#'):
             continue
-        if _url.startswith('//'):
-            _url = f"https:{_url}"
-        else:
-            if _url.startswith('/'):
-                _url = _url[1:]
-            _url = urljoin(base_url, _url)
-        _url = normalize_url(_url)
-        if not _url:
+        url = normalize_url(_url, base_url)
+        if not url:
             continue
-
-        url = _url.lower()
         # check whether the link is a common file type or top-level domain
         has_common_ext = any(url.endswith(ext) for ext in common_file_exts)
         has_common_tld = any(url.endswith(tld) or url.endswith(tld + '/') for tld in common_tlds)
@@ -164,21 +176,13 @@ def deep_scraper(raw_markdown: str, base_url: str, used_img: dict[str, str]) ->
             if not img_src or img_src.startswith('#'):
                 continue
-            img_src = img_src.lower()
+            img_src = normalize_url(img_src, base_url)
+            if not img_src:
+                continue
             if any(img_src.endswith(tld) or img_src.endswith(tld + '/') for tld in common_tlds):
                 continue
             if any(img_src.endswith(ext) for ext in common_file_exts if ext not in ['jpg', 'jpeg', 'png']):
                 continue
-
-            if img_src.startswith('//'):
-                img_src = f"https:{img_src}"
-            else:
-                if img_src.startswith('/'):
-                    img_src = img_src[1:]
-                img_src = urljoin(base_url, img_src)
-            img_src = normalize_url(img_src)
-            if not img_src:
-                continue
             link_dict[url] = f"{img_alt}§to_be_recognized_by_visual_llm_{img_src}§"
             return ''
@@ -202,6 +206,10 @@ def deep_scraper(raw_markdown: str, base_url: str, used_img: dict[str, str]) ->
         if not src or src.startswith('#'):
             html_text = html_text.replace(match, alt)
             continue
+        src = normalize_url(src, base_url)
+        if not src:
+            html_text = html_text.replace(match, alt)
+            continue
 
         if any(src.endswith(tld) or src.endswith(tld + '/') for tld in common_tlds):
             html_text = html_text.replace(match, alt)
@@ -209,17 +217,6 @@ def deep_scraper(raw_markdown: str, base_url: str, used_img: dict[str, str]) ->
         if any(src.endswith(ext) for ext in common_file_exts if ext not in ['jpg', 'jpeg', 'png']):
             html_text = html_text.replace(match, alt)
             continue
-
-        if src.startswith('//'):
-            src = f"https:{src}"
-        else:
-            if src.startswith('/'):
-                src = src[1:]
-            src = urljoin(base_url, src)
-        src = normalize_url(src)
-        if not src:
-            html_text = html_text.replace(match, alt)
-            continue
         html_text = html_text.replace(match, f" {alt}§to_be_recognized_by_visual_llm_{src[1:]}§")  # to avoid conflict with the url pattern
 
     # next, process all the []() link texts
@@ -239,16 +236,9 @@ def deep_scraper(raw_markdown: str, base_url: str, used_img: dict[str, str]) ->
         _url = re.sub(quote_pattern, '', link_url).strip()
         if not _url or _url.startswith('#'):
             continue
-        if _url.startswith('//'):
-            _url = f"https:{_url}"
-        else:
-            if _url.startswith('/'):
-                _url = _url[1:]
-            _url = urljoin(base_url, _url)
-        _url = normalize_url(_url)
-        if not _url:
+        url = normalize_url(_url, base_url)
+        if not url:
             continue
-        url = _url.lower()
         key = f"Ref_{len(text_link_map)+1}"
         text_link_map[key] = url
@@ -258,7 +248,7 @@ def deep_scraper(raw_markdown: str, base_url: str, used_img: dict[str, str]) ->
     url_pattern = r'((?:https?://|www\.)[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[-A-Za-z0-9+&@#/%=~_|])'
     matches = re.findall(url_pattern, html_text)
     for url in matches:
-        url = normalize_url(url)
+        url = normalize_url(url, base_url)
         if not url:
             continue
         key = f"Ref_{len(text_link_map)+1}"
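
The heart of the general_process.py change is the base-URL fallback: rather than collapsing every page to scheme://domain, it now keeps the page's path and trims it back to the last '/', so relative links resolve against the page's directory. A minimal standalone sketch of just that fallback, with derive_base_url as a hypothetical name for what the patch does inline in request_handler:

    from urllib.parse import urlparse

    def derive_base_url(page_url: str) -> str:
        # keep scheme://netloc/path, then drop the last path segment
        # so the result always ends with '/'
        parsed = urlparse(page_url)
        base_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
        if not base_url.endswith('/'):
            base_url = base_url.rsplit('/', 1)[0] + '/'
        return base_url

    assert derive_base_url("https://example.com/blog/post.html") == "https://example.com/blog/"
    assert derive_base_url("https://example.com/docs/") == "https://example.com/docs/"

One caveat: the fallback assumes a non-empty path. For a bare "https://example.com" the rsplit trim degenerates to "https://", so a caller would want to append the trailing slash before trimming.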
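On the deep_scraper.py side, normalize_url now takes base_url as a second argument and owns the scheme filtering, lower-casing, and relative-URL resolution, which is why each call site's hand-rolled '//' / '/' / urljoin preamble could be deleted in favor of a single normalize_url(_url, base_url) call. The new body is partly truncated in this patch, but its visible first step can be sketched as below; filter_scheme is a hypothetical name for just that fragment, and the scheme tuple is copied verbatim from the '+' lines (including the "javacript:" entry as it appears in the patch):

    def filter_scheme(url: str) -> str:
        # lower-case first, then drop schemes that can never be
        # fetched as a web page
        url = url.strip().lower()
        if url.startswith((
            "javascript:", "mailto:", "javacript:", "tel:", "sms:",
            "data:", "file:", "ftp:", "about:", "chrome:", "blob:",
            "ws:", "wss:", "view-source:",
        )):
            return ''
        return url

    assert filter_scheme("JavaScript:void(0)") == ''
    assert filter_scheme("MAILTO:someone@example.com") == ''
    assert filter_scheme("https://Example.COM/Page") == "https://example.com/page"

Because the lower-casing now happens inside normalize_url, the trailing url = _url.lower() and img_src = img_src.lower() lines at every call site also became redundant, which is exactly what the hunks above remove.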