From c20c4a0a278c4e5b5caf2da2accbe1e7e94e9e9c Mon Sep 17 00:00:00 2001
From: bigbrother666
Date: Fri, 21 Jun 2024 13:55:25 +0800
Subject: [PATCH] little fix

---
 core/insights/__init__.py        | 4 ++--
 core/scrapers/general_crawler.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/core/insights/__init__.py b/core/insights/__init__.py
index 36a8405..d7dcc80 100644
--- a/core/insights/__init__.py
+++ b/core/insights/__init__.py
@@ -20,6 +20,7 @@ existing_urls = [url['url'] for url in pb.read(collection_name='articles', field
 
 
 async def pipeline(url: str, cache: dict = {}):
+    url = url.rstrip('/')
     working_list = [url]
     while working_list:
         url = working_list[0]
@@ -53,8 +54,6 @@ async def pipeline(url: str, cache: dict = {}):
 
         # get info process
         logger.debug(f"article: {result['title']}")
-        insights = get_info(f"title: {result['title']}\n\ncontent: {result['content']}")
-
         article_id = pb.add(collection_name='articles', body=result)
         if not article_id:
             await asyncio.sleep(1)
@@ -66,6 +65,7 @@ async def pipeline(url: str, cache: dict = {}):
                 json.dump(result, f, ensure_ascii=False, indent=4)
             continue
 
+        insights = get_info(f"title: {result['title']}\n\ncontent: {result['content']}")
         if not insights:
             continue
 
diff --git a/core/scrapers/general_crawler.py b/core/scrapers/general_crawler.py
index 7487bed..0efcf60 100644
--- a/core/scrapers/general_crawler.py
+++ b/core/scrapers/general_crawler.py
@@ -119,7 +119,7 @@ async def general_crawler(url: str, logger) -> tuple[int, Union[list, dict]]:
         base_url = f"{parsed_url.scheme}://{domain}"
         urls = set()
         for link in soup.find_all("a", href=True):
-            absolute_url = urljoin(base_url, link["href"])
+            absolute_url = urljoin(base_url, link["href"]).rstrip('/')
            if urlparse(absolute_url).netloc == domain and absolute_url != url:
                 urls.add(absolute_url)
 
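
The patch normalizes trailing slashes in two places so that URL deduplication stays consistent: without it, "https://example.com/news/item-1/" and "https://example.com/news/item-1" would be treated as different pages, and the moved get_info call now runs only after pb.add succeeds, so no LLM call is spent on an article that fails to persist. Below is a minimal sketch of the dedup effect, not part of the patch; the existing_urls set and the sample links are hypothetical stand-ins for pb.read(...) results and scraped hrefs.

    from urllib.parse import urljoin

    # Hypothetical stand-ins for pb.read(...) results and page links.
    existing_urls = {"https://example.com/news/item-1"}
    base_url = "https://example.com"
    hrefs = ["/news/item-1/", "/news/item-2"]

    for href in hrefs:
        # Same normalization the patch applies in general_crawler.
        absolute_url = urljoin(base_url, href).rstrip('/')
        if absolute_url in existing_urls:
            print(f"skip (already seen): {absolute_url}")
        else:
            print(f"new: {absolute_url}")

    # Without rstrip('/'), "https://example.com/news/item-1/" would be
    # classified as new and crawled a second time.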