From c20c4a0a278c4e5b5caf2da2accbe1e7e94e9e9c Mon Sep 17 00:00:00 2001
From: bigbrother666
Date: Fri, 21 Jun 2024 13:55:25 +0800
Subject: [PATCH] little fix

---
 core/insights/__init__.py        | 4 ++--
 core/scrapers/general_crawler.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/core/insights/__init__.py b/core/insights/__init__.py
index 36a8405..d7dcc80 100644
--- a/core/insights/__init__.py
+++ b/core/insights/__init__.py
@@ -20,6 +20,7 @@ existing_urls = [url['url'] for url in pb.read(collection_name='articles', field
 
 
 async def pipeline(url: str, cache: dict = {}):
+    url = url.rstrip('/')
     working_list = [url]
     while working_list:
         url = working_list[0]
@@ -53,8 +54,6 @@ async def pipeline(url: str, cache: dict = {}):
 
         # get info process
         logger.debug(f"article: {result['title']}")
-        insights = get_info(f"title: {result['title']}\n\ncontent: {result['content']}")
-
         article_id = pb.add(collection_name='articles', body=result)
         if not article_id:
             await asyncio.sleep(1)
@@ -66,6 +65,7 @@ async def pipeline(url: str, cache: dict = {}):
                 json.dump(result, f, ensure_ascii=False, indent=4)
             continue
 
+        insights = get_info(f"title: {result['title']}\n\ncontent: {result['content']}")
         if not insights:
             continue
 
diff --git a/core/scrapers/general_crawler.py b/core/scrapers/general_crawler.py
index 7487bed..0efcf60 100644
--- a/core/scrapers/general_crawler.py
+++ b/core/scrapers/general_crawler.py
@@ -119,7 +119,7 @@ async def general_crawler(url: str, logger) -> tuple[int, Union[list, dict]]:
         base_url = f"{parsed_url.scheme}://{domain}"
         urls = set()
         for link in soup.find_all("a", href=True):
-            absolute_url = urljoin(base_url, link["href"])
+            absolute_url = urljoin(base_url, link["href"]).rstrip('/')
            if urlparse(absolute_url).netloc == domain and absolute_url != url:
                 urls.add(absolute_url)
 
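
The patch normalizes trailing slashes in two places so that URL deduplication stays consistent: without it, "https://example.com/news/item-1/" and "https://example.com/news/item-1" would be treated as different pages, and the moved get_info call now runs only after pb.add succeeds, so no LLM call is spent on an article that fails to persist. Below is a minimal sketch of the dedup effect, not part of the patch; the existing_urls set and the sample links are hypothetical stand-ins for pb.read(...) results and scraped hrefs.

    from urllib.parse import urljoin

    # Hypothetical stand-ins for pb.read(...) results and page links.
    existing_urls = {"https://example.com/news/item-1"}
    base_url = "https://example.com"
    hrefs = ["/news/item-1/", "/news/item-2"]

    for href in hrefs:
        # Same normalization the patch applies in general_crawler.
        absolute_url = urljoin(base_url, href).rstrip('/')
        if absolute_url in existing_urls:
            print(f"skip (already seen): {absolute_url}")
        else:
            print(f"new: {absolute_url}")

    # Without rstrip('/'), "https://example.com/news/item-1/" would be
    # classified as new and crawled a second time.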