Little fix

This commit is contained in:
bigbrother666 2024-06-21 13:55:25 +08:00
parent 10cda47778
commit c20c4a0a27
2 changed files with 3 additions and 3 deletions

View File

@ -20,6 +20,7 @@ existing_urls = [url['url'] for url in pb.read(collection_name='articles', field
async def pipeline(url: str, cache: dict = {}):
url = url.rstrip('/')
working_list = [url]
while working_list:
url = working_list[0]
@ -53,8 +54,6 @@ async def pipeline(url: str, cache: dict = {}):
# get info process
logger.debug(f"article: {result['title']}")
insights = get_info(f"title: {result['title']}\n\ncontent: {result['content']}")
article_id = pb.add(collection_name='articles', body=result)
if not article_id:
await asyncio.sleep(1)
@ -66,6 +65,7 @@ async def pipeline(url: str, cache: dict = {}):
json.dump(result, f, ensure_ascii=False, indent=4)
continue
insights = get_info(f"title: {result['title']}\n\ncontent: {result['content']}")
if not insights:
continue

View File

@ -119,7 +119,7 @@ async def general_crawler(url: str, logger) -> tuple[int, Union[list, dict]]:
base_url = f"{parsed_url.scheme}://{domain}"
urls = set()
for link in soup.find_all("a", href=True):
absolute_url = urljoin(base_url, link["href"])
absolute_url = urljoin(base_url, link["href"]).rstrip('/')
if urlparse(absolute_url).netloc == domain and absolute_url != url:
urls.add(absolute_url)