Mirror of https://github.com/TeamWiseFlow/wiseflow.git (synced 2025-01-23 02:20:20 +08:00)

little fix

Parent: 10cda47778
Commit: c20c4a0a27
@@ -20,6 +20,7 @@ existing_urls = [url['url'] for url in pb.read(collection_name='articles', field
 
 
 async def pipeline(url: str, cache: dict = {}):
+    url = url.rstrip('/')
     working_list = [url]
     while working_list:
         url = working_list[0]
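
The first hunk normalizes the entry URL with rstrip('/') before it is queued, so a page reached as https://example.com/page/ and the same page already stored as https://example.com/page are treated as one entry rather than two. A minimal sketch of the effect, not taken from the repository (the helper name normalize_url and the sample URLs are invented for illustration; the commit itself just calls url.rstrip('/') inline at the top of pipeline):

# Sketch only: trailing-slash normalization prevents duplicate entries.
def normalize_url(url: str) -> str:
    return url.rstrip('/')

existing_urls = {'https://example.com/page'}      # e.g. URLs already stored in PocketBase

candidate = 'https://example.com/page/'
assert candidate not in existing_urls             # raw string slips past the duplicate check
assert normalize_url(candidate) in existing_urls  # normalized string is caught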
@@ -53,8 +54,6 @@ async def pipeline(url: str, cache: dict = {}):
 
         # get info process
         logger.debug(f"article: {result['title']}")
-        insights = get_info(f"title: {result['title']}\n\ncontent: {result['content']}")
-
         article_id = pb.add(collection_name='articles', body=result)
         if not article_id:
             await asyncio.sleep(1)
@@ -66,6 +65,7 @@ async def pipeline(url: str, cache: dict = {}):
 
                 json.dump(result, f, ensure_ascii=False, indent=4)
             continue
 
+        insights = get_info(f"title: {result['title']}\n\ncontent: {result['content']}")
         if not insights:
             continue
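
The two middle hunks move the get_info call: it used to run right after the debug log, before pb.add; it now runs only once the article has actually been stored. A minimal sketch of the reordered flow, not repository code (save_article and the get_info stub below are hypothetical stand-ins for pb.add and the real LLM-backed get_info):

import asyncio

def save_article(result: dict) -> str:
    # stand-in for pb.add(collection_name='articles', body=result)
    return "mock_id"

def get_info(text: str) -> list:
    # stand-in for the LLM-backed extraction; the real call is comparatively expensive
    return ["mock insight"]

async def process(result: dict) -> None:
    article_id = save_article(result)
    if not article_id:
        # storing failed: with the old ordering the LLM call had already been made
        # for an article that only ends up in the local cache file
        return
    # extraction now runs only for articles that were actually stored
    insights = get_info(f"title: {result['title']}\n\ncontent: {result['content']}")
    if not insights:
        return

asyncio.run(process({'title': 'demo', 'content': 'demo content'}))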
@@ -119,7 +119,7 @@ async def general_crawler(url: str, logger) -> tuple[int, Union[list, dict]]:
     base_url = f"{parsed_url.scheme}://{domain}"
     urls = set()
     for link in soup.find_all("a", href=True):
-        absolute_url = urljoin(base_url, link["href"])
+        absolute_url = urljoin(base_url, link["href"]).rstrip('/')
         if urlparse(absolute_url).netloc == domain and absolute_url != url:
             urls.add(absolute_url)
 
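
The last hunk applies the same normalization inside general_crawler's link collector, so the set of discovered URLs does not end up holding trailing-slash duplicates of the same page. A minimal, self-contained sketch of that loop, not repository code (the HTML snippet and example URLs are invented; bs4 and urllib are the libraries the surrounding code already uses):

from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup

# invented sample page: two links to the same article (with and without a
# trailing slash) plus one off-domain link
html = '<a href="/news/1">a</a> <a href="/news/1/">b</a> <a href="https://other.example/x">c</a>'
url = 'https://example.com/index'
parsed_url = urlparse(url)
domain = parsed_url.netloc
base_url = f"{parsed_url.scheme}://{domain}"

soup = BeautifulSoup(html, 'html.parser')
urls = set()
for link in soup.find_all("a", href=True):
    absolute_url = urljoin(base_url, link["href"]).rstrip('/')
    if urlparse(absolute_url).netloc == domain and absolute_url != url:
        urls.add(absolute_url)

print(urls)  # {'https://example.com/news/1'}: one entry instead of two, off-domain link skipped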