wiseflow/client/backend/get_search.py

from scrapers.simple_crawler import simple_crawler
from typing import Union
from pathlib import Path
import requests
import re
import json
from urllib.parse import quote, urlparse
from bs4 import BeautifulSoup


# For use inside China, Sogou search works well; it also supports Weixin and Baike (encyclopedia) search.
# For overseas use, consider duckduckgo or the google_search SDK (a minimal sketch follows below).
# Avoid self-hosting a search engine: open-source options such as https://github.com/StractOrg/stract/tree/main exist, but that means maintaining a second project.
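
# The comment above suggests duckduckgo as an overseas option; below is a minimal,
# hypothetical sketch of that idea, not part of this project. It assumes the
# third-party duckduckgo_search package (pip install duckduckgo-search), whose
# DDGS().text() returns result dicts with an 'href' key. It only collects
# candidate URLs; crawling the pages stays a separate step, as in search_insight.
def search_insight_ddg(keyword: str, exist_urls: list, max_results: int = 10) -> tuple[int, list]:
    from duckduckgo_search import DDGS  # lazy import: the module still loads without the package
    try:
        hits = DDGS().text(keyword, max_results=max_results)
    except Exception:
        return -7, []  # same error-flag convention as search_insight below
    urls = [hit["href"] for hit in hits if hit.get("href") and hit["href"] not in exist_urls]
    return (11, urls) if urls else (0, [])

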
def search_insight(keyword: str, exist_urls: list[Union[str, Path]], knowledge: bool = False, logger=None) -> tuple[int, list]:
"""
搜索网页
:param keyword: 要搜索的主题
:param exist_urls: 已经存在的url列表即这些网页已经存在搜索结果中如果出现则跳过
:param knowledge: 是否搜索知识
:param logger: 日志
:return: 返回文章信息列表list[dict]和flag负数为报错0为没有结果11为成功
"""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.44",
    }
    # If knowledge is True, we are searching for concepts/knowledge, so only Sogou Baike is queried.
    # The default is news search, which queries both Sogou web search and Sogou news.
    if knowledge:
        url = f"https://www.sogou.com/sogou?query={keyword}&insite=baike.sogou.com"
    else:
        url = quote(f"https://www.sogou.com/web?query={keyword}", safe='/:?=.')
    relist = []
    try:
        r = requests.get(url, headers=headers)
        html = r.text
        soup = BeautifulSoup(html, 'html.parser')
        item_list = soup.find_all(class_='struct201102')
        for items in item_list:
            item_prelist = items.find(class_="vr-title")
            # item_title = re.sub(r'(<[^>]+>|\s)', '', str(item_prelist))
            href_s = item_prelist.find(class_="", href=True)
            href = href_s["href"]
            if href[0] == "/":
                href_f = redirect_url("https://www.sogou.com" + href)
            else:
                href_f = href
            if href_f not in exist_urls:
                relist.append(href_f)
    except Exception as e:
        if logger:
            logger.error(f"search {url} error: {e}")
        else:
            print(f"search {url} error: {e}")
    if not knowledge:
        url = f"https://www.sogou.com/sogou?ie=utf8&p=40230447&interation=1728053249&interV=&pid=sogou-wsse-7050094b04fd9aa3&query={keyword}"
        try:
            r = requests.get(url, headers=headers)
            html = r.text
            soup = BeautifulSoup(html, 'html.parser')
            item_list = soup.find_all(class_="news200616")
            for items in item_list:
                item_prelist = items.find(class_="vr-title")
                # item_title = re.sub(r'(<[^>]+>|\s)', '', str(item_prelist))
                href_s = item_prelist.find(class_="", href=True)
                href = href_s["href"]
                if href[0] == "/":
                    href_f = redirect_url("https://www.sogou.com" + href)
                else:
                    href_f = href
                if href_f not in exist_urls:
                    relist.append(href_f)
        except Exception as e:
            if logger:
                logger.error(f"search {url} error: {e}")
            else:
                print(f"search {url} error: {e}")
    if not relist:
        return -7, []

    # Only simple_crawler is used here, because search needs to stay fast.
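    # Contract assumed from the calls below (simple_crawler is defined elsewhere):
    # it returns (flag, value), where flag == 11 signals success and value is a
    # dict carrying at least the 'abstract', 'content' and 'images' keys.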
    results = []
    for url in relist:
        if url in exist_urls:
            continue
        exist_urls.append(url)
        flag, value = simple_crawler(url)
        if flag != 11:
            continue
        from_site = urlparse(url).netloc
        if from_site.startswith('www.'):
            from_site = from_site.replace('www.', '')
        from_site = from_site.split('.')[0]
        if value['abstract']:
            value['abstract'] = f"({from_site} report) {value['abstract']}"
        value['content'] = f"({from_site} report) {value['content']}"
        value['images'] = json.dumps(value['images'])
        results.append(value)
    if results:
        return 11, results
    return 0, []


def redirect_url(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
    }
    r = requests.get(url, headers=headers, allow_redirects=False)  # do not follow redirects automatically
    if r.status_code == 302:
        real_url = r.headers.get('Location')
    else:
        # fall back to parsing the URL='...' meta-refresh target embedded in the page body
        real_url = re.findall("URL='(.*?)'", r.text)[0]
    return real_url
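

# Minimal usage sketch, assuming the scrapers package is importable and the
# network is reachable; the flag follows the contract documented in search_insight:
# negative on error, 0 for no results, 11 for success.
if __name__ == "__main__":
    flag, articles = search_insight("open source intelligence", exist_urls=[])
    print(flag, len(articles))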