# -*- coding: utf-8 -*-
import os
import re
import asyncio
from datetime import datetime, date
from urllib.parse import urlparse

import httpx
import chardet
from bs4 import BeautifulSoup
from bs4.element import Comment
from requests.compat import urljoin

from .simple_crawler import simple_crawler
from .mp_crawler import mp_crawler
from llms.openai_wrapper import openai_llm
# from llms.siliconflow_wrapper import sfa_llm
from utils.general_utils import extract_and_convert_dates


model = os.environ.get('HTML_PARSE_MODEL', 'gpt-3.5-turbo')

header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}


def tag_visible(element) -> bool:
    # filter out text nodes that are never rendered to the reader
    if element.parent.name in ["style", "script", "head", "title", "meta", "[document]"]:
        return False
    if isinstance(element, Comment):
        return False
    return True


def text_from_soup(soup: BeautifulSoup) -> str:
    # collect every visible text node and join them line by line
    res = []
    texts = soup.find_all(string=True)
    visible_texts = filter(tag_visible, texts)
    for v in visible_texts:
        res.append(v)
    text = "\n".join(res)
    return text.strip()
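
# For illustration: given
#     soup = BeautifulSoup('<body><p>hello</p><!-- note --></body>', 'html.parser')
# text_from_soup(soup) returns 'hello': the comment node and any text inside
# non-rendered tags are dropped by tag_visible.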


def parse_html_content(out: str) -> dict:
    dct = {'title': '', 'abstract': '', 'content': '', 'publish_time': ''}
    # the llm is asked to wrap its answer in triple quotes; take the first such
    # block (an IndexError here is caught by the caller as a parse failure)
    pattern = re.compile(r'\"\"\"(.*?)\"\"\"', re.DOTALL)
    result = pattern.findall(out)
    result = result[0].strip()
    dict_strs = result.split('||')
    if not dict_strs:
        dict_strs = result.split('|||')
        if not dict_strs:
            return dct
    if len(dict_strs) == 3:
        dct['title'] = dict_strs[0].strip()
        dct['content'] = dict_strs[1].strip()
    elif len(dict_strs) == 4:
        dct['title'] = dict_strs[0].strip()
        dct['content'] = dict_strs[2].strip()
        dct['abstract'] = dict_strs[1].strip()
    else:
        return dct
    date_str = extract_and_convert_dates(dict_strs[-1])
    if date_str:
        dct['publish_time'] = date_str
    else:
        # fall back to today's date when no publish date can be recovered
        dct['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")
    return dct
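
# A sketch of the llm output this function expects (hypothetical values):
#
#     out = '"""Some Title||A one-line summary||The full body text...||2024-04-07"""'
#     parse_html_content(out)
#     # -> {'title': 'Some Title', 'abstract': 'A one-line summary',
#     #     'content': 'The full body text...',
#     #     'publish_time': <whatever extract_and_convert_dates returns,
#     #      presumably YYYYMMDD to match the '%Y%m%d' fallback above>}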


sys_info = '''As an HTML parser, you'll receive a block of HTML code. Your task is to extract its title, summary, content, and publication date, with the date formatted as YYYY-MM-DD. Return the results in the following format (enclosed within triple quotes):
"""
Title||Summary||Content||Release Date YYYY-MM-DD
"""
'''


async def llm_crawler(url: str, logger) -> tuple[int, dict]:
    """Fetch the url, reduce the page to visible text, and parse it with the llm.

    Returns (flag, info): 11 on success, 0 when the page was fetched but could
    not be parsed into a valid article, -7 when the page could not be fetched.
    """
    async with httpx.AsyncClient() as client:
        for retry in range(2):
            try:
                response = await client.get(url, headers=header, timeout=30)
                response.raise_for_status()
                break
            except Exception as e:
                if retry < 1:
                    logger.info(f"request {url} got error {e}\nwaiting 1min")
                    await asyncio.sleep(60)
                else:
                    logger.warning(f"request {url} got error {e}")
                    return -7, {}

    rawdata = response.content
    encoding = chardet.detect(rawdata)['encoding']
    text = rawdata.decode(encoding, errors='replace')
    soup = BeautifulSoup(text, "html.parser")
    html_text = text_from_soup(soup)
    html_lines = [line.strip() for line in html_text.split('\n') if line.strip()]
    html_text = "\n".join(html_lines)
    if len(html_text) > 29999:
        logger.warning(f"{url} content too long for llm parsing")
        return 0, {}

    # common Chinese error-page prefixes: "服务器错误" (server error),
    # "您访问的页面" (the page you visited...), "出错了" (something went wrong)
    if not html_text or html_text.startswith('服务器错误') or html_text.startswith('您访问的页面') \
            or html_text.startswith('403') or html_text.startswith('出错了'):
        logger.warning(f"can not get {url} from the Internet")
        return -7, {}

    messages = [
        {"role": "system", "content": sys_info},
        {"role": "user", "content": html_text}
    ]
    llm_output = openai_llm(messages, model=model, logger=logger)
    try:
        info = parse_html_content(llm_output)
    except Exception:
        logger.debug(f"can not parse {llm_output}")
        return 0, {}

    if len(info['title']) < 4 or len(info['content']) < 24:
        logger.debug(f"{info} not valid")
        return 0, {}

    info["url"] = url
    # extract image links; the list stays empty if none can be extracted
    image_links = []
    images = soup.find_all("img")
    for img in images:
        try:
            image_links.append(img["src"])
        except KeyError:
            continue
    info["images"] = image_links

    # extract the author information; empty string if it cannot be extracted
    author_element = soup.find("meta", {"name": "author"})
    if author_element:
        info["author"] = author_element["content"]
    else:
        info["author"] = ""

    # prefix the content (and, below, a meta-description abstract) with the site name
    from_site = urlparse(url).netloc
    from_site = from_site.replace('www.', '')
    from_site = from_site.split('.')[0]
    info['content'] = f"[from {from_site}] {info['content']}"

    if not info['abstract']:
        meta_description = soup.find("meta", {"name": "description"})
        if meta_description:
            info['abstract'] = f"[from {from_site}] {meta_description['content'].strip()}"
        else:
            info['abstract'] = ''

    return 11, info
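
# Hypothetical direct use of llm_crawler (inside general_scraper it is only a
# fallback for pages the simpler crawlers cannot handle):
#
#     flag, info = await llm_crawler('https://example.com/article', logger)
#     if flag == 11:      # parsed successfully
#         print(info['title'], info['publish_time'])
#     elif flag == -7:    # the page could not be fetched at all
#         ...
#     else:               # flag == 0: fetched but not parseable/valid
#         ...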


async def general_scraper(site: str, expiration: date, existing: list[str], logger) -> list[dict]:
    """Crawl a site page: parse it as a single article if it carries no links,
    otherwise crawl every linked article newer than `expiration` that is not
    already in `existing`."""
    async with httpx.AsyncClient() as client:
        for retry in range(2):
            try:
                response = await client.get(site, headers=header, timeout=30)
                response.raise_for_status()
                break
            except Exception as e:
                if retry < 1:
                    logger.info(f"request {site} got error {e}\nwaiting 1min")
                    await asyncio.sleep(60)
                else:
                    logger.warning(f"request {site} got error {e}")
                    return []

    page_source = response.text
    soup = BeautifulSoup(page_source, "html.parser")
    # collect all links, resolved against the site's scheme://netloc
    parsed_url = urlparse(site)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
    urls = [urljoin(base_url, link["href"]) for link in soup.find_all("a", href=True)]

    if not urls:
        # no links found, so treat the page itself as an article
        logger.warning(f"can not find any link from {site}, maybe it's an article site...")
        if site in existing:
            logger.debug(f"{site} has been crawled before, skip it")
            return []

        if site.startswith('https://mp.weixin.qq.com') or site.startswith('http://mp.weixin.qq.com'):
            flag, result = await mp_crawler(site, logger)
        else:
            flag, result = await simple_crawler(site, logger)
        if flag == -7:
            # -7 means the html could not be fetched; other crawlers won't help
            return []

        if flag != 11:
            flag, result = await llm_crawler(site, logger)
            if flag != 11:
                return []

        publish_date = datetime.strptime(result['publish_time'], '%Y%m%d')
        if publish_date.date() < expiration:
            logger.debug(f"{site} is too old, skip it")
            return []
        else:
            return [result]

    # otherwise work through each article link, trying simple_crawler (or
    # mp_crawler for WeChat links) first and llm_crawler only as a fallback
    articles = []
    for url in urls:
        if url in existing:
            logger.debug(f"{url} has been crawled before, skip it")
            continue

        existing.append(url)
        if url.startswith('https://mp.weixin.qq.com') or url.startswith('http://mp.weixin.qq.com'):
            flag, result = await mp_crawler(url, logger)
        else:
            flag, result = await simple_crawler(url, logger)
        if flag == -7:
            # -7 means the html could not be fetched; other crawlers won't help
            continue

        if flag != 11:
            flag, result = await llm_crawler(url, logger)
            if flag != 11:
                continue

        publish_date = datetime.strptime(result['publish_time'], '%Y%m%d')
        if publish_date.date() < expiration:
            logger.debug(f"{url} is too old, skip it")
        else:
            articles.append(result)

    return articles
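

# A minimal usage sketch (assumptions: any standard logging.Logger works for
# the `logger` parameter, and the module is run inside its package, e.g. via
# `python -m`, so the relative imports resolve; the URL is only illustrative):
if __name__ == '__main__':
    import logging

    logging.basicConfig(level=logging.DEBUG)
    _logger = logging.getLogger('general_scraper')
    _articles = asyncio.run(
        general_scraper('https://example.com/news', date(2024, 1, 1), [], _logger)
    )
    for _article in _articles:
        print(_article['title'], _article['publish_time'])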