# -*- coding: utf-8 -*-
# When you use this general crawler, keep the following return flags in mind:
# flag -7  : something went wrong while fetching the HTML.
# flag 0   : something went wrong while parsing the content (no usable result).
# flag 1   : the result is a list; the input url is probably an article-list page and the list contains the urls of the articles.
# flag 11  : the result is a dict containing the title, content, url, date, and source of the article.
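#
# A minimal sketch of how a caller might branch on the returned flag (illustrative
# only; what to do in each branch is up to the caller):
#
#     flag, result = await general_crawler(url, logger)
#     if flag == -7:        # fetch failed
#         ...
#     elif flag == 0:       # nothing usable could be parsed
#         ...
#     elif flag == 1:       # article-list page: result is a list of article urls
#         for article_url in result:
#             ...           # e.g. feed each url back into general_crawler
#     elif flag == 11:      # success: result is the article dict
#         ...               # e.g. store the article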
from gne import GeneralNewsExtractor
import httpx
from bs4 import BeautifulSoup
from datetime import datetime
from urllib.parse import urlparse
from llms.openai_wrapper import openai_llm
# from llms.siliconflow_wrapper import sfa_llm
from bs4.element import Comment
from utils.general_utils import extract_and_convert_dates
import asyncio
import json_repair
import os
from typing import Union
from requests.compat import urljoin
from scrapers import scraper_map

model = os.environ.get('HTML_PARSE_MODEL', 'gpt-3.5-turbo')
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}
extractor = GeneralNewsExtractor()


def tag_visible(element: Comment) -> bool:
    # Drop text nodes that live inside non-rendered tags, and drop HTML comments.
    if element.parent.name in ["style", "script", "head", "title", "meta", "[document]"]:
        return False
    if isinstance(element, Comment):
        return False
    return True


def text_from_soup(soup: BeautifulSoup) -> str:
    # Collect all visible text nodes and join them line by line.
    res = []
    texts = soup.find_all(string=True)
    visible_texts = filter(tag_visible, texts)
    for v in visible_texts:
        res.append(v)
    text = "\n".join(res)
    return text.strip()


sys_info = '''Your task is to operate as an HTML content extractor, focusing on parsing a provided HTML segment. Your objective is to retrieve the following details directly from the raw text within the HTML, without summarizing or altering the content:

- The document's title
- The complete main content, as it appears in the HTML, comprising all textual elements considered part of the core article body
- The publication time in its original format found within the HTML

Ensure your response fits the following JSON structure, accurately reflecting the extracted data without modification:

```json
{
    "title": "The Document's Exact Title",
    "content": "All the unaltered primary text content from the article",
    "publish_time": "Original Publication Time as per HTML"
}
```

It is essential that your output adheres strictly to this format, with each field filled based on the untouched information extracted directly from the HTML source.'''
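
# Why json_repair is applied to the llm reply further below: models sometimes return
# slightly broken JSON (e.g. a missing closing brace). A small illustration, assuming
# such a truncated reply:
#
#     json_repair.repair_json('{"title": "t", "content": "c", "publish_time": "2024-06-19"',
#                             return_objects=True)
#     # -> {'title': 't', 'content': 'c', 'publish_time': '2024-06-19'}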


async def general_crawler(url: str, logger) -> tuple[int, Union[list, dict]]:
    """
    Return a flag and the article information (dict): a negative flag means an error,
    0 means no result, 1 means an article-list page (a list of urls is returned), 11 means success.

    Main workflow:
    (for WeChat official-account articles, whose urls start with mp.weixin.qq, mp_crawler is used instead)
    first fetch the page with httpx
    then judge whether it is an article-list page (return all article urls with flag 1) or an article detail page
    then try to extract the information with gne
    when that fails, fall back to an llm analysis of the html
    """
    # 0. if there is a dedicated scraper for this domain, use it (such as mp.weixin.qq.com)
    parsed_url = urlparse(url)
    domain = parsed_url.netloc
    if domain in scraper_map:
        return await scraper_map[domain](url, logger)

    # 1. fetch the page with httpx
    async with httpx.AsyncClient() as client:
        for retry in range(2):
            try:
                response = await client.get(url, headers=header, timeout=30)
                response.raise_for_status()
                break
            except Exception as e:
                if retry < 1:
                    logger.info(f"cannot reach\n{e}\nwaiting 1 min")
                    await asyncio.sleep(60)
                else:
                    logger.error(e)
                    return -7, {}

    # 2. judge whether it is an article-list page (return all article urls with flag 1) or an article detail page
    page_source = response.text
    if page_source:
        text = page_source
    else:
        try:
            text = response.content.decode('utf-8')
        except UnicodeDecodeError:
            try:
                text = response.content.decode('gbk')
            except Exception as e:
                logger.error(f"cannot decode html {e}")
                return -7, {}

    soup = BeautifulSoup(text, "html.parser")
    # Note: the scheme used here is very crude; it is recommended to write a dedicated parser for specific business scenarios
    # Parse all URLs
    if len(url) < 50:
        base_url = f"{parsed_url.scheme}://{domain}"
        urls = set()
        for link in soup.find_all("a", href=True):
            absolute_url = urljoin(base_url, link["href"]).rstrip('/')
            if urlparse(absolute_url).netloc == domain and absolute_url != url:
                urls.add(absolute_url)

        if len(urls) > 30:
            logger.info(f"{url} is more like an article list page, find {len(urls)} urls with the same netloc")
            return 1, list(urls)

    # 3. try to extract the information with gne
    try:
        result = extractor.extract(text)
        if 'meta' in result:
            del result['meta']

        # gne sometimes extracts the text of an error page; the Chinese literals below are
        # common error-page titles ("server error", "the page you visited ...", "something went wrong")
        if result['title'].startswith('服务器错误') or result['title'].startswith('您访问的页面') \
                or result['title'].startswith('403') \
                or result['content'].startswith('This website uses cookies') or result['title'].startswith('出错了'):
            logger.warning(f"cannot get {url} from the Internet")
            return -7, {}

        if len(result['title']) < 4 or len(result['content']) < 24:
            logger.info(f"gne extract not good: {result}")
            result = None
    except Exception as e:
        logger.info(f"gne extract error: {e}")
        result = None

    # 4. fall back to an llm analysis of the html
    if not result:
        html_text = text_from_soup(soup)
        html_lines = html_text.split('\n')
        html_lines = [line.strip() for line in html_lines if line.strip()]
        html_text = "\n".join(html_lines)
        if len(html_text) > 29999:
            logger.info(f"{url} content too long for llm parsing")
            return 0, {}

        if not html_text or html_text.startswith('服务器错误') or html_text.startswith('您访问的页面') \
                or html_text.startswith('403') or html_text.startswith('出错了'):
            logger.warning(f"cannot get {url} from the Internet")
            return -7, {}

        messages = [
            {"role": "system", "content": sys_info},
            {"role": "user", "content": html_text}
        ]
        llm_output = openai_llm(messages, model=model, logger=logger, temperature=0.01)
        result = json_repair.repair_json(llm_output, return_objects=True)
        logger.debug(f"decoded_object: {result}")

        if not isinstance(result, dict):
            logger.debug("failed to parse from llm output")
            return 0, {}

        if 'title' not in result or 'content' not in result:
            logger.debug("llm parsed result not good")
            return 0, {}

    # Extract the image links; the list will be empty if none can be extracted.
    image_links = []
    images = soup.find_all("img")
    for img in images:
        try:
            image_links.append(img["src"])
        except KeyError:
            continue
    result["images"] = image_links

    # Extract the author information; it will be empty if it cannot be extracted.
    author_element = soup.find("meta", {"name": "author"})
    if author_element:
        result["author"] = author_element["content"]
    else:
        result["author"] = ""

    # 5. post process
    date_str = extract_and_convert_dates(result['publish_time'])
    if date_str:
        result['publish_time'] = date_str
    else:
        result['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d")

    from_site = domain.replace('www.', '')
    from_site = from_site.split('.')[0]
    result['content'] = f"[from {from_site}] {result['content']}"

    try:
        meta_description = soup.find("meta", {"name": "description"})
        if meta_description:
            result['abstract'] = f"[from {from_site}] {meta_description['content'].strip()}"
        else:
            result['abstract'] = ''
    except Exception:
        result['abstract'] = ''

    result['url'] = url
    return 11, result
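

if __name__ == '__main__':
    # Minimal manual-test sketch: assumes a plain logging.Logger is acceptable for the
    # `logger` parameter and that the project-internal imports (llms, utils, scrapers) resolve.
    # Run this module directly with a url argument to crawl a single page.
    import logging
    import sys

    logging.basicConfig(level=logging.INFO)
    test_logger = logging.getLogger("general_crawler")
    test_url = sys.argv[1] if len(sys.argv) > 1 else "https://example.com"
    flag, data = asyncio.run(general_crawler(test_url, test_logger))
    print(f"flag: {flag}")
    print(data)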