2024-04-07 09:37:47 +08:00
|
|
|
from urllib.parse import urlparse
|
|
|
|
import os
|
|
|
|
import re
|
2024-12-05 12:11:28 +08:00
|
|
|
# import jieba
|
|
|
|
from loguru import logger
|
2024-04-07 09:37:47 +08:00
|
|
|
|
|
|
|
|
|
|
|
def isURL(string):
    """Return True when ``string`` parses as a URL with a scheme and a host.

    A leading ``www.`` is treated as shorthand for ``https://www.``.

    :param string: candidate text to test
    :return: {bool} True if ``urlparse`` yields a non-empty scheme and netloc;
             False otherwise, including for non-string or malformed input
    """
    if not isinstance(string, str):
        return False
    if string.startswith("www."):
        string = f"https://{string}"
    try:
        result = urlparse(string)
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. an unbalanced IPv6
        # bracket); a predicate should report those as "not a URL".
        return False
    return bool(result.scheme and result.netloc)
|
|
|
|
|
|
|
|
|
2024-06-13 21:08:58 +08:00
|
|
|
def extract_urls(text):
    """Collect the distinct http(s)/www URLs that appear in ``text``.

    Matches starting with ``www.`` are prefixed with ``https://``. The hash
    fragment is dropped from every URL; path, params and query are kept.

    :param text: {str} text to scan
    :return: {set} URL strings without their fragment
    """
    # Regular expression to match http, https, and www URLs
    url_pattern = re.compile(r'((?:https?://|www\.)[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[-A-Za-z0-9+&@#/%=~_|])')

    cleaned_urls = set()
    for url in url_pattern.findall(text):
        if url.startswith("www."):
            url = f"https://{url}"
        parsed_url = urlparse(url)
        if not parsed_url.netloc:
            continue
        # Remove the hash fragment; fall back to https if no scheme was parsed.
        parsed_url = parsed_url._replace(scheme=parsed_url.scheme or "https", fragment="")
        # geturl() re-inserts the ';' and '?' delimiters, which plain
        # concatenation of path + params + query would lose.
        cleaned_urls.add(parsed_url.geturl())
    return cleaned_urls
|
|
|
|
|
|
|
|
|
2024-04-07 09:37:47 +08:00
|
|
|
def isChinesePunctuation(char):
    """Tell whether ``char`` falls in the Chinese punctuation code point ranges.

    The ranges checked are U+3000-U+303F (CJK Symbols and Punctuation) and
    U+FF00-U+FFEF (Halfwidth and Fullwidth Forms).

    :param char: {str} a single character; ``ord`` raises TypeError otherwise
    :return: {bool} True if the code point is inside either range
    """
    code_point = ord(char)
    # Both bounds are inclusive so the last code point of each block
    # (U+303F, U+FFEF) is covered; comparing directly also avoids
    # rebuilding a set of every code point on each call.
    return 0x3000 <= code_point <= 0x303F or 0xFF00 <= code_point <= 0xFFEF
|
|
|
|
|
|
|
|
|
|
|
|
def is_chinese(string):
    """
    Decide whether a string is predominantly Chinese.

    :param string: {str} The string to be detected
    :return: {bool} Returns True if most are Chinese, False otherwise
             (an empty string is reported as False)
    """
    if not string:
        # Nothing to measure; also avoids dividing by zero below.
        return False
    pattern = re.compile(r'[^\u4e00-\u9fa5]')
    non_chinese_count = len(pattern.findall(string))
    # A strict "less than half" rule misjudges mixed text: English words and
    # punctuation take up many characters, so a looser 0.68 cut-off is used.
    return (non_chinese_count / len(string)) < 0.68
|
|
|
|
|
|
|
|
|
2024-04-09 11:38:51 +08:00
|
|
|
def extract_and_convert_dates(input_string):
    """Find the first date in ``input_string`` and return it as ``YYYY-MM-DD``.

    The supported layouts are tried in the order listed below; the first
    layout that occurs anywhere in the text wins, and its first occurrence
    is the one returned.

    :param input_string: text to search; anything that is not a str yields None
    :return: {str} year, month and day digits joined with '-', or None if no
             layout matches
    """
    if not isinstance(input_string, str):
        return None

    # Regular expressions matching the different date formats
    date_regexes = (
        r'(\d{4})-(\d{2})-(\d{2})',  # YYYY-MM-DD
        r'(\d{4})/(\d{2})/(\d{2})',  # YYYY/MM/DD
        r'(\d{4})\.(\d{2})\.(\d{2})',  # YYYY.MM.DD
        r'(\d{4})\\(\d{2})\\(\d{2})',  # YYYY\MM\DD
        r'(\d{4})(\d{2})(\d{2})',  # YYYYMMDD
    )

    for date_regex in date_regexes:
        hit = re.search(date_regex, input_string)
        if hit is not None:
            return '-'.join(hit.groups())
    return None
|
2024-04-29 23:06:17 +08:00
|
|
|
|
|
|
|
|
2024-12-05 12:11:28 +08:00
|
|
|
def get_logger(logger_name: str, logger_file_path: str):
    """Add a rotating file sink to the imported loguru ``logger`` and return it.

    The sink level is DEBUG when the ``VERBOSE`` environment variable is
    "true" or "1" (case-insensitive) and INFO otherwise.

    :param logger_name: base name of the log file (``<logger_name>.log``)
    :param logger_file_path: directory for the log file; created if missing
    :return: the same module-level ``logger`` object; each call adds one
             more sink to it
    """
    level = 'DEBUG' if os.environ.get("VERBOSE", "").lower() in ["true", "1"] else 'INFO'
    # exist_ok=True avoids the check-then-create race when several
    # processes start up and create the directory at the same time.
    os.makedirs(logger_file_path, exist_ok=True)
    logger_file = os.path.join(logger_file_path, f"{logger_name}.log")
    logger.add(logger_file, level=level, backtrace=True, diagnose=True, rotation="50 MB")
    return logger
|
|
|
|
|
|
|
|
"""
|
2024-06-13 21:08:58 +08:00
|
|
|
def compare_phrase_with_list(target_phrase, phrase_list, threshold):
|
2024-12-05 12:11:28 +08:00
|
|
|
|
2024-06-15 15:41:31 +08:00
|
|
|
Compare the similarity of a target phrase to each phrase in the phrase list.
|
2024-06-13 21:08:58 +08:00
|
|
|
|
2024-06-15 15:41:31 +08:00
|
|
|
:param target_phrase: target phrase (str)
|
|
|
|
:param phrase_list: list of phrases to compare against (list of str)
|
|
|
|
:param threshold: similarity threshold (float)
|
|
|
|
:return: list of phrases that satisfy the similarity condition (list of str)
|
2024-12-05 12:11:28 +08:00
|
|
|
|
2024-06-13 21:08:58 +08:00
|
|
|
if not target_phrase:
|
2024-06-15 15:41:31 +08:00
|
|
|
return [] # The target phrase is empty, and the empty list is returned directly.
|
2024-06-13 21:08:58 +08:00
|
|
|
|
2024-06-15 15:41:31 +08:00
|
|
|
# Preprocessing: Segmentation of the target phrase and each phrase in the phrase list
|
2024-06-13 21:08:58 +08:00
|
|
|
target_tokens = set(jieba.lcut(target_phrase))
|
|
|
|
tokenized_phrases = {phrase: set(jieba.lcut(phrase)) for phrase in phrase_list}
|
|
|
|
|
|
|
|
similar_phrases = [phrase for phrase, tokens in tokenized_phrases.items()
|
|
|
|
if len(target_tokens & tokens) / min(len(target_tokens), len(tokens)) > threshold]
|
|
|
|
|
|
|
|
return similar_phrases
|
2024-12-05 12:11:28 +08:00
|
|
|
"""
|