fix: errors

This commit is contained in:
bigbrother666sh 2024-12-08 21:30:39 +08:00
parent ec514b49dd
commit de549c6334
8 changed files with 216 additions and 1065 deletions

View File

@ -6,7 +6,7 @@
**我们缺的不是信息,而是从海量信息中过滤噪音,从而让有价值的信息显露出来**
## 🔥 V0.3.8版本预告
## 🔥 隆重介绍精准产品定位下的全新架构 V0.3.2版本
wiseflow 预计将在2024.12月底前正式升级到0.3.8版本,这也将是 V0.3.x 架构下的最终版本(除非有足够多的小修改,否则不会有 V0.3.9版本)

View File

@ -3,7 +3,7 @@ from llms.openai_wrapper import openai_llm as llm
from utils.general_utils import is_chinese, extract_and_convert_dates, extract_urls
from loguru import logger
from utils.pb_api import PbTalker
import os
import os, re
from datetime import datetime
from urllib.parse import urlparse
import json_repair
@ -56,7 +56,12 @@ class GeneralInfoExtractor:
如果网页文本中不包含任何与兴趣点相关的信息请仅输出[]'''
self.get_more_link_prompt = f"作为一位高效的信息筛选助手你的任务是根据给定的兴趣点从给定的文本及其对应的URL中挑选出最值得关注的URL。兴趣点及其解释如下\n\n{focus_statement}"
self.get_more_link_suffix = "请逐条分析对于每一条首先复制文本然后给出分析依据最后给出结论。如果决定挑选该条在结论后复制对应的url否则的话直接进入下一条的分析。请一条一条的分析不要漏掉任何一条。"
self.get_more_link_suffix = '''请逐条分析,先逐一给出分析依据,最终将挑选出的 url 按一行一条的格式输出,最终输出的 url 列表整体用三引号包裹,三引号内不要有其他内容,如下是输出格式示例:
"""
url1
url2
...
"""'''
else:
self.get_info_prompt = f'''As an information extraction assistant, your task is to extract content related to the following user focus points from the given web page text. The list of focus points and their explanations is as follows:
@ -76,7 +81,12 @@ Example:
If the webpage text does not contain any information related to points of interest, please output only: []'''
self.get_more_link_prompt = f"As an efficient information filtering assistant, your task is to select the most noteworthy URLs from a set of texts and their corresponding URLs based on the given focus points. The focus points and their explanations are as follows:\n\n{focus_statement}"
self.get_more_link_suffix = "Please analyze each item one by one: For each item, first copy the text, then provide the analysis basis, and finally give the conclusion. If the decision is to select the item, copy the corresponding URL after the conclusion; otherwise, proceed directly to the analysis of the next item. Analyze each item one by one, without missing any."
self.get_more_link_suffix = '''Please analyze one by one, first give the analysis basis one by one, and finally output the selected URLs in a row-by-row format. The final output URL list is wrapped in three quotes as a whole, and there should be no other content in the three quotes. Here is an example of the output format:
"""
url1
url2
...
"""'''
async def get_author_and_publish_date(self, text: str) -> tuple[str, str]:
if not text:
@ -107,10 +117,10 @@ If the webpage text does not contain any information related to points of intere
return result['source'], extract_and_convert_dates(result['publish_date'])
async def get_more_related_urls(self, link_dict: dict) -> set[str]:
async def get_more_related_urls(self, link_dict: dict, og_url: str) -> set[str]:
if not link_dict:
return set()
self.logger.debug(f'{len(link_dict)} items to analyze')
urls = set()
content = ''
for key, value in link_dict.items():
@ -118,24 +128,33 @@ If the webpage text does not contain any information related to points of intere
if len(content) > 512:
result = await llm([{'role': 'system', 'content': self.get_more_link_prompt},
{'role': 'user', 'content': f'{content}\n{self.get_more_link_suffix}'}],
model=self.secondary_model, temperature=0.1)
model=self.model, temperature=0.1)
self.logger.debug(f'get_more_related_urls llm output:\n{result}')
urls.update(extract_urls(result))
result = re.findall(r'"""(.*?)"""', result, re.DOTALL)
if result:
result = result[0].strip()
self.logger.debug(f"cleaned output: {result}")
urls.update(extract_urls(result))
content = ''
if content:
result = await llm([{'role': 'system', 'content': self.get_more_link_prompt},
{'role': 'user', 'content': f'{content}\n{self.get_more_link_suffix}'}],
model=self.secondary_model, temperature=0.1)
model=self.model, temperature=0.1)
self.logger.debug(f'get_more_related_urls llm output:\n{result}')
urls.update(extract_urls(result))
result = re.findall(r'"""(.*?)"""', result, re.DOTALL)
if result:
result = result[0].strip()
self.logger.debug(f"cleaned output: {result}")
urls.update(extract_urls(result))
raw_urls = list(link_dict.values())
for url in urls:
if url not in raw_urls:
self.logger.warning(f"{url} not in link_dict, it's model's Hallucination")
urls.remove(url)
return urls
raw_urls = set(link_dict.values())
urls.discard(og_url)
hallucination_urls = urls - raw_urls
if hallucination_urls:
self.logger.warning(f"{hallucination_urls} not in link_dict, it's model's Hallucination")
return urls & raw_urls
async def get_info(self, text: str, info_pre_fix: str, link_dict: dict) -> list[dict]:
if not text:
@ -153,7 +172,7 @@ If the webpage text does not contain any information related to points of intere
self.logger.warning("failed to parse from llm output")
return []
if not result:
self.logger.info("no info found")
self.logger.debug("no info found")
return []
system = '''判断给定的信息是否与网页文本相符。信息将用标签<info></info>包裹,网页文本则用<text></text>包裹。请遵循如下工作流程:
@ -209,7 +228,7 @@ If the webpage text does not contain any information related to points of intere
if not publish_date or publish_date.lower() == 'na':
publish_date = datetime.now().strftime('%Y-%m-%d')
related_urls = await self.get_more_related_urls(link_dict)
related_urls = await self.get_more_related_urls(link_dict, base_url)
info_prefix = f"//{author} {publish_date}//"
lines = text.split('\n')

View File

@ -1,41 +1,83 @@
# wiseflow 自定义解析器说明
## 概述
wiseflow 致力于通过一套通用流程(使用大模型驱动的可以自主使用爬虫工具的智能体)处理所有页面。
不过我们也为客户保留自定义处理的灵活性。您可以在这里添加并注册针对特定域名的自定义爬虫。
目前在页面获取方面我们使用流行的爬虫框架 Crawlee(playwright)进行统一管理,经过实测 Crawlee 在速度和兼容性方面都非常不错,且有着完善的任务队列管理模块,因此网页获取方面一般无需自定义。
请遵照如下规范:
对于页面信息的解析,wiseflow 默认使用大模型,但用户可以为特定域名配置自定义解析器。
1、爬虫应该是一个函数而不是类
## 自定义解析器配置说明
2、入参只接受两个**url**(要处理的 url请只提供一个 url而不是列表因为主函数会处理队列逻辑**logger** 对象这意味着不要为你的自定义爬虫添加日志对象wiseflow 会统一管理);
### 1. Scraper 函数定义
Scraper 应该是一个函数(而不是类)。
3、出参必须为两个一个是解析后的文章详情 article类型为 dict另一个是从页面解析出的外链字典 link_dict类型也是 dict。
### 2. 函数参数
该函数接收两个入参(wiseflow 框架传入):
- `html`:这是 wiseflow 通过 Crawlee 的 playwright_crawler 获取到的渲染后的页面 html 代码,类型为 `str`,scraper 可以直接使用 `bs` 和 `parsel` 等库进行解析;
- `url`:当前页面的 url 地址,类型为 `str`(仅是为了特殊操作,用不到的话可以直接忽略)。
article 的字典格式如下(注意,'content' 是必须的,其他也可以没有,另外额外的键值信息会被忽略):
### 3. 函数返回值
Scraper 出参限定为三个:
`{'url': ..., 'title': ..., 'author': ..., 'publish_date': ..., 'screenshot': ..., 'content': ...(not empty)}`
#### 3.1 `article`
解析出的页面详情,类型为 `dict`,格式如下(**注意,'content' 是必须的,其他可以没有,额外的键值信息会被忽略**):
上述值的类型都要求为 **str** 日期格式为 **YYYY-MM-DD**screenshot 为**文件路径**,可以是相对于 core目录的相对路径也可以是绝对路径文件类型为 **png**
```python
{
'url': ...,
'author': ...,
'publish_date': ...,
'screenshot': ...,
'content': ...(not empty)
}
```
- 上述值的类型都要求为 `str`,日期格式为 `YYYY-MM-DD`,screenshot 为**文件路径**,可以是相对于 core 目录的相对路径也可以是绝对路径,文件类型为 `png`。
**注意:**
- 'content' 要有且不为空,不然无法触发后续的提取,文章也会被舍弃。这是唯一要求不为空的项;
- 'author' 和 'publish_date' 尽量有,不然 wiseflow 会自动用域名对应 demain 和 当日日期代替。
1. `'content'` 要有且不为空,不然无法触发后续的提取,文章也会被舍弃。这是唯一要求不为空的项;
2. `'author'` 和 `'publish_date'` 尽量有,不然 wiseflow 会自动用域名(domain)和当日日期代替。
link_dict 的格式如下:
`{'text': 外链对应的文字信息, 'url': 外链对应的 url}`
#### 3.2 `links`
对应页面解析出的链接,类型可以是 `set`,也可以是 `dict`
- 如果是 `set`,则会全部被加入任务队列。
- 如果是 `dict`,则会调用 llm 从中挑取值得加入任务队列的 url(根据你的 focus point),`dict` 的格式如下:
```python
{
'text': 外链对应的文字信息,
'url': 外链对应的 url
}
```
wiseflow 会以这个为输入,使用 llm 判断值得继续爬取的链接。
4、在 core/custom_crawlers/__init__.py 中注册,参考:
#### 3.3 `infos`
对应页面抽取出的值得关注的信息列表,类型是 `list`,元素为 `dict`,格式为:
```pyhton
from .mp import mp_crawler
customer_crawler_map = {'mp.weixin.qq.com': mp_crawler}
```python
{
'tag': focuspoint 的 id,
'content': 具体 info 内容
}
```
注意键使用域名,可以使用 urllib.parse获取
**注意:focuspoint 的 id 要和 pb 中 focus_points 表一致**
```pyhton
### 4. 注册自定义解析器
在 `core/custom_scraper/__init__.py` 中注册,参考:
```python
from .mp import mp_scraper
custom_scraper_map = {'mp.weixin.qq.com': mp_scraper}
```
注意键使用域名,可以使用 `urllib.parse` 获取:
```python
from urllib.parse import urlparse
parsed_url = urlparse("site's url")

View File

@ -0,0 +1,85 @@
# wiseflow Custom Parser Instructions
## Overview
wiseflow is committed to processing all pages through a universal process (an intelligent agent driven by large models that can autonomously use web scraping tools).
Currently, we use the popular web scraping framework Crawlee (playwright) for unified management in page acquisition. After practical testing, Crawlee performs well in terms of speed and compatibility, and has a robust task queue management module, so customizations are generally unnecessary for web page acquisition.
For page information parsing, wiseflow uses large models by default, but users can configure custom parsers for specific domains.
## Custom Parser Configuration Instructions
### 1. Scraper Function Definition
The Scraper should be a function (not a class).
### 2. Function Parameters
The function receives two input parameters (passed by the wiseflow framework):
- `html`: This is the rendered page HTML code obtained by wiseflow through Crawlee's playwright_crawler, of type `str`. The scraper can directly use libraries like `bs` and `parsel` for parsing;
- `url`: The URL address of the current page, of type `str` (only for special operations, can be ignored if not needed).
### 3. Function Return Values
The Scraper output is limited to three:
#### 3.1 `article`
The parsed page details, of type `dict`, with the following format (**note that 'content' is mandatory, others can be omitted, and extra key-value information will be ignored**):
```python
{
'url': ...,
'author': ...,
'publish_date': ...,
'screenshot': ...,
'content': ...(not empty)
}
```
- The types of the above values are all required to be `str`, with the date format being `YYYY-MM-DD`, and the screenshot being a **file path**, which can be a relative path to the core directory or an absolute path, with the file type being `png`.
**Note:**
1. `'content'` must be present and not empty, otherwise subsequent extraction cannot be triggered, and the article will be discarded. This is the only non-empty requirement;
2. `'author'` and `'publish_date'` should be included if possible; otherwise wiseflow will automatically substitute the site's domain and the current date.
#### 3.2 `links`
The links parsed from the corresponding page, the type can be `set` or `dict`:
- If it is a `set`, all will be added to the task queue.
- If it is a `dict`, llm will be called to select URLs worth adding to the task queue (based on your focus point), with the format of the `dict` as follows:
```python
{
'text': text information corresponding to the external link,
'url': url corresponding to the external link
}
```
wiseflow will use this as input to determine the links worth continuing to crawl using llm.
#### 3.3 `infos`
The list of noteworthy information extracted from the corresponding page, of type `list`, with elements being `dict`, in the following format:
```python
{
'tag': id of the focuspoint,
'content': specific info content
}
```
**Note that the id of the focuspoint must match the focus_points table in pb**
### 4. Register Custom Parser
Register in `core/custom_scraper/__init__.py`, for reference:
```python
from .mp import mp_scraper
custom_scraper_map = {'mp.weixin.qq.com': mp_scraper}
```
Note that the key uses the domain name, which can be obtained using `urllib.parse`:
```python
from urllib.parse import urlparse
parsed_url = urlparse("site's url")
domain = parsed_url.netloc
```

View File

@ -3,18 +3,37 @@
from bs4 import BeautifulSoup
from datetime import datetime
import os, re
from utils.general_utils import get_logger
import logging
project_dir = os.environ.get("PROJECT_DIR", "")
if project_dir:
os.makedirs(project_dir, exist_ok=True)
mp_logger = get_logger('mp_scraper', project_dir)
log_formatter = logging.Formatter(fmt='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# create logger and set level to debug
logger = logging.getLogger('mp_scraper')
logger.handlers = []
logger.setLevel('DEBUG')
logger.propagate = False
# create file handler and set level to debug
file = os.path.join(project_dir, 'mp_scraper.log')
file_handler = logging.FileHandler(file, 'a', encoding='utf-8')
file_handler.setLevel('INFO')
file_handler.setFormatter(log_formatter)
logger.addHandler(file_handler)
# create console handler and set level to info
console_handler = logging.StreamHandler()
console_handler.setLevel('DEBUG')
console_handler.setFormatter(log_formatter)
logger.addHandler(console_handler)
async def mp_scraper(html: str, url: str) -> tuple[dict, set, list]:
if not url.startswith('https://mp.weixin.qq.com') and not url.startswith('http://mp.weixin.qq.com'):
mp_logger.warning(f'{url} is not a mp url, you should not use this function')
logger.warning(f'{url} is not a mp url, you should not use this function')
return {}, set(), []
url = url.replace("http://", "https://", 1)
@ -50,12 +69,12 @@ async def mp_scraper(html: str, url: str) -> tuple[dict, set, list]:
else soup.find('h1', class_='rich_media_title').text.strip()
profile_nickname = soup.find('div', class_='wx_follow_nickname').text.strip()
except Exception as e:
mp_logger.warning(f"not mp format: {url}\n{e}")
logger.warning(f"not mp format: {url}\n{e}")
# For mp.weixin.qq.com types, mp_crawler won't work, and most likely neither will the other two
return {}, set(), []
if not rich_media_title or not profile_nickname:
mp_logger.warning(f"failed to analysis {url}, no title or profile_nickname")
logger.warning(f"failed to analysis {url}, no title or profile_nickname")
return {}, set(), []
# Parse text and image links within the content interval
@ -72,7 +91,7 @@ async def mp_scraper(html: str, url: str) -> tuple[dict, set, list]:
cleaned_texts = [t for t in texts if t.strip()]
content = '\n'.join(cleaned_texts)
else:
mp_logger.warning(f"failed to analysis contents {url}")
logger.warning(f"failed to analysis contents {url}")
return {}, set(), []
if content:
content = f"[from {profile_nickname}]{content}"

View File

@ -10,7 +10,7 @@ from custom_scraper import custom_scraper_map
from urllib.parse import urlparse, urljoin
import hashlib
from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext
from datetime import datetime
from datetime import datetime, timedelta
project_dir = os.environ.get("PROJECT_DIR", "")
@ -55,6 +55,8 @@ async def save_to_pb(article: dict, infos: list):
crawler = PlaywrightCrawler(
# Limit the crawl to max requests. Remove or increase it for crawling all links.
# max_requests_per_crawl=1,
max_request_retries=2,
request_handler_timeout=timedelta(minutes=5),
headless=False if os.environ.get("VERBOSE", "").lower() in ["true", "1"] else True
)
@crawler.router.default_handler
@ -85,8 +87,8 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
wiseflow_logger.warning(f'{parsed_url} handled by customer scraper, bot got nothing')
return
title = article.get('title', "")
link_dict = more_urls if isinstance(more_urls, dict) else None
#title = article.get('title', "")
link_dict = more_urls if isinstance(more_urls, dict) else {}
related_urls = more_urls if isinstance(more_urls, set) else set()
if not infos and not related_urls:
text = article.pop('content') if 'content' in article else None
@ -127,7 +129,7 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
author = soup.find('div', class_='source').get_text(strip=True) if soup.find('div', class_='source') else None
# get infos by llm
infos, related_urls, author, publish_date = await gie(text, link_dict, base_url, author, publish_date)
title = await context.page.title()
# title = await context.page.title()
screenshot_file_name = f"{hashlib.sha256(context.request.url.encode()).hexdigest()}.png"
await context.page.screenshot(path=os.path.join(screenshot_dir, screenshot_file_name), full_page=True)
@ -136,7 +138,7 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
if infos:
article = {
'url': context.request.url,
'title': title,
# 'title': title,
'author': author,
'publish_date': publish_date,
'screenshot': os.path.join(screenshot_dir, screenshot_file_name),

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,4 @@
download https://pocketbase.io/docs/
download https://github.com/pocketbase/pocketbase/releases/download/v0.23.4/
```bash
cd pb