diff --git a/.dockerignore b/.dockerignore deleted file mode 100644 index abf0f8c..0000000 --- a/.dockerignore +++ /dev/null @@ -1,20 +0,0 @@ -.git -.vscode -.dockerignore -.gitignore -.env -config -build -web/dist -web/node_modules -docker-compose.yaml -Dockerfile -README.md -core/__pycache__ -core/work_dir -env_sample -core/pb/pb_data -core/pb/CHANGELOG.md -core/pb/LICENSE.md -core/pb/pocketbase -work_dir \ No newline at end of file diff --git a/.gitignore b/.gitignore index 07236d6..0ca3b9f 100644 --- a/.gitignore +++ b/.gitignore @@ -6,9 +6,7 @@ __pycache__ .env .venv/ -core/pb/pb_data/ -core/pb/CHANGELOG.md -core/pb/LICENSE.md -core/pb/pocketbase -core/work_dir/ -/work_dir \ No newline at end of file +pb/pb_data/ +pb/pocketbase +/work_dir/ +/docker_dir/ \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..49308d5 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,24 @@ +# V0.3.5 +- 引入 Crawlee(playwrigt模块),大幅提升通用爬取能力,适配实际项目场景; + + Introduce Crawlee (playwright module), significantly enhancing general crawling capabilities and adapting to real-world task; + +- 完全重写了信息提取模块,引入“爬-查一体”策略,你关注的才是你想要的; + + Completely rewrote the information extraction module, introducing an "integrated crawl-search" strategy, focusing on what you care about; + +- 新策略下放弃了 gne、jieba 等模块,去除了安装包; + + Under the new strategy, modules such as gne and jieba have been abandoned, reducing the installation package size; + +- 重写了 pocketbase 的表单结构; + + Rewrote the PocketBase form structure; + +- llm wrapper引入异步架构、自定义页面提取器规范优化(含 微信公众号文章提取优化); + + llm wrapper introduces asynchronous architecture, customized page extractor specifications optimization (including WeChat official account article extraction optimization); + +- 进一步简化部署操作步骤。 + + Further simplified deployment steps. diff --git a/Dockerfile b/Dockerfile index 114bc4b..6bc63dd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,23 +1,22 @@ FROM python:3.10-slim RUN apt-get update && \ - apt-get install -yq tzdata build-essential unzip && \ - apt-get clean + apt-get install -y tzdata build-essential unzip +COPY core/requirements.txt /tmp/requirements.txt +RUN pip install --no-cache-dir -r /tmp/requirements.txt +RUN playwright install +RUN playwright install-deps WORKDIR /app -COPY core/requirements.txt requirements.txt -RUN pip install --no-cache-dir -r requirements.txt - -COPY core . 
- # download and unzip PocketBase -ADD https://github.com/pocketbase/pocketbase/releases/download/v0.22.13/pocketbase_0.22.13_linux_amd64.zip /tmp/pb.zip +ADD https://github.com/pocketbase/pocketbase/releases/download/v0.23.4/pocketbase_0.23.4_linux_amd64.zip /tmp/pb.zip # for arm device -# ADD https://github.com/pocketbase/pocketbase/releases/download/v0.22.13/pocketbase_0.22.13_linux_arm64.zip /tmp/pb.zip -RUN unzip /tmp/pb.zip -d /app/pb/ +# ADD https://github.com/pocketbase/pocketbase/releases/download/v0.23.4/pocketbase_0.23.4_linux_arm64.zip /tmp/pb.zip +RUN unzip /tmp/pb.zip -d /pb/ +RUN apt-get clean && rm -rf /var/lib/apt/lists/* EXPOSE 8090 -EXPOSE 8077 +# EXPOSE 8077 CMD tail -f /dev/null \ No newline at end of file diff --git a/README.md b/README.md index 02f267a..885ab22 100644 --- a/README.md +++ b/README.md @@ -6,70 +6,38 @@ **我们缺的不是信息,而是从海量信息中过滤噪音,从而让有价值的信息显露出来** -## 🔥 V0.3.8版本预告 - -wiseflow 预计将在2024.12月底前正式升级到0.3.8版本,这也将是 V0.3.x 架构下的最终版本(除非有足够多的小修改,否则不会有 V0.3.9版本) - -计划中的升级内容包括: - -- 大幅升级 general_crawler(引入诸多最新开源技术方案), 进一步提升页面适配覆盖度以及实现完全的本地 CPU 计算(意味着无需再为此配置 LLM 选项); -- 改进general_crawler 从列表页面提取 url 的能力,以及列表页面与普通文章页面的区分能力; -- 尝试引入新的 mp_crawler, 公众号文章监控无需wxbot; -- 测试并推荐新的信息提取 llm model,并微调提取策略。 - -视情况可能添加的特性: - -- 引入对 RSS 信息源的支持; -- 引入对社交平台的支持(初期这一块会十分简陋,请不要太多期待) - -上述内容会逐步提前释放到 dev 分支,欢迎切换尝鲜,并积极反馈 issue。 - ------------------------------ - 🌱看看首席情报官是如何帮您节省时间,过滤无关信息,并整理关注要点的吧!🌱 -- ✅ 通用网页内容解析器,综合使用统计学习(依赖开源项目GNE)和LLM,适配90%以上的新闻页面; -- ✅ 异步任务架构; -- ✅ 使用LLM进行信息提取和标签分类(最低只需使用9B大小的LLM就可完美执行任务)! -https://github.com/TeamWiseFlow/wiseflow/assets/96130569/bd4b2091-c02d-4457-9ec6-c072d8ddfb16 -sample.png +## 🔥 隆重介绍 V0.3.5 版本 -## ✋ wiseflow 与常见的爬虫工具、AI搜索、知识库(RAG)项目有何不同? +在充分听取社区反馈意见基础之上,我们重新提炼了 wiseflow 的产品定位,新定位更加聚焦,V0.3.5版本即是该定位下的全新架构版本: -承蒙大家的厚爱,wiseflow自2024年6月底发布 V0.3.0版本来受到了开源社区的广泛关注,甚至吸引了不少自媒体的主动报道,在此首先表示感谢! +- 引入 [Crawlee](https://github.com/apify/crawlee-python) 作为基础爬虫和任务管理框架,大幅提升页面获取能力。实测之前获取不到(包括获取为乱码的)页面目前都可以很好的获取了,后续大家碰到不能很好获取的页面,欢迎在 [issue #136](https://github.com/TeamWiseFlow/wiseflow/issues/136) 中进行反馈; +- 新产品定位下全新的信息提取策略——“爬查一体”,放弃文章详细提取,爬取过程中即使用 llm 直接提取用户感兴趣的信息(infos),同时自动判断值得跟进爬取的链接,**你关注的才是你需要的**; +- 适配最新版本(v0.23.4)的 Pocketbase,同时更新表单配置。另外新架构已经无需 GNE 等模块,requirement 依赖项目降低到8个; +- 新架构部署方案也更加简便,docker 模式支持代码仓热更新,这意味着后续升级就无需再重复docker build了。 +- 更多细节,参考 [CHANGELOG](CHANGELOG.md) -但我们也注意到部分关注者对 wiseflow 的功能定位存在一些理解偏差,为免误会,我们制作了如下表格,清晰展示 wiseflow 与爬虫、AI搜索、知识库(RAG)类项目的对比: +🌟 **V0.3.x 后续计划** -| | **首席情报官(Wiseflow)** | +- 引入 [SeeAct](https://github.com/OSU-NLP-Group/SeeAct) 方案,通过视觉大模型指导复杂页面的操作,如滚动、点击后出现信息等情况(V0.3.6); +- 尝试支持微信公众号免wxbot订阅(V0.3.7); +- 引入对 RSS 信息源的支持(V0.3.8); +- 尝试引入 LLM 驱动的轻量级知识图谱,帮助用户从 infos 中建立洞察(V0.3.9)。 + +## ✋ wiseflow 与传统的爬虫工具、AI搜索、知识库(RAG)项目有何不同? + +wiseflow自2024年6月底发布 V0.3.0版本来受到了开源社区的广泛关注,甚至吸引了不少自媒体的主动报道,在此首先表示感谢! 
+ +但我们也注意到部分关注者对 wiseflow 的功能定位存在一些理解偏差,如下表格通过与传统爬虫工具、AI搜索、知识库(RAG)类项目的对比,代表了我们目前对于 wiseflow 产品最新定位思考。 + +| | 与 **首席情报官(Wiseflow)** 的比较说明| |-------------|-----------------| -| **爬虫类工具** | wiseflow 集成了很多优秀的开源爬虫工具,并增加了基于 LLM 的自动化信息过滤、筛选与分类能力,所以可以简单认为 wiseflow = 爬虫工具 + AI | -| **AI搜索** | AI搜索主要的应用场景是**具体问题的即时问答**,举例:”XX公司的创始人是谁“、“xx品牌下的xx产品哪里有售” ;wiseflow主要的应用场景是**某一方面信息的持续采集**,比如XX公司的关联信息追踪,XX品牌市场行为的持续追踪……在这些场景下,用户只能提供关注点(某公司、某品牌),但无法提出具体搜索问题,且并不是一次检索,而是需要持续追踪,或者自动化进行关联追踪,您可以简单的把wiseflow理解为一个可持续自动进行 ai 搜索的“智能体”,即 “AI 情报官” | -| **知识库(RAG)类项目** | 知识库(RAG)类项目一般是基于已有信息的下游任务,并且一般面向的是私有知识(比如企业内的操作手册、产品手册、政府部门的文件等);wiseflow 目前并未整合下游任务,同时面向的是互联网上的公开信息 | - -## 🔄 V0.3.1 更新 - -dashboard 部分已经删除,如果您有dashboard需求,请下载 [V0.2.1版本](https://github.com/TeamWiseFlow/wiseflow/releases/tag/V0.2.1) - -👏 虽然部分9b大小的LLM(THUDM/glm-4-9b-chat)已经可以实现稳定的信息提取输出,但是我们发现对于复杂含义的tag(比如“党建”)或者需要特指的tag(比如仅需采集“居民区活动”,而不希望包括诸如演唱会这样的大型活动信息), -使用目前的prompt还是不能进行准确的提取,因此我们在这一版本中为每个tag增加了explaination字段,可以通过输入该字段进行更加清晰的tag指定。 - - _注:复杂explaination需要更大规模的模型才能准确理解,具体见 [模型推荐 2024-09-03](###-4. 模型推荐 [2024-09-03])_ - -👏 另外针对上一版本prompt语言选择的问题(虽然这并不影响输出结果),我们在目前版本中进一步简化了方案,用户无需指定系统语言(这在docker中并不那么直观),系统会根据tag以及tag的explaination判断选择何种语言的 -prompt(也就决定了info的输出语言),这进一步简化了wiseflow的部署和使用。【不过目前wiseflow仅支持简体中文和英文两种语言,其他语言的需求可以通过更改 core/insights/get_info.py 中的prompt实现】 - -## 🌟 如何在您的应用中整合wiseflow - -wiseflow是一个原生的LLM应用,仅需7B~9B大小LLM就可以很好的执行信息挖掘、过滤与分类任务,且无需向量模型,系统开销很小,适合各种硬件环境下的本地化以及私有化部署。 - -wiseflow将挖掘出的信息存储于自带的Pocketbase数据库中,这意味着这种情况下您无需了解wiseflow的代码,只需要对数据库进行读取操作即可! - -PocketBase作为流行的轻量级数据库,目前已有 Go/Javascript/Python 等语言的SDK。 - - Go : https://pocketbase.io/docs/go-overview/ - - Javascript : https://pocketbase.io/docs/js-overview/ - - python : https://github.com/vaphes/pocketbase +| **爬虫类工具** | 首先 wiseflow 是基于爬虫工具的项目(以目前版本而言,我们基于爬虫框架 Crawlee),但传统的爬虫工具在信息提取方面需要人工的介入,提供明确的 Xpath 等信息……这不仅阻挡了普通用户,同时也毫无通用性可言,对于不同网站(包括已有网站升级后)都需要人工重做分析,并更新提取代码。wiseflow致力于使用 LLM 自动化网页的分析和提取工作,用户只要告诉程序他的关注点即可,从这个角度来说,可以简单理解 wiseflow 为 “能自动使用爬虫工具的 AI 智能体” | +| **AI搜索** | AI搜索主要的应用场景是**具体问题的即时问答**,举例:”XX公司的创始人是谁“、“xx品牌下的xx产品哪里有售” ,用户要的是**一个答案**;wiseflow主要的应用场景是**某一方面信息的持续采集**,比如XX公司的关联信息追踪,XX品牌市场行为的持续追踪……在这些场景下,用户能提供关注点(某公司、某品牌)、甚至能提供信源(站点 url 等),但无法提出具体搜索问题,用户要的是**一系列相关信息**| +| **知识库(RAG)类项目** | 知识库(RAG)类项目一般是基于已有信息的下游任务,并且一般面向的是私有知识(比如企业内的操作手册、产品手册、政府部门的文件等);wiseflow 目前并未整合下游任务,同时面向的是互联网上的公开信息,如果从“智能体”的角度来看,二者属于为不同目的而构建的智能体,RAG 类项目是“(内部)知识助理智能体”,而 wiseflow 则是“(外部)信息采集智能体”| ## 📥 安装与使用 @@ -79,121 +47,156 @@ PocketBase作为流行的轻量级数据库,目前已有 Go/Javascript/Python ```bash git clone https://github.com/TeamWiseFlow/wiseflow.git -cd wiseflow ``` -### 2. 推荐使用docker运行 +### 2. 
参考 env_sample 配置 .env 文件放置在 core 目录下 -**中国区用户使用前请合理配置网络,或者指定docker hub镜像** +🌟 **这里与之前版本不同**,V0.3.5开始需要把 .env 放置在 core文件夹中。 + +另外 V0.3.5 起,env 配置也大幅简化了,必须的配置项目只有三项,具体如下: + +- LLM_API_KEY="" + + 大模型服务key,这是必须的 + +- LLM_API_BASE="https://api.siliconflow.cn/v1" + + 服务接口地址,任何支持 openai sdk 的服务商都可以,如果直接使用openai 的服务,这一项也可以不填 + +- PB_API_AUTH="test@example.com|1234567890" + + pocketbase 数据库的 superuser 用户名和密码,记得用 | 分隔 + +下面的都是可选配置: +- #VERBOSE="true" + + 是否开启观测模式,开启的话,不仅会把 debug log信息记录在 logger 文件上(默认仅输出在 console 上),同时会开启 playwright 的浏览器窗口,方便观察抓取过程; + +- #PRIMARY_MODEL="Qwen/Qwen2.5-7B-Instruct" + + 主模型选择,在使用 siliconflow 服务的情况下,这一项不填就会默认调用Qwen2.5-7B-Instruct,实测基本也够用,但我更加**推荐 Qwen2.5-14B-Instruct** + +- #SECONDARY_MODEL="THUDM/glm-4-9b-chat" + + 副模型选择,在使用 siliconflow 服务的情况下,这一项不填就会默认调用glm-4-9b-chat。 + +- #PROJECT_DIR="work_dir" + + 项目运行数据目录,不配置的话,默认在 `core/work_dir` ,注意:目前整个 core 目录是挂载到 container 下的,所以意味着你可以直接访问这里。 + +- #PB_API_BASE="" + + 只有当你的 pocketbase 不运行在默认ip 或端口下才需要配置,默认情况下忽略就行。 + +### 3.1 使用docker运行 + +✋ V0.3.5版本架构和依赖与之前版本有较大不同,请务必重新拉取代码,删除旧版本镜像(包括外挂的 pb_data 文件夹),重新build! + +对于国内用户,可以先配置镜像源: + +最新可用 docker 镜像加速地址参考:[参考1](https://github.com/dongyubin/DockerHub) [参考2](https://www.coderjia.cn/archives/dba3f94c-a021-468a-8ac6-e840f85867ea) + +**三方镜像,风险自担。** + +之后 ```bash +cd wiseflow docker compose up ``` **注意:** - - 在wiseflow代码仓根目录下运行上述命令; - - 运行前先创建并编辑.env文件放置在Dockerfile同级目录(wiseflow代码仓根目录),.env文件可以参考env_sample - - 第一次运行docker container时会遇到报错,这其实是正常现象,因为你尚未为pb仓库创建admin账号。 - -此时请保持container不关闭状态,浏览器打开`http://127.0.0.1:8090/_/ `,按提示创建admin账号(一定要使用邮箱),然后将创建的admin邮箱(再次强调,一定要用邮箱)和密码填入.env文件,重启container即可。 -_如您想更改container的时区和语言,请仿照如下命令运行image_ - -```bash -docker run -e LANG=zh_CN.UTF-8 -e LC_CTYPE=zh_CN.UTF-8 your_image -``` - -### 2.【备选】直接使用python运行 +第一次运行docker container时程序可能会报错,这是正常现象,请按屏幕提示创建 super user 账号(一定要使用邮箱),然后将创建的用户名密码填入.env文件,重启container即可。 + +🌟 docker方案默认运行 task.py ,即会周期性执行爬取-提取任务(启动时会立即先执行一次,之后每隔一小时启动一次) + +### 3.2 使用python环境运行 + +✋ V0.3.5版本架构和依赖与之前版本有较大不同,请务必重新拉取代码,删除(或重建)pb_data + +推荐使用 conda 构建虚拟环境 ```bash +cd wiseflow conda create -n wiseflow python=3.10 conda activate wiseflow cd core pip install -r requirements.txt ``` -之后可以参考core/scripts 中的脚本分别启动pb、task和backend (将脚本文件移动到core目录下) - -**注意:** - - 一定要先启动pb,至于task和backend是独立进程,先后顺序无所谓,也可以按需求只启动其中一个; - - 需要先去这里 https://pocketbase.io/docs/ 下载对应自己设备的pocketbase客户端,并放置在 /core/pb 目录下 - - pb运行问题(包括首次运行报错等)参考 [core/pb/README.md](/core/pb/README.md) - - 使用前请创建并编辑.env文件,放置在wiseflow代码仓根目录(core目录的上级),.env文件可以参考env_sample,详细配置说明见下 +之后去这里 [下载](https://pocketbase.io/docs/) 对应的 pocketbase 客户端,放置到 [/pb](/pb) 目录下。然后 -📚 for developer, see [/core/README.md](/core/README.md) for more - -通过 pocketbase 访问获取的数据: - - http://127.0.0.1:8090/_/ - Admin dashboard UI - - http://127.0.0.1:8090/api/ - REST API - +```bash +chmod +x run.sh +./run_task.sh # if you just want to scan sites one-time (no loop), use ./run.sh +``` -### 3. 
配置 +这个脚本会自动判断 pocketbase 是否已经在运行,如果未运行,会自动拉起。但是请注意,当你 ctrl+c 或者 ctrl+z 终止进程时,pocketbase 进程不会被终止,直到你关闭terminal。 -复制目录下的env_sample,并改名为.env, 参考如下 填入你的配置信息(LLM服务token等) - -**windows用户如果选择直接运行python程序,可以直接在 “开始 - 设置 - 系统 - 关于 - 高级系统设置 - 环境变量“ 中设置如下项目,设置后需要重启终端生效** +另外与 docker 部署一样,第一次运行时可能会出现报错,请按屏幕提示创建 super user 账号(一定要使用邮箱),然后将创建的用户名密码填入.env文件,再次运行即可。 - - LLM_API_KEY # 大模型推理服务API KEY - - LLM_API_BASE # 本项目依赖openai sdk,只要模型服务支持openai接口,就可以通过配置该项正常使用,如使用openai服务,删除这一项即可 - - WS_LOG="verbose" # 设定是否开始debug观察,如无需要,删除即可 - - GET_INFO_MODEL # 信息提炼与标签匹配任务模型,默认为 gpt-4o-mini-2024-07-18 - - REWRITE_MODEL # 近似信息合并改写任务模型,默认为 gpt-4o-mini-2024-07-18 - - HTML_PARSE_MODEL # 网页解析模型(GNE算法效果不佳时智能启用),默认为 gpt-4o-mini-2024-07-18 - - PROJECT_DIR # 数据、缓存以及日志文件存储位置,相对于代码仓的相对路径,默认不填就在代码仓 - - PB_API_AUTH='email|password' # pb数据库admin的邮箱和密码(注意一定是邮箱,可以是虚构的邮箱) - - PB_API_BASE # 正常使用无需这一项,只有当你不使用默认的pocketbase本地接口(8090)时才需要 - - -### 4. 模型推荐 [2024-09-03] +当然你也可以在另一个 terminal 提前运行并设定 pocketbase(这会避免第一次的报错),具体可以参考 [pb/README.md](/pb/README.md) -经过反复测试(中英文任务)**GET_INFO_MODEL**、**REWRITE_MODEL**、**HTML_PARSE_MODEL** 三项最小可用模型分别为:**"THUDM/glm-4-9b-chat"**、**"Qwen/Qwen2-7B-Instruct"**、**"Qwen/Qwen2-7B-Instruct"** - -目前,SiliconFlow已经官宣Qwen2-7B-Instruct、glm-4-9b-chat在线推理服务免费,这意味着您可以“零成本”使用wiseflow啦! +### 4. 模型推荐 [2024-12-09] + +虽然参数量越大的模型意味着更佳的性能,但经过实测,**使用 Qwen2.5-7b-Instruct 和 glm-4-9b-chat 模型,即可以达到基本的效果**。不过综合考虑成本、速度和效果,我更加推荐主模型 +**(PRIMARY_MODEL)使用Qwen2.5-14B-Instruct**。 + +这里依然强烈推荐使用 siliconflow(硅基流动)的 MaaS 服务,提供多个主流开源模型的服务,量大管饱,Qwen2.5-7b-Instruct 和 glm-4-9b-chat 目前提供免费服务。(主模型使用Qwen2.5-14B-Instruct情况下,爬取374个网页,有效抽取43条 info,总耗费¥3.07) -😄 如果您愿意,可以使用我的[siliconflow邀请链接](https://cloud.siliconflow.cn?referrer=clx6wrtca00045766ahvexw92),这样我也可以获得更多token奖励 😄 +😄 如果您愿意,可以使用我的[siliconflow邀请链接](https://cloud.siliconflow.cn?referrer=clx6wrtca00045766ahvexw92),这样我也可以获得更多token奖励 🌹 -⚠️ **V0.3.1更新** - -如果您使用带explaination的复杂tag,那么glm-4-9b-chat规模的模型是无法保证准确理解的,目前测试下来针对该类型任务效果比较好的模型为 **Qwen/Qwen2-72B-Instruct** 和 **gpt-4o-mini-2024-07-18** 。 +**如果您的信源多为非中文页面,且也不要求提取出的 info 为中文,那么更推荐您使用 openai 或者 claude 等海外厂家的模型。** -针对有需求使用 `gpt-4o-mini-2024-07-18` 的用户,可以尝试第三方代理 **AiHubMix**,支持国内网络环境直连、支付宝充值(实际费率相当于官网86折) +您可以尝试第三方代理 **AiHubMix**,支持国内网络环境直连、支付宝便捷支付,免去封号风险; -🌹 欢迎使用如下邀请链接 [AiHubMix邀请链接](https://aihubmix.com?aff=Gp54) 注册 🌹 +😄 欢迎使用如下邀请链接 [AiHubMix邀请链接](https://aihubmix.com?aff=Gp54) 注册 🌹 -🌍 上述两个平台的在线推理服务均兼容openai SDK,配置`.env `的`LLM_API_BASE`和`LLM_API_KEY`后即可使用。 +🌟 **请注意 wiseflow 本身并不限定任何模型服务,只要服务兼容 openAI SDK 即可,包括本地部署的 ollama、Xinference 等服务** ### 5. **关注点和定时扫描信源添加** 启动程序后,打开pocketbase Admin dashboard UI (http://127.0.0.1:8090/_/) -#### 5.1 打开 tags表单 +#### 5.1 打开 focus_point 表单 通过这个表单可以指定你的关注点,LLM会按此提炼、过滤并分类信息。 -tags 字段说明: - - name, 关注点名称 - - explaination,关注点的详细解释或具体约定,如 “仅限上海市官方发布的初中升学信息”(tag name为 上海初中升学信息) - - activated, 是否激活。如果关闭则会忽略该关注点,关闭后可再次开启。开启和关闭无需重启docker容器,会在下一次定时任务时更新。 +字段说明: +- focuspoint, 关注点描述(必填),如”上海小升初信息“、”加密货币价格“ +- explanation,关注点的详细解释或具体约定,如 “仅限上海市官方发布的初中升学信息”、“BTC、ETH 的现价、涨跌幅数据“等 +- activated, 是否激活。如果关闭则会忽略该关注点,关闭后可再次开启。 + +注意:focus_point 更新设定(包括 activated 调整)后,**需要重启程序才会生效。** #### 5.2 打开 sites表单 通过这个表单可以指定自定义信源,系统会启动后台定时任务,在本地执行信源扫描、解析和分析。 sites 字段说明: - - url, 信源的url,信源无需给定具体文章页面,给文章列表页面即可。 - - per_hours, 扫描频率,单位为小时,类型为整数(1~24范围,我们建议扫描频次不要超过一天一次,即设定为24) - - activated, 是否激活。如果关闭则会忽略该信源,关闭后可再次开启。开启和关闭无需重启docker容器,会在下一次定时任务时更新。 +- url, 信源的url,信源无需给定具体文章页面,给文章列表页面即可。 +- per_hours, 扫描频率,单位为小时,类型为整数(1~24范围,我们建议扫描频次不要超过一天一次,即设定为24) +- activated, 是否激活。如果关闭则会忽略该信源,关闭后可再次开启。 + +**sites 的设定调整,无需重启程序。** -### 6. 
本地部署 +## 📚 如何在您自己的程序中使用 wiseflow 抓取出的数据 -如您所见,本项目最低仅需使用7b\9b大小的LLM,且无需任何向量模型,这就意味着仅仅需要一块3090RTX(24G显存)就可以完全的对本项目进行本地化部署。 - -请保证您的本地化部署LLM服务兼容openai SDK,并配置 LLM_API_BASE 即可。 +1、参考 [dashbord](dashboard) 部分源码二次开发。 -注:若需让7b~9b规模的LLM可以实现对tag explaination的准确理解,推荐使用dspy进行prompt优化,但这需要累积约50条人工标记数据。详见 [DSPy](https://dspy-docs.vercel.app/) +注意 wiseflow 的 core 部分并不需要 dashboard,目前产品也未集成 dashboard,如果您有dashboard需求,请下载 [V0.2.1版本](https://github.com/TeamWiseFlow/wiseflow/releases/tag/V0.2.1) +2、直接从 Pocketbase 中获取数据 + +wiseflow 所有抓取数据都会即时存入 pocketbase,因此您可以直接操作 pocketbase 数据库来获取数据。 + +PocketBase作为流行的轻量级数据库,目前已有 Go/Javascript/Python 等语言的SDK。 + - Go : https://pocketbase.io/docs/go-overview/ + - Javascript : https://pocketbase.io/docs/js-overview/ + - python : https://github.com/vaphes/pocketbase ## 🛡️ 许可协议 @@ -201,19 +204,22 @@ sites 字段说明: 商用以及定制合作,请联系 **Email:35252986@qq.com** - - 商用客户请联系我们报备登记,产品承诺永远免费。 +- 商用客户请联系我们报备登记,产品承诺永远免费。 ## 📬 联系方式 -有任何问题或建议,欢迎通过 [issue](https://github.com/TeamWiseFlow/wiseflow/issues) 与我们联系。 +有任何问题或建议,欢迎通过 [issue](https://github.com/TeamWiseFlow/wiseflow/issues) 留言。 ## 🤝 本项目基于如下优秀的开源项目: -- GeneralNewsExtractor ( General Extractor of News Web Page Body Based on Statistical Learning) https://github.com/GeneralNewsExtractor/GeneralNewsExtractor +- crawlee-python (A web scraping and browser automation library for Python to build reliable crawlers. Works with BeautifulSoup, Playwright, and raw HTTP. Both headful and headless mode. With proxy rotation.) https://github.com/apify/crawlee-python - json_repair(Repair invalid JSON documents ) https://github.com/josdejong/jsonrepair/tree/main - python-pocketbase (pocketBase client SDK for python) https://github.com/vaphes/pocketbase +- SeeAct(a system for generalist web agents that autonomously carry out tasks on any given website, with a focus on large multimodal models (LMMs) such as GPT-4Vision.) https://github.com/OSU-NLP-Group/SeeAct + +同时受 [GNE](https://github.com/GeneralNewsExtractor/GeneralNewsExtractor)、[AutoCrawler](https://github.com/kingname/AutoCrawler) 启发。 ## Citation @@ -223,4 +229,4 @@ sites 字段说明: Author:Wiseflow Team https://github.com/TeamWiseFlow/wiseflow Licensed under Apache2.0 -``` +``` \ No newline at end of file diff --git a/README_KR.md b/README_KR.md index fb7b881..e504349 100644 --- a/README_KR.md +++ b/README_KR.md @@ -117,7 +117,7 @@ pip install -r requirements.txt **주의:** - 반드시 pb를 먼저 시작해야 하며, task와 backend는 독립적인 프로세스이므로 순서는 상관없고, 필요에 따라 하나만 시작해도 됩니다. - 먼저 여기를 방문하여 https://pocketbase.io/docs/ 본인의 장치에 맞는 pocketbase 클라이언트를 다운로드하고 /core/pb 디렉토리에 배치해야 합니다. - - pb 실행 문제(처음 실행 시 오류 포함)에 대해서는 [core/pb/README.md](/core/pb/README.md)를 참조하십시오. + - pb 실행 문제(처음 실행 시 오류 포함)에 대해서는 [core/pb/README.md](/pb/README.md)를 참조하십시오. - 사용 전에 .env 파일을 생성하고 편집하여 wiseflow 코드 저장소의 루트 디렉토리(core 디렉토리의 상위)에 배치하십시오. .env 파일은 env_sample을 참고하고, 자세한 설정 설명은 아래를 참조하십시오. 📚 개발자를 위한 더 많은 정보는 [/core/README.md](/core/README.md)를 참조하십시오. 
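(Editor's supplement to the "如何在您自己的程序中使用 wiseflow 抓取出的数据" section of the README diff above: a minimal, hypothetical sketch of reading the `infos` collection with the community Python SDK that the README links to, https://github.com/vaphes/pocketbase. The local URL and the `test@example.com|1234567890` credentials simply mirror the PB_API_AUTH example earlier in this README; exact method names may differ between PocketBase/SDK versions, so treat this as an illustration rather than the project's API.)

```python
# minimal sketch, not shipped with wiseflow: read extracted infos from PocketBase
# assumes the default local instance and the superuser created on first run
from pocketbase import PocketBase  # pip install pocketbase

client = PocketBase("http://127.0.0.1:8090")
# superuser login; on newer PocketBase/SDK versions this call may be named differently
client.admins.auth_with_password("test@example.com", "1234567890")

records = client.collection("infos").get_full_list()
for rec in records:
    # each info record carries the extracted content, the focus point id (tag) and the source url
    print(rec.__dict__)
```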
diff --git a/asset/sample.png b/asset/sample.png deleted file mode 100644 index 5fc04ea..0000000 Binary files a/asset/sample.png and /dev/null differ diff --git a/compose.yaml b/compose.yaml index 5decf91..09033f6 100755 --- a/compose.yaml +++ b/compose.yaml @@ -5,12 +5,10 @@ services: image: wiseflow:latest tty: true stdin_open: true - entrypoint: bash docker_entrypoint.sh - env_file: - - .env + entrypoint: ["bash", "/app/docker_entrypoint.sh"] ports: - 8090:8090 - - 8077:8077 volumes: - - ./${PROJECT_DIR}/pb_data:/app/pb/pb_data - - ./${PROJECT_DIR}:/app/${PROJECT_DIR} \ No newline at end of file + - ./core:/app + - ./pb/pb_data:/pb/pb_data + - ./pb/pb_migrations:/pb/pb_migrations \ No newline at end of file diff --git a/core/README.md b/core/README.md deleted file mode 100644 index ae5aefc..0000000 --- a/core/README.md +++ /dev/null @@ -1,54 +0,0 @@ -# For Developer Only - -```bash -conda create -n wiseflow python=3.10 -conda activate wiseflow -cd core -pip install -r requirements.txt -``` - -- tasks.py background task circle process -- backend.py main process pipeline service (based on fastapi) - -### WiseFlow fastapi detail - -- api address http://127.0.0.1:8077/feed -- request method : post -- body : - -```python -{'user_id': str, 'type': str, 'content':str, 'addition': Optional[str]} -# Type is one of "text", "publicMsg", "site" and "url"; -# user_id: str -type: Literal["text", "publicMsg", "file", "image", "video", "location", "chathistory", "site", "attachment", "url"] -content: str -addition: Optional[str] = None -``` - -see more (when backend started) http://127.0.0.1:8077/docs - -### WiseFlow Repo File Structure - -``` -wiseflow -|- dockerfiles -|- ... -|- core - |- tasks.py - |- backend.py - |- insights - |- __init__.py # main process - |- get_info.py # module use llm to get a summary of information and match tags - |- llms # llm service wrapper - |- pb # pocketbase filefolder - |- scrapers - |- __init__.py # You can register a proprietary site scraper here - |- general_scraper.py # module to get all possible article urls for general site - |- general_crawler.py # module for general article sites - |- mp_crawler.py # module for mp article (weixin public account) sites - |- utils # tools -``` - -Although the general_scraper included in wiseflow can be applied to the parsing of most static pages, for actual business, we still recommend that customers to write their own crawlers aiming the actual info source. 
- -See core/scrapers/README.md for integration instructions for proprietary crawlers diff --git a/core/agents/__init__.py b/core/agents/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/core/agents/get_info.py b/core/agents/get_info.py new file mode 100644 index 0000000..5922bc4 --- /dev/null +++ b/core/agents/get_info.py @@ -0,0 +1,250 @@ +from llms.openai_wrapper import openai_llm as llm +# from core.llms.siliconflow_wrapper import sfa_llm +from utils.general_utils import is_chinese, extract_and_convert_dates, extract_urls +from loguru import logger +from utils.pb_api import PbTalker +import os, re +from datetime import datetime +from urllib.parse import urlparse +import json_repair + + +class GeneralInfoExtractor: + def __init__(self, pb: PbTalker, _logger: logger) -> None: + self.pb = pb + self.logger = _logger + self.model = os.environ.get("PRIMARY_MODEL", "Qwen/Qwen2.5-7B-Instruct") # better to use "Qwen/Qwen2.5-14B-Instruct" + self.secondary_model = os.environ.get("SECONDARY_MODEL", "THUDM/glm-4-9b-chat") + + # collect tags user set in pb database and determin the system prompt language based on tags + focus_data = pb.read(collection_name='focus_points', filter=f'activated=True') + if not focus_data: + self.logger.info('no activated tag found, will ask user to create one') + focus = input('It seems you have not set any focus point, WiseFlow need the specific focus point to guide the following info extract job.' + 'so please input one now. describe what info you care about shortly: ') + explanation = input('Please provide more explanation for the focus point (if not necessary, pls just type enter: ') + focus_data.append({"focuspoint": focus, "explanation": explanation, + "id": pb.add('focus_points', {"focuspoint": focus, "explanation": explanation})}) + + # self.focus_list = [item["focuspoint"] for item in focus_data] + self.focus_dict = {item["focuspoint"]: item["id"] for item in focus_data} + focus_statement = '' + for item in focus_data: + tag = item["focuspoint"] + expl = item["explanation"] + focus_statement = f"{focus_statement}#{tag}\n" + if expl: + focus_statement = f"{focus_statement}解释:{expl}\n" + + if is_chinese(focus_statement): + self.get_info_prompt = f'''作为信息提取助手,你的任务是从给定的网页文本中提取与以下用户兴趣点相关的内容。兴趣点列表及其解释如下: + +{focus_statement}\n +在进行信息提取时,请遵循以下原则: + +- 理解每个兴趣点的含义,确保提取的内容与之相关。 +- 如果兴趣点有进一步的解释,确保提取的内容符合这些解释的范围。 +- 忠于原文,你的任务是从网页文本中识别和提取与各个兴趣点相关的信息,并不是总结和提炼。 +- 不管给定的原文是何种语言,请保证使用中文输出你的提取结果。 + +另外请注意给定的网页文本是通过爬虫程序从html代码中提取出来的,所以请忽略里面不必要的空格、换行符等。''' + self.get_info_suffix = '''如果上述网页文本中包含兴趣点相关的内容,请按照以下json格式输出提取的信息(文本中可能包含多条有用信息,请不要遗漏): +[{"focus": 兴趣点名称, "content": 提取的内容}] + +示例: +[{"focus": "旅游景点", "content": "北京故宫,地址:北京市东城区景山前街4号,开放时间:8:30-17:00"}, {"focus": "美食推荐", "content": "来王府井小吃街必吃北京烤鸭、炸酱面"}] + +如果网页文本中不包含任何与兴趣点相关的信息,请仅输出:[]。''' + self.get_more_link_prompt = f"作为一位高效的信息筛选助手,你的任务是根据给定的兴趣点,从给定的文本及其对应的URL中挑选出最值得关注的URL。兴趣点及其解释如下:\n\n{focus_statement}" + self.get_more_link_suffix = '''请逐条分析,先逐一给出分析依据,最终将挑选出的 url 按一行一条的格式输出,最终输出的 url 列表整体用三引号包裹,三引号内不要有其他内容,如下是输出格式示例: +""" +url1 +url2 +... +"""''' + else: + self.get_info_prompt = f'''As an information extraction assistant, your task is to extract content related to the following user focus points from the given web page text. The list of focus points and their explanations is as follows: + +{focus_statement}\n +When extracting information, please follow the principles below: + +- Understand the meaning of each focus point and ensure that the extracted content is relevant to it. 
+- If a focus point has further explanations, ensure that the extracted content conforms to the scope of these explanations. +- Stay true to the original text; your task is to identify and extract information related to each focus point from the web page text, not to summarize or refine it. + +Please note that the given web page text is extracted from HTML code via a crawler, so please ignore any unnecessary spaces, line breaks, etc.''' + self.get_info_suffix = '''If the above webpage text contains content related to points of interest, please output the extracted information in the following JSON format (the text may contain multiple useful pieces of information, do not miss any): +[{"focus": "Point of Interest Name", "content": "Extracted Content"}] + +Example: +[{"focus": "Tourist Attraction", "content": "The Forbidden City, Beijing, Address: No. 4 Jingshan Front Street, Dongcheng District, Opening Hours: 8:30-17:00"}, {"focus": "Food Recommendation", "content": "Must-try at Wangfujing Snack Street: Beijing Roast Duck, Noodles with Soybean Paste"}] + +If the webpage text does not contain any information related to points of interest, please output only: []''' + self.get_more_link_prompt = f"As an efficient information filtering assistant, your task is to select the most noteworthy URLs from a set of texts and their corresponding URLs based on the given focus points. The focus points and their explanations are as follows:\n\n{focus_statement}" + self.get_more_link_suffix = '''Please analyze one by one, first give the analysis basis one by one, and finally output the selected URLs in a row-by-row format. The final output URL list is wrapped in three quotes as a whole, and there should be no other content in the three quotes. Here is an example of the output format: +""" +url1 +url2 +... +"""''' + + async def get_author_and_publish_date(self, text: str) -> tuple[str, str]: + if not text: + return "NA", "NA" + + if len(text) > 1024: + text = f'{text[:500]}......{text[-500:]}' + + system_prompt = "As an information extraction assistant, your task is to accurately extract the source (or author) and publication date from the given webpage text. It is important to adhere to extracting the information directly from the original text. 
If the original text does not contain a particular piece of information, please replace it with NA" + suffix = '''Please output the extracted information in the following JSON format: +{"source": source or article author (use "NA" if this information cannot be extracted), "publish_date": extracted publication date (keep only the year, month, and day; use "NA" if this information cannot be extracted)}''' + + content = f'\n{text}\n\n\n{suffix}' + llm_output = await llm([{'role': 'system', 'content': system_prompt}, {'role': 'user', 'content': content}], + model=self.secondary_model, max_tokens=50, temperature=0.1, response_format={"type": "json_object"}) + + self.logger.debug(f'get_author_and_publish_date llm output:\n{llm_output}') + if not llm_output: + return '', '' + result = json_repair.repair_json(llm_output, return_objects=True) + self.logger.debug(f"decoded_object: {result}") + if not isinstance(result, dict): + self.logger.warning("failed to parse from llm output") + return '', '' + if 'source' not in result or 'publish_date' not in result: + self.logger.warning("failed to parse from llm output") + return '', '' + + return result['source'], extract_and_convert_dates(result['publish_date']) + + async def get_more_related_urls(self, link_dict: dict, og_url: str) -> set[str]: + if not link_dict: + return set() + self.logger.debug(f'{len(link_dict)} items to analyze') + urls = set() + content = '' + for key, value in link_dict.items(): + content = f"{content}{key}: {value}\n" + if len(content) > 512: + result = await llm([{'role': 'system', 'content': self.get_more_link_prompt}, + {'role': 'user', 'content': f'{content}\n{self.get_more_link_suffix}'}], + model=self.model, temperature=0.1) + self.logger.debug(f'get_more_related_urls llm output:\n{result}') + result = re.findall(r'"""(.*?)"""', result, re.DOTALL) + if result: + result = result[0].strip() + # self.logger.debug(f"cleaned output: {result}") + urls.update(extract_urls(result)) + content = '' + + if content: + result = await llm([{'role': 'system', 'content': self.get_more_link_prompt}, + {'role': 'user', 'content': f'{content}\n{self.get_more_link_suffix}'}], + model=self.model, temperature=0.1) + self.logger.debug(f'get_more_related_urls llm output:\n{result}') + result = re.findall(r'"""(.*?)"""', result, re.DOTALL) + if result: + result = result[0].strip() + # self.logger.debug(f"cleaned output: {result}") + urls.update(extract_urls(result)) + + raw_urls = set(link_dict.values()) + urls.discard(og_url) + hallucination_urls = urls - raw_urls + if hallucination_urls: + self.logger.warning(f"{hallucination_urls} not in link_dict, it's model's Hallucination") + + return urls & raw_urls + + async def get_info(self, text: str, info_pre_fix: str, link_dict: dict) -> list[dict]: + if not text: + return [] + + content = f'\n{text}\n\n\n{self.get_info_suffix}' + result = await llm([{'role': 'system', 'content': self.get_info_prompt}, {'role': 'user', 'content': content}], + model=self.model, temperature=0.1, response_format={"type": "json_object"}) + self.logger.debug(f'get_info llm output:\n{result}') + if not result: + return [] + + result = json_repair.repair_json(result, return_objects=True) + if not isinstance(result, list): + self.logger.warning("failed to parse from llm output") + return [] + if not result: + self.logger.debug("no info found") + return [] + + system = '''判断给定的信息是否与网页文本相符。信息将用标签包裹,网页文本则用包裹。请遵循如下工作流程: +1、尝试找出网页文本中所有与信息对应的文本片段(可能有多处); +2、基于这些片段给出是否相符的最终结论,最终结论仅为“是”或“否”''' + suffix = '先输出找到的所有文本片段,再输出最终结论(仅为是或否)' 
+ + final = [] + for item in result: + if 'focus' not in item or 'content' not in item: + self.logger.warning(f"not quality item: {item}, it's model's Hallucination") + continue + if item['focus'] not in self.focus_dict: + self.logger.warning(f"{item['focus']} not in focus_list, it's model's Hallucination") + continue + if not item['content']: + continue + + if item['content'] in link_dict: + self.logger.debug(f"{item['content']} in link_dict, aborting") + continue + + judge = await llm([{'role': 'system', 'content': system}, + {'role': 'user', 'content': f'\n{item["content"]}\n\n\n\n{text}\n\n\n{suffix}'}], + model=self.secondary_model, temperature=0.1) + self.logger.debug(f'judge llm output:\n{judge}') + if not judge: + self.logger.warning("failed to parse from llm output, skip checking") + final.append({'tag': self.focus_dict[item['focus']], 'content': f"{info_pre_fix}{item['content']}"}) + continue + + to_save = False + for i in range(min(7, len(judge))): + char = judge[-1 - i] + if char == '是': + to_save = True + break + elif char == '否': + break + if not to_save: + self.logger.info(f"secondary model judge {item} not faithful to article text, aborting") + continue + final.append({'tag': self.focus_dict[item['focus']], 'content': f"{info_pre_fix}{item['content']}"}) + + if not final: + self.logger.info("no quality result from llm output") + return final + + async def __call__(self, text: str, link_dict: dict, base_url: str, author: str = None, publish_date: str = None) -> tuple[list, set, str, str]: + if not author and not publish_date and text: + author, publish_date = await self.get_author_and_publish_date(text) + + if not author or author.lower() == 'na': + author = urlparse(base_url).netloc + + if not publish_date or publish_date.lower() == 'na': + publish_date = datetime.now().strftime('%Y-%m-%d') + + related_urls = await self.get_more_related_urls(link_dict, base_url) + + info_prefix = f"//{author} {publish_date}//" + lines = text.split('\n') + text = '' + infos = [] + for line in lines: + text = f'{text}{line}' + if len(text) > 2048: + cache = await self.get_info(text, info_prefix, link_dict) + infos.extend(cache) + text = '' + if text: + cache = await self.get_info(text, info_prefix, link_dict) + infos.extend(cache) + + return infos, related_urls, author, publish_date diff --git a/core/agents/insights.py b/core/agents/insights.py new file mode 100644 index 0000000..e69de29 diff --git a/core/agents/seeact.py b/core/agents/seeact.py new file mode 100644 index 0000000..6ef168a --- /dev/null +++ b/core/agents/seeact.py @@ -0,0 +1,5 @@ +# future plan +# inspired by https://github.com/OSU-NLP-Group/SeeAct +# use a visual-llm to extract the main content and determine next action + +# input a playwright page object \ No newline at end of file diff --git a/core/custom_scraper/README.md b/core/custom_scraper/README.md new file mode 100644 index 0000000..e16ff2c --- /dev/null +++ b/core/custom_scraper/README.md @@ -0,0 +1,83 @@ +# wiseflow 自定义解析器说明 + +## 概述 +wiseflow 致力于通过一套通用流程(使用大模型驱动的可以自主使用爬虫工具的智能体)处理所有页面。 + +目前在页面获取方面我们使用流行的爬虫框架 Crawlee(playwright)进行统一管理,经过实测 Crawlee 在速度和兼容性方面都非常不错,且有着完善的任务队列管理模块,因此网页获取方面一般无需自定义。 + +对于页面信息的解析,wiseflow 默认使用大模型,但用户可以为特定域名配置自定义解析器。 + +## 自定义解析器配置说明 + +### 1. Scraper 函数定义 +Scraper 应该是一个函数(而不是类)。 + +### 2. 函数参数 +该函数接收两个入参(wiseflow 框架传入): +- `html`:这是 wiseflow 通过 Crawlee 的 playwright_crawler 获取到的渲染后的页面 html 代码,类型为 `str`,scraper 可以直接使用 `bs` `parsel`等库进行解析; +- `url`:当前页面的 url 地址,类型为 `str`(仅是为了特殊操作,用不到的话可以直接忽略)。 + +### 3. 
函数返回值 +Scraper 出参限定为三个: + +#### 3.1 `article` +解析出的页面详情,类型为 `dict`,格式如下: + +```python +{ + 'author': ..., + 'publish_date': ..., + 'content': ... +} +``` + +- 上述值的类型都要求为 `str`,日期格式为 `YYYY-MM-DD`。 + +**注意:** +1. `'content'` 要有且不为空,不然无法触发后续的提取; +2. `'author'` 和 `'publish_date'` 尽量有,不然 wiseflow 会自动用域名对应 demain 和 当日日期代替。 + +#### 3.2 `links` +对应页面解析出的链接,类型可以是 `set`,也可以是 `dict`: + +- 如果是 `set`,则会全部被加入任务队列。 +- 如果是 `dict`,则会调用 llm 从中挑取值得加入任务队列的 url(根据你的 focus point),`dict` 的格式如下: + +```python +{ + 'text': 外链对应的文字信息, + 'url': 外链对应的 url +} +``` + +wiseflow 会以这个为输入,使用 llm 判断值得继续爬取的链接。 + +#### 3.3 `infos` +对应页面抽取出的值得关注的信息列表,类型是 `list`,元素为 `dict`,格式为: + +```python +{ + 'tag': focuspoint 的 id, + 'content': 具体 info 内容 +} +``` + +**注意,focuspoint 的 id 要和 pb 中 focus_points 表一致** + +### 4. 注册自定义解析器 +在 `core/custom_scraper/__init__.py` 中注册,参考: + +```python +from .mp import mp_scarper + +customer_crawler_map = {'mp.weixin.qq.com': mp_scarper} +``` + +注意键使用域名,可以使用 `urllib.parse` 获取: + +```python +from urllib.parse import urlparse + +parsed_url = urlparse("site's url") +domain = parsed_url.netloc +``` \ No newline at end of file diff --git a/core/custom_scraper/README_EN.md b/core/custom_scraper/README_EN.md new file mode 100644 index 0000000..a8e3559 --- /dev/null +++ b/core/custom_scraper/README_EN.md @@ -0,0 +1,83 @@ +# wiseflow Custom Parser Instructions + +## Overview +wiseflow is committed to processing all pages through a universal process (an intelligent agent driven by large models that can autonomously use web scraping tools). + +Currently, we use the popular web scraping framework Crawlee (playwright) for unified management in page acquisition. After practical testing, Crawlee performs well in terms of speed and compatibility, and has a robust task queue management module, so customizations are generally unnecessary for web page acquisition. + +For page information parsing, wiseflow uses large models by default, but users can configure custom parsers for specific domains. + +## Custom Parser Configuration Instructions + +### 1. Scraper Function Definition +The Scraper should be a function (not a class). + +### 2. Function Parameters +The function receives two input parameters (passed by the wiseflow framework): +- `html`: This is the rendered page HTML code obtained by wiseflow through Crawlee's playwright_crawler, of type `str`. The scraper can directly use libraries like `bs` and `parsel` for parsing; +- `url`: The URL address of the current page, of type `str` (only for special operations, can be ignored if not needed). + +### 3. Function Return Values +The Scraper output is limited to three: + +#### 3.1 `article` +The parsed page details, of type `dict`, with the following format: + +```python +{ + 'author': ..., + 'publish_date': ..., + 'content': ... +} +``` + +- The types of the above values are all required to be `str`, with the date format being `YYYY-MM-DD`, and the screenshot being a **file path**, which can be a relative path to the core directory or an absolute path, with the file type being `png`. + +**Note:** +1. `'content'` must be present and not empty, otherwise subsequent extraction cannot be triggered; +2. `'author'` and `'publish_date'` should be included if possible, otherwise wiseflow will automatically use the domain corresponding to the demain and the current date. + +#### 3.2 `links` +The links parsed from the corresponding page, the type can be `set` or `dict`: + +- If it is a `set`, all will be added to the task queue. 
+- If it is a `dict`, llm will be called to select URLs worth adding to the task queue (based on your focus point), with the format of the `dict` as follows: + +```python +{ + 'text': text information corresponding to the external link, + 'url': url corresponding to the external link +} +``` + +wiseflow will use this as input to determine the links worth continuing to crawl using llm. + +#### 3.3 `infos` +The list of noteworthy information extracted from the corresponding page, of type `list`, with elements being `dict`, in the following format: + +```python +{ + 'tag': id of the focuspoint, + 'content': specific info content +} +``` + +**Note that the id of the focuspoint must match the focus_points table in pb** + +### 4. Register Custom Parser +Register in `core/custom_scraper/__init__.py`, for reference: + +```python +from .mp import mp_scarper + +customer_crawler_map = {'mp.weixin.qq.com': mp_scarper} +``` + +Note that the key uses the domain name, which can be obtained using `urllib.parse`: + +```python +from urllib.parse import urlparse + +parsed_url = urlparse("site's url") +domain = parsed_url.netloc +``` \ No newline at end of file diff --git a/core/custom_scraper/__init__.py b/core/custom_scraper/__init__.py new file mode 100644 index 0000000..ded2693 --- /dev/null +++ b/core/custom_scraper/__init__.py @@ -0,0 +1,3 @@ +from .mp import mp_scraper + +custom_scraper_map = {'mp.weixin.qq.com': mp_scraper} diff --git a/core/custom_scraper/mp.py b/core/custom_scraper/mp.py new file mode 100644 index 0000000..21f6e8d --- /dev/null +++ b/core/custom_scraper/mp.py @@ -0,0 +1,107 @@ +# -*- coding: utf-8 -*- + +from bs4 import BeautifulSoup +from datetime import datetime +import os, re +import logging + + +project_dir = os.environ.get("PROJECT_DIR", "") +if project_dir: + os.makedirs(project_dir, exist_ok=True) + +log_formatter = logging.Formatter(fmt='%(asctime)s - %(name)s - %(levelname)s - %(message)s') + +# create logger and set level to debug +logger = logging.getLogger('mp_scraper') +logger.handlers = [] +logger.setLevel('DEBUG') +logger.propagate = False + +# create file handler and set level to debug +file = os.path.join(project_dir, 'mp_scraper.log') +file_handler = logging.FileHandler(file, 'a', encoding='utf-8') +file_handler.setLevel('INFO') +file_handler.setFormatter(log_formatter) +logger.addHandler(file_handler) + +# create console handler and set level to info +console_handler = logging.StreamHandler() +console_handler.setLevel('DEBUG') +console_handler.setFormatter(log_formatter) +logger.addHandler(console_handler) + +async def mp_scraper(html: str, url: str) -> tuple[dict, set, list]: + if not url.startswith('https://mp.weixin.qq.com') and not url.startswith('http://mp.weixin.qq.com'): + logger.warning(f'{url} is not a mp url, you should not use this function') + return {}, set(), [] + + url = url.replace("http://", "https://", 1) + soup = BeautifulSoup(html, 'html.parser') + + if url.startswith('https://mp.weixin.qq.com/mp/appmsgalbum'): + # 文章目录 + urls = {li.attrs['data-link'].replace("http://", "https://", 1) for li in soup.find_all('li', class_='album__list-item')} + simple_urls = set() + for url in urls: + cut_off_point = url.find('chksm=') + if cut_off_point != -1: + url = url[:cut_off_point - 1] + simple_urls.add(url) + return {}, simple_urls, [] + + # Get the original release date first + pattern = r"var createTime = '(\d{4}-\d{2}-\d{2}) \d{2}:\d{2}'" + match = re.search(pattern, html) + if match: + publish_time = match.group(1) + else: + publish_time = 
datetime.strftime(datetime.today(), "%Y-%m-%d") + + # Get description content from < meta > tag + try: + meta_description = soup.find('meta', attrs={'name': 'description'}) + summary = meta_description['content'].strip() if meta_description else '' + # card_info = soup.find('div', id='img-content') + # Parse the required content from the < div > tag + rich_media_title = soup.find('h1', id='activity-name').text.strip() \ + if soup.find('h1', id='activity-name') \ + else soup.find('h1', class_='rich_media_title').text.strip() + profile_nickname = soup.find('div', class_='wx_follow_nickname').text.strip() + except Exception as e: + logger.warning(f"not mp format: {url}\n{e}") + # For mp.weixin.qq.com types, mp_crawler won't work, and most likely neither will the other two + return {}, set(), [] + + if not rich_media_title or not profile_nickname: + logger.warning(f"failed to analysis {url}, no title or profile_nickname") + return {}, set(), [] + + # Parse text and image links within the content interval + # because the structure of this part is completely different, and a separate analysis scheme needs to be written + # (but the proportion of this type of article is not high). + texts = [] + content_area = soup.find('div', id='js_content') + if content_area: + # 提取文本 + for section in content_area.find_all(['section', 'p'], recursive=False): # 遍历顶级section + text = section.get_text(separator=' ', strip=True) + if text and text not in texts: + texts.append(text) + cleaned_texts = [t for t in texts if t.strip()] + content = '\n'.join(cleaned_texts) + else: + logger.warning(f"failed to analysis contents {url}") + return {}, set(), [] + if content: + content = f"[from {profile_nickname}]{content}" + else: + # If the content does not have it, but the summary has it, it means that it is a mp of the picture sharing type. + # At this time, you can use the summary as the content. + content = f"[from {profile_nickname}]{summary}" + + article = {'author': profile_nickname, + 'publish_date': publish_time, + 'content': content} + + return article, set(), [] diff --git a/core/docker_entrypoint.sh b/core/docker_entrypoint.sh index 8a0d57d..5ff5654 100755 --- a/core/docker_entrypoint.sh +++ b/core/docker_entrypoint.sh @@ -1,4 +1,23 @@ #!/bin/bash -exec pb/pocketbase serve --http=0.0.0.0:8090 & -exec python tasks.py & -exec uvicorn backend:app --reload --host 0.0.0.0 --port 8077 \ No newline at end of file + +set -o allexport +source .env +set +o allexport + +# 启动 PocketBase +/pb/pocketbase serve --http=127.0.0.1:8090 & +pocketbase_pid=$! + +# 启动 Python 任务 +python tasks.py & +python_pid=$! + +# 启动 Uvicorn +# uvicorn backend:app --reload --host 0.0.0.0 --port 8077 & +# uvicorn_pid=$! 
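(Editor's supplement to the custom scraper documentation and the mp.weixin.qq.com example above: a hypothetical minimal scraper showing the `(html, url) -> (article, links, infos)` contract described in core/custom_scraper/README.md. The domain `news.example.com` and the `article-body` selector are invented for illustration and would have to be adapted to a real site.)

```python
# hypothetical custom scraper sketch (not part of the repo)
from urllib.parse import urljoin
from bs4 import BeautifulSoup


async def example_scraper(html: str, url: str) -> tuple[dict, set, list]:
    soup = BeautifulSoup(html, 'html.parser')
    body = soup.find('div', class_='article-body')  # made-up selector
    if not body:
        # empty article / links / infos: wiseflow falls back to its default handling
        return {}, set(), []

    article = {
        'author': '',        # leave empty and wiseflow substitutes the domain
        'publish_date': '',  # YYYY-MM-DD if the page exposes it
        'content': body.get_text(separator='\n', strip=True)
    }
    # a dict of {anchor text: absolute url} lets the LLM pick which links to follow;
    # returning a plain set would queue every url unconditionally
    links = {a.get_text(strip=True): urljoin(url, a['href'])
             for a in soup.find_all('a', href=True) if a.get_text(strip=True)}
    # no pre-extracted infos here; the LLM extractor runs on article['content']
    return article, links, []

# registration would go into core/custom_scraper/__init__.py, e.g.:
# custom_scraper_map = {'news.example.com': example_scraper}
```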
+ +# 定义信号处理函数 +trap 'kill $pocketbase_pid $python_pid' SIGINT SIGTERM + +# 等待所有进程结束 +wait \ No newline at end of file diff --git a/core/general_process.py b/core/general_process.py new file mode 100644 index 0000000..da6931a --- /dev/null +++ b/core/general_process.py @@ -0,0 +1,148 @@ +# -*- coding: utf-8 -*- +from utils.pb_api import PbTalker +from utils.general_utils import get_logger, extract_and_convert_dates +from agents.get_info import GeneralInfoExtractor +from bs4 import BeautifulSoup +import os +import json +import asyncio +from custom_scraper import custom_scraper_map +from urllib.parse import urlparse, urljoin +import hashlib +from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavigationContext +from datetime import datetime, timedelta + + +project_dir = os.environ.get("PROJECT_DIR", "") +if project_dir: + os.makedirs(project_dir, exist_ok=True) + +os.environ['CRAWLEE_STORAGE_DIR'] = os.path.join(project_dir, 'crawlee_storage') +screenshot_dir = os.path.join(project_dir, 'crawlee_storage', 'screenshots') +wiseflow_logger = get_logger('general_process', project_dir) +pb = PbTalker(wiseflow_logger) +gie = GeneralInfoExtractor(pb, wiseflow_logger) +existing_urls = {url['url'] for url in pb.read(collection_name='infos', fields=['url'])} + + +async def save_to_pb(url: str, infos: list): + # saving to pb process + for info in infos: + info['url'] = url + _ = pb.add(collection_name='infos', body=info) + if not _: + wiseflow_logger.error('add info failed, writing to cache_file') + timestamp = datetime.now().strftime("%Y%m%d%H%M%S") + with open(os.path.join(project_dir, f'{timestamp}_cache_infos.json'), 'w', encoding='utf-8') as f: + json.dump(info, f, ensure_ascii=False, indent=4) + + +crawler = PlaywrightCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. 
+ # max_requests_per_crawl=1, + max_request_retries=2, + request_handler_timeout=timedelta(minutes=5), + headless=False if os.environ.get("VERBOSE", "").lower() in ["true", "1"] else True +) + +@crawler.pre_navigation_hook +async def log_navigation_url(context: PlaywrightPreNavigationContext) -> None: + context.log.info(f'Navigating to {context.request.url} ...') + +@crawler.router.default_handler +async def request_handler(context: PlaywrightCrawlingContext) -> None: + # context.log.info(f'Processing {context.request.url} ...') + # Handle dialogs (alerts, confirms, prompts) + async def handle_dialog(dialog): + context.log.info(f'Closing dialog: {dialog.message}') + await dialog.accept() + + context.page.on('dialog', handle_dialog) + await context.page.wait_for_load_state('networkidle') + html = await context.page.inner_html('body') + context.log.info('successfully finish fetching') + + parsed_url = urlparse(context.request.url) + domain = parsed_url.netloc + if domain in custom_scraper_map: + context.log.info(f'routed to customer scraper for {domain}') + try: + article, more_urls, infos = await custom_scraper_map[domain](html, context.request.url) + if not article and not infos and not more_urls: + wiseflow_logger.warning(f'{parsed_url} handled by customer scraper, bot got nothing') + except Exception as e: + context.log.error(f'error occurred: {e}') + wiseflow_logger.warning(f'handle {parsed_url} failed by customer scraper, so no info can be found') + article, infos, more_urls = {}, [], set() + + link_dict = more_urls if isinstance(more_urls, dict) else {} + related_urls = more_urls if isinstance(more_urls, set) else set() + if not infos and not related_urls: + try: + text = article.get('content', '') + except Exception as e: + wiseflow_logger.warning(f'customer scraper output article is not valid dict: {e}') + text = '' + + if not text: + wiseflow_logger.warning(f'no content found in {parsed_url} by customer scraper, cannot use llm GIE, aborting') + infos, related_urls = [], set() + else: + author = article.get('author', '') + publish_date = article.get('publish_date', '') + # get infos by llm + try: + infos, related_urls, author, publish_date = await gie(text, link_dict, context.request.url, author, publish_date) + except Exception as e: + wiseflow_logger.error(f'gie error occurred in processing: {e}') + infos, related_urls = [], set() + else: + # Extract data from the page. + # future work: try to use a visual-llm do all the job... 
+ text = await context.page.inner_text('body') + soup = BeautifulSoup(html, 'html.parser') + links = soup.find_all('a', href=True) + base_url = f"{parsed_url.scheme}://{domain}" + link_dict = {} + for a in links: + new_url = a.get('href') + if new_url.startswith('javascript:') or new_url.startswith('#') or new_url.startswith('mailto:'): + continue + if new_url in [context.request.url, base_url]: + continue + if new_url in existing_urls: + continue + t = a.text.strip() + if new_url and t: + link_dict[t] = urljoin(base_url, new_url) + existing_urls.add(new_url) + + publish_date = soup.find('div', class_='date').get_text(strip=True) if soup.find('div', class_='date') else None + if publish_date: + publish_date = extract_and_convert_dates(publish_date) + author = soup.find('div', class_='author').get_text(strip=True) if soup.find('div', class_='author') else None + if not author: + author = soup.find('div', class_='source').get_text(strip=True) if soup.find('div', class_='source') else None + # get infos by llm + infos, related_urls, author, publish_date = await gie(text, link_dict, base_url, author, publish_date) + + if infos: + await save_to_pb(context.request.url, infos) + + if related_urls: + await context.add_requests(list(related_urls)) + + # todo: use llm to determine next action + """ + screenshot_file_name = f"{hashlib.sha256(context.request.url.encode()).hexdigest()}.png" + await context.page.screenshot(path=os.path.join(screenshot_dir, screenshot_file_name), full_page=True) + wiseflow_logger.debug(f'screenshot saved to {screenshot_file_name}') + """ + +if __name__ == '__main__': + sites = pb.read('sites', filter='activated=True') + wiseflow_logger.info('execute all sites one time') + async def run_all_sites(): + await crawler.run([site['url'].rstrip('/') for site in sites]) + + asyncio.run(run_all_sites()) diff --git a/core/insights/__init__.py b/core/insights/__init__.py deleted file mode 100644 index 9da29ff..0000000 --- a/core/insights/__init__.py +++ /dev/null @@ -1,162 +0,0 @@ -# -*- coding: utf-8 -*- - -from scrapers.general_crawler import general_crawler -from utils.general_utils import extract_urls, compare_phrase_with_list -from .get_info import get_info, pb, project_dir, logger, info_rewrite -import os -import json -from datetime import datetime, timedelta -import re -import asyncio -from typing import Dict - - -# The XML parsing scheme is not used because there are abnormal characters in the XML code extracted from the weixin public_msg -item_pattern = re.compile(r'(.*?)', re.DOTALL) -url_pattern = re.compile(r'') -summary_pattern = re.compile(r'', re.DOTALL) -extensions = ('.pdf', '.docx', '.xlsx', '.doc', '.ppt', '.pptx', '.xls', '.txt', '.jpg', '.jpeg', '.png', '.gif', '.bmp', - '.tiff', '.mp4', '.avi', '.wmv', '.mkv', '.flv', '.wav', '.mp3', '.avi', '.mov', '.wmv', '.mpeg', '.mpg', - '.3gp', '.ogg', '.webm', '.m4a', '.aac', '.flac', '.wma', '.amr', '.ogg', '.m4v', '.m3u8', '.m3u', '.ts', - '.mts') -expiration_days = 3 -existing_urls = {url['url'] for url in pb.read(collection_name='articles', fields=['url']) if url['url']} - - -async def pipeline(url: str, cache: Dict[str, str] = {}): - working_list = {url} - while working_list: - url = working_list.pop() - existing_urls.add(url) - if any(url.endswith(ext) for ext in extensions): - logger.info(f"{url} is a file, skip") - continue - logger.debug(f"start processing {url}") - - # get article process - flag, result = await general_crawler(url, logger) - if flag == 1: - logger.info('get new url list, add to work list') - 
new_urls = result - existing_urls - working_list.update(new_urls) - continue - elif flag <= 0: - logger.error("got article failed, pipeline abort") - continue - - expiration = datetime.now() - timedelta(days=expiration_days) - expiration_date = expiration.strftime('%Y-%m-%d') - article_date = int(result['publish_time']) - if article_date < int(expiration_date.replace('-', '')): - logger.info(f"publish date is {article_date}, too old, skip") - continue - - for k, v in cache.items(): - if v: - result[k] = v - - # get info process - logger.debug(f"article: {result['title']}") - article_id = pb.add(collection_name='articles', body=result) - if not article_id: - logger.error('add article failed, writing to cache_file') - with open(os.path.join(project_dir, 'cache_articles.json'), 'a', encoding='utf-8') as f: - json.dump(result, f, ensure_ascii=False, indent=4) - continue - - insights = get_info(f"title: {result['title']}\n\ncontent: {result['content']}") - if not insights: - continue - - # post process - article_tags = set() - old_insights = pb.read(collection_name='insights', filter=f"updated>'{expiration_date}'", - fields=['id', 'tag', 'content', 'articles']) - for insight in insights: - article_tags.add(insight['tag']) - insight['articles'] = [article_id] - old_insight_dict = {i['content']: i for i in old_insights if i['tag'] == insight['tag']} - - # the result wanted is whether the extracted information phrases are talking about the same thing, - # it may not be suitable and too heavy to calculate the similarity with a vector model - # Therefore, a simplified solution is used here, directly using jieba to calculate whether the overlap between the two phrases exceeds. - similar_insights = compare_phrase_with_list(insight['content'], list(old_insight_dict.keys()), 0.65) - if similar_insights: - to_rewrite = similar_insights + [insight['content']] - new_info_content = info_rewrite(to_rewrite) - if not new_info_content: - continue - insight['content'] = new_info_content - # Merge related articles and delete old insights - for old_insight in similar_insights: - insight['articles'].extend(old_insight_dict[old_insight]['articles']) - if not pb.delete(collection_name='insights', id=old_insight_dict[old_insight]['id']): - logger.error('delete insight failed') - old_insights.remove(old_insight_dict[old_insight]) - - insight['id'] = pb.add(collection_name='insights', body=insight) - if not insight['id']: - logger.error('add insight failed, writing to cache_file') - with open(os.path.join(project_dir, 'cache_insights.json'), 'a', encoding='utf-8') as f: - json.dump(insight, f, ensure_ascii=False, indent=4) - - _ = pb.update(collection_name='articles', id=article_id, body={'tag': list(article_tags)}) - if not _: - logger.error(f'update article failed - article_id: {article_id}') - result['tag'] = list(article_tags) - with open(os.path.join(project_dir, 'cache_articles.json'), 'a', encoding='utf-8') as f: - json.dump(result, f, ensure_ascii=False, indent=4) - - -async def message_manager(_input: dict): - source = _input['user_id'] - logger.debug(f"received new task, user: {source}, Addition info: {_input['addition']}") - if _input['type'] == 'publicMsg': - items = item_pattern.findall(_input["content"]) - # Iterate through all < item > content, extracting < url > and < summary > - for item in items: - url_match = url_pattern.search(item) - url = url_match.group(1) if url_match else None - if not url: - logger.warning(f"can not find url in \n{item}") - continue - # URL processing, http is replaced by 
https, and the part after chksm is removed. - url = url.replace('http://', 'https://') - cut_off_point = url.find('chksm=') - if cut_off_point != -1: - url = url[:cut_off_point-1] - if url in existing_urls: - logger.debug(f"{url} has been crawled, skip") - continue - summary_match = summary_pattern.search(item) - summary = summary_match.group(1) if summary_match else None - cache = {'source': source, 'abstract': summary} - await pipeline(url, cache) - - elif _input['type'] == 'text': - urls = extract_urls(_input['content']) - if not urls: - logger.debug(f"can not find any url in\n{_input['content']}\npass...") - # todo get info from text process - return - await asyncio.gather(*[pipeline(url) for url in urls if url not in existing_urls]) - - elif _input['type'] == 'url': - # this is remained for wechat shared mp_article_card - item = re.search(r'(.*?)&chksm=', _input["content"], re.DOTALL) - if not item: - logger.debug("shareUrlOpen not find") - item = re.search(r'(.*?)&chksm=', _input["content"], re.DOTALL) - if not item: - logger.debug("shareUrlOriginal not find") - item = re.search(r'(.*?)&chksm=', _input["content"], re.DOTALL) - if not item: - logger.warning(f"cannot find url in \n{_input['content']}") - return - extract_url = item.group(1).replace('amp;', '') - summary_match = re.search(r'(.*?)', _input["content"], re.DOTALL) - summary = summary_match.group(1) if summary_match else None - cache = {'source': source, 'abstract': summary} - await pipeline(extract_url, cache) - else: - return diff --git a/core/insights/get_info.py b/core/insights/get_info.py deleted file mode 100644 index e70b3aa..0000000 --- a/core/insights/get_info.py +++ /dev/null @@ -1,151 +0,0 @@ -from llms.openai_wrapper import openai_llm -# from llms.siliconflow_wrapper import sfa_llm -import re -from utils.general_utils import get_logger_level, is_chinese -from loguru import logger -from utils.pb_api import PbTalker -import os - - -get_info_model = os.environ.get("GET_INFO_MODEL", "gpt-4o-mini-2024-07-18") -rewrite_model = os.environ.get("REWRITE_MODEL", "gpt-4o-mini-2024-07-18") - -project_dir = os.environ.get("PROJECT_DIR", "") -if project_dir: - os.makedirs(project_dir, exist_ok=True) -logger_file = os.path.join(project_dir, 'wiseflow.log') -dsw_log = get_logger_level() -logger.add( - logger_file, - level=dsw_log, - backtrace=True, - diagnose=True, - rotation="50 MB" -) - -pb = PbTalker(logger) - -focus_data = pb.read(collection_name='tags', filter=f'activated=True') -if not focus_data: - logger.error('no activated tag found, please set at least one') - exit(1) - -focus_list = [item["name"] for item in focus_data if item["name"]] -focus_dict = {item["name"]: item["id"] for item in focus_data if item["name"]} -lang_term = ''.join([f'{item["name"]}{item["explaination"]}' for item in focus_data if item["name"]]) -focus_statement = '\n'.join([f'{item["name"]}{item["explaination"]}' for item in focus_data if item["name"] and item["explaination"]]) - -if is_chinese(lang_term): - if focus_statement: - system_prompt = f'''请仔细阅读用户输入的新闻内容,并根据所提供的类型标签列表进行分析。类型标签列表如下: -{focus_list} - -各标签的含义如下: -{focus_statement} - -如果新闻中包含上述任何类型的信息,请使用以下格式标记信息的类型标签,并提供仅包含时间、地点、人物和事件的一句话信息摘要: -类型名称仅包含时间、地点、人物和事件的一句话信息摘要 - -务必注意:1、严格忠于新闻原文,不得提供原文中不包含的信息;2、对于同一事件,仅选择一个最贴合的标签,不要重复输出;3、如果新闻中包含多个信息,请逐一分析并按一条一行的格式输出,如果新闻不涉及任何类型的信息,则直接输出:无。''' - else: - system_prompt = f'''请仔细阅读用户输入的新闻内容,并根据所提供的类型标签列表进行分析。类型标签列表如下: -{focus_list} - -如果新闻中包含上述任何类型的信息,请使用以下格式标记信息的类型标签,并提供仅包含时间、地点、人物和事件的一句话信息摘要: -类型名称仅包含时间、地点、人物和事件的一句话信息摘要 - 
-务必注意:1、严格忠于新闻原文,不得提供原文中不包含的信息;2、对于同一事件,仅选择一个最贴合的标签,不要重复输出;3、如果新闻中包含多个信息,请逐一分析并按一条一行的格式输出,如果新闻不涉及任何类型的信息,则直接输出:无。''' - - rewrite_prompt = '''请综合给到的内容,提炼总结为一个新闻摘要。给到的内容会用XML标签分隔。请仅输出总结出的摘要,不要输出其他的信息。''' - -else: - if focus_statement: - system_prompt = f'''Please carefully read the news content provided by the user and analyze it according to the list of type labels given below: -{focus_list} - -The meanings of each label are as follows: -{focus_statement} - -If the news contains any information of the aforementioned types, please mark the type label of the information using the following format and provide a one-sentence summary containing only the time, location, people involved, and event: -TypeLabelA one-sentence summary containing only the time, location, people involved, and event - -Please be sure to: 1. Strictly adhere to the original text and do not provide information not contained in the original; 2. For the same event, choose only one most appropriate label and do not repeat the output; 3. If the news contains multiple pieces of information, analyze them one by one and output them in a one-line-per-item format. If the news does not involve any of the types of information, simply output: None.''' - else: - system_prompt = f'''Please carefully read the news content provided by the user and analyze it according to the list of type labels given below: -{focus_list} - -If the news contains any information of the aforementioned types, please mark the type label of the information using the following format and provide a one-sentence summary containing only the time, location, people involved, and event: -TypeLabelA one-sentence summary containing only the time, location, people involved, and event - -Please be sure to: 1. Strictly adhere to the original text and do not provide information not contained in the original; 2. For the same event, choose only one most appropriate label and do not repeat the output; 3. If the news contains multiple pieces of information, analyze them one by one and output them in a one-line-per-item format. If the news does not involve any of the types of information, simply output: None.''' - - rewrite_prompt = "Please synthesize the content provided, which will be segmented by XML tags, into a news summary. Output only the summarized abstract without including any additional information." 
- - -def get_info(article_content: str) -> list[dict]: - # logger.debug(f'receive new article_content:\n{article_content}') - result = openai_llm([{'role': 'system', 'content': system_prompt}, {'role': 'user', 'content': article_content}], - model=get_info_model, logger=logger, temperature=0.1) - - # results = pattern.findall(result) - texts = result.split('') - texts = [_.strip() for _ in texts if '' in _.strip()] - if not texts: - logger.debug(f'can not find info, llm result:\n{result}') - return [] - - cache = [] - for text in texts: - try: - strings = text.split('') - tag = strings[0] - tag = tag.strip() - if tag not in focus_list: - logger.info(f'tag not in focus_list: {tag}, aborting') - continue - info = strings[1] - info = info.split('\n\n') - info = info[0].strip() - except Exception as e: - logger.info(f'parse error: {e}') - tag = '' - info = '' - - if not info or not tag: - logger.info(f'parse failed-{text}') - continue - - if len(info) < 7: - logger.info(f'info too short, possible invalid: {info}') - continue - - if info.startswith('无相关信息') or info.startswith('该新闻未提及') or info.startswith('未提及'): - logger.info(f'no relevant info: {text}') - continue - - while info.endswith('"'): - info = info[:-1] - info = info.strip() - - # 拼接下来源信息 - sources = re.findall(r'\[from (.*?)]', article_content) - if sources and sources[0]: - info = f"[from {sources[0]}] {info}" - - cache.append({'content': info, 'tag': focus_dict[tag]}) - - return cache - - -def info_rewrite(contents: list[str]) -> str: - context = f"{''.join(contents)}" - try: - result = openai_llm([{'role': 'system', 'content': rewrite_prompt}, {'role': 'user', 'content': context}], - model=rewrite_model, temperature=0.1, logger=logger) - return result.strip() - except Exception as e: - if logger: - logger.warning(f'rewrite process llm generate failed: {e}') - else: - print(f'rewrite process llm generate failed: {e}') - return '' diff --git a/core/llms/openai_wrapper.py b/core/llms/openai_wrapper.py index ceb7a4e..0457d28 100644 --- a/core/llms/openai_wrapper.py +++ b/core/llms/openai_wrapper.py @@ -1,7 +1,7 @@ import os from openai import OpenAI from openai import RateLimitError -import time +import asyncio base_url = os.environ.get('LLM_API_BASE', "") @@ -10,34 +10,36 @@ token = os.environ.get('LLM_API_KEY', "") if not base_url and not token: raise ValueError("LLM_API_BASE or LLM_API_KEY must be set") elif base_url and not token: - client = OpenAI(base_url=base_url) + client = OpenAI(base_url=base_url, api_key="not_use") elif not base_url and token: client = OpenAI(api_key=token) else: client = OpenAI(api_key=token, base_url=base_url) +llm_lock = asyncio.Lock() -def openai_llm(messages: list, model: str, logger=None, **kwargs) -> str: +async def openai_llm(messages: list, model: str, logger=None, **kwargs) -> str: if logger: logger.debug(f'messages:\n {messages}') logger.debug(f'model: {model}') logger.debug(f'kwargs:\n {kwargs}') - try: - response = client.chat.completions.create(messages=messages, model=model, **kwargs) - except RateLimitError as e: - logger.warning(f'{e}\nRetrying in 60 second...') - time.sleep(60) - response = client.chat.completions.create(messages=messages, model=model, **kwargs) - if response and response.choices: - return response.choices[0].message.content - else: - logger.error(f'after many try, llm error: {response}') - return "" - except Exception as e: - if logger: - logger.error(f'openai_llm error: {e}') - return '' + async with llm_lock: + try: + response = 
client.chat.completions.create(messages=messages, model=model, **kwargs) + except RateLimitError as e: + logger.warning(f'{e}\nRetrying in 60 seconds...') + await asyncio.sleep(60) + response = client.chat.completions.create(messages=messages, model=model, **kwargs) + if response and response.choices: + return response.choices[0].message.content + else: + logger.error(f'llm error after retry: {response}') + return "" + except Exception as e: + if logger: + logger.error(f'openai_llm error: {e}') + return '' if logger: logger.debug(f'result:\n {response.choices[0]}') diff --git a/core/pb/CHANGELOG.md b/core/pb/CHANGELOG.md deleted file mode 100644 index ab2136a..0000000 --- a/core/pb/CHANGELOG.md +++ /dev/null @@ -1,1016 +0,0 @@ -## v0.22.12 - -- Fixed calendar picker grid layout misalignment on Firefox ([#4865](https://github.com/pocketbase/pocketbase/issues/4865)). - -- Updated Go deps and bumped the min Go version in the GitHub release action to Go 1.22.3 since it comes with [some minor security fixes](https://github.com/golang/go/issues?q=milestone%3AGo1.22.3). - - -## v0.22.11 - -- Load the full record in the relation picker edit panel ([#4857](https://github.com/pocketbase/pocketbase/issues/4857)). - - -## v0.22.10 - -- Updated the uploaded filename normalization to take double extensions in consideration ([#4824](https://github.com/pocketbase/pocketbase/issues/4824)) - -- Added Collection models cache to help speed up the common List and View requests execution with ~25%. - _This was extracted from the ongoing work on [#4355](https://github.com/pocketbase/pocketbase/discussions/4355) and there are many other small optimizations already implemented but they will have to wait for the refactoring to be finalized._ - - -## v0.22.9 - -- Fixed Admin UI OAuth2 "Clear all fields" btn action to properly unset all form fields ([#4737](https://github.com/pocketbase/pocketbase/issues/4737)). - - -## v0.22.8 - -- Fixed '~' auto wildcard wrapping when the param has escaped `%` character ([#4704](https://github.com/pocketbase/pocketbase/discussions/4704)). - -- Other minor UI improvements (added `aria-expanded=true/false` to the dropdown triggers, added contrasting border around the default mail template btn style, etc.). - -- Updated Go deps and bumped the min Go version in the GitHub release action to Go 1.22.2 since it comes with [some `net/http` security and bug fixes](https://github.com/golang/go/issues?q=milestone%3AGo1.22.2). - - -## v0.22.7 - -- Replaced the default `s3blob` driver with a trimmed vendored version to reduce the binary size with ~10MB. - _It can be further reduced with another ~10MB once we replace entirely the `aws-sdk-go-v2` dependency but I stumbled on some edge cases related to the headers signing and for now is on hold._ - -- Other minor improvements (updated GitLab OAuth2 provider logo [#4650](https://github.com/pocketbase/pocketbase/pull/4650), normalized error messages, updated npm dependencies, etc.) - - -## v0.22.6 - -- Admin UI accessibility improvements: - - Fixed the dropdowns tab/enter/space keyboard navigation ([#4607](https://github.com/pocketbase/pocketbase/issues/4607)). - - Added `role`, `aria-label`, `aria-hidden` attributes to some of the elements in attempt to better assist screen readers. - - -## v0.22.5 - -- Minor test helpers fixes ([#4600](https://github.com/pocketbase/pocketbase/issues/4600)): - - Call the `OnTerminate` hook on `TestApp.Cleanup()`.
- - Automatically run the DB migrations on initializing the test app with `tests.NewTestApp()`. - -- Added more elaborate warning message when restoring a backup explaining how the operation works. - -- Skip irregular files (symbolic links, sockets, etc.) when restoring a backup zip from the Admin UI or calling `archive.Extract(src, dst)` because they come with too many edge cases and ambiguities. -
- More details - - This was initially reported as security issue (_thanks Harvey Spec_) but in the PocketBase context it is not something that can be exploited without an admin intervention and since the general expectations are that the PocketBase admins can do anything and they are the one who manage their server, this should be treated with the same diligence when using `scp`/`rsync`/`rclone`/etc. with untrusted file sources. - - It is not possible (_or at least I'm not aware how to do that easily_) to perform virus/malicious content scanning on the uploaded backup archive files and some caution is always required when using the Admin UI or running shell commands, hence the backup-restore warning text. - - **Or in other words, if someone sends you a file and tell you to upload it to your server (either as backup zip or manually via scp) obviously you shouldn't do that unless you really trust them.** - - PocketBase is like any other regular application that you run on your server and there is no builtin "sandbox" for what the PocketBase process can execute. This is left to the developers to restrict on application or OS level depending on their needs. If you are self-hosting PocketBase you usually don't have to do that, but if you are offering PocketBase as a service and allow strangers to run their own PocketBase instances on your server then you'll need to implement the isolation mechanisms on your own. -
- - -## v0.22.4 - -- Removed conflicting styles causing the detailed codeblock log data preview to not visualize properly ([#4505](https://github.com/pocketbase/pocketbase/pull/4505)). - -- Minor JSVM improvements: - - Added `$filesystem.fileFromUrl(url, optSecTimeout)` helper. - - Implemented the `FormData` interface and added support for sending `multipart/form-data` requests with `$http.send()` ([#4544](https://github.com/pocketbase/pocketbase/discussions/4544)). - - -## v0.22.3 - -- Fixed the z-index of the current admin dropdown on Safari ([#4492](https://github.com/pocketbase/pocketbase/issues/4492)). - -- Fixed `OnAfterApiError` debug log `nil` error reference ([#4498](https://github.com/pocketbase/pocketbase/issues/4498)). - -- Added the field name as part of the `@request.data.someRelField.*` join to handle the case when a collection has 2 or more relation fields pointing to the same place ([#4500](https://github.com/pocketbase/pocketbase/issues/4500)). - -- Updated Go deps and bumped the min Go version in the GitHub release action to Go 1.22.1 since it comes with [some security fixes](https://github.com/golang/go/issues?q=milestone%3AGo1.22.1). - - -## v0.22.2 - -- Fixed a small regression introduced with v0.22.0 that was causing some missing unknown fields to always return an error instead of applying the specific `nullifyMisingField` resolver option to the query. - - -## v0.22.1 - -- Fixed Admin UI record and collection panels not reinitializing properly on browser back/forward navigation ([#4462](https://github.com/pocketbase/pocketbase/issues/4462)). - -- Initialize `RecordAuthWithOAuth2Event.IsNewRecord` for the `OnRecordBeforeAuthWithOAuth2Request` hook ([#4437](https://github.com/pocketbase/pocketbase/discussions/4437)). - -- Added error checks to the autogenerated Go migrations ([#4448](https://github.com/pocketbase/pocketbase/issues/4448)). - - -## v0.22.0 - -- Added Planning Center OAuth2 provider ([#4393](https://github.com/pocketbase/pocketbase/pull/4393); thanks @alxjsn). - -- Admin UI improvements: - - Autosync collection changes across multiple open browser tabs. - - Fixed vertical image popup preview scrolling. - - Added options to export a subset of collections. - - Added option to import a subset of collections without deleting the others ([#3403](https://github.com/pocketbase/pocketbase/issues/3403)). - -- Added support for back/indirect relation `filter`/`sort` (single and multiple). - The syntax to reference back relation fields is `yourCollection_via_yourRelField.*`. - ⚠️ To avoid excessive joins, the nested relations resolver is now limited to max 6 level depth (the same as `expand`). - _Note that in the future there will be also more advanced and granular options to specify a subset of the fields that are filterable/sortable._ - -- Added support for multiple back/indirect relation `expand` and updated the keys to use the `_via_` reference syntax (`yourCollection_via_yourRelField`). - _To minimize the breaking changes, the old parenthesis reference syntax (`yourCollection(yourRelField)`) will still continue to work but it is soft-deprecated and there will be a console log reminding you to change it to the new one._ - -- ⚠️ Collections and fields are no longer allowed to have `_via_` in their name to avoid collisions with the back/indirect relation reference syntax. - -- Added `jsvm.Config.OnInit` optional config function to allow registering custom Go bindings to the JSVM. 
- -- Added `@request.context` rule field that can be used to apply a different set of constraints based on the API rule execution context. - For example, to disallow user creation by an OAuth2 auth, you could set for the users Create API rule `@request.context != "oauth2"`. - The currently supported `@request.context` values are: - ``` - default - realtime - protectedFile - oauth2 - ``` - -- Adjusted the `cron.Start()` to start the ticker at the `00` second of the cron interval ([#4394](https://github.com/pocketbase/pocketbase/discussions/4394)). - _Note that the cron format has only minute granularity and there is still no guarantee that the scheduled job will be always executed at the `00` second._ - -- Fixed auto backups cron not reloading properly after app settings change ([#4431](https://github.com/pocketbase/pocketbase/discussions/4431)). - -- Upgraded to `aws-sdk-go-v2` and added special handling for GCS to workaround the previous [GCS headers signature issue](https://github.com/pocketbase/pocketbase/issues/2231) that we had with v2. - _This should also fix the SVG/JSON zero response when using Cloudflare R2 ([#4287](https://github.com/pocketbase/pocketbase/issues/4287#issuecomment-1925168142), [#2068](https://github.com/pocketbase/pocketbase/discussions/2068), [#2952](https://github.com/pocketbase/pocketbase/discussions/2952))._ - _⚠️ If you are using S3 for uploaded files or backups, please verify that you have a green check in the Admin UI for your S3 configuration (I've tested the new version with GCS, MinIO, Cloudflare R2 and Wasabi)._ - -- Added `:each` modifier support for `file` and `relation` type fields (_previously it was supported only for `select` type fields_). - -- Other minor improvements (updated the `ghupdate` plugin to use the configured executable name when printing to the console, fixed the error reporting of `admin update/delete` commands, etc.). - - -## v0.21.3 - -- Ignore the JS required validations for disabled OIDC providers ([#4322](https://github.com/pocketbase/pocketbase/issues/4322)). - -- Allow `HEAD` requests to the `/api/health` endpoint ([#4310](https://github.com/pocketbase/pocketbase/issues/4310)). - -- Fixed the `editor` field value when visualized inside the View collection preview panel. - -- Manually clear all TinyMCE events on editor removal (_workaround for [tinymce#9377](https://github.com/tinymce/tinymce/issues/9377)_). - - -## v0.21.2 - -- Fixed `@request.auth.*` initialization side-effect which caused the current authenticated user email to not being returned in the user auth response ([#2173](https://github.com/pocketbase/pocketbase/issues/2173#issuecomment-1932332038)). - _The current authenticated user email should be accessible always no matter of the `emailVisibility` state._ - -- Fixed `RecordUpsert.RemoveFiles` godoc example. - -- Bumped to `NumCPU()+2` the `thumbGenSem` limit as some users reported that it was too restrictive. - - -## v0.21.1 - -- Small fix for the Admin UI related to the _Settings > Sync_ menu not being visible even when the "Hide controls" toggle is off. - - -## v0.21.0 - -- Added Bitbucket OAuth2 provider ([#3948](https://github.com/pocketbase/pocketbase/pull/3948); thanks @aabajyan). - -- Mark user as verified on confirm password reset ([#4066](https://github.com/pocketbase/pocketbase/issues/4066)). - _If the user email has changed after issuing the reset token (eg. 
updated by an admin), then the `verified` user state remains unchanged._ - -- Added support for loading a serialized json payload for `multipart/form-data` requests using the special `@jsonPayload` key. - _This is intended to be used primarily by the SDKs to resolve [js-sdk#274](https://github.com/pocketbase/js-sdk/issues/274)._ - -- Added graceful OAuth2 redirect error handling ([#4177](https://github.com/pocketbase/pocketbase/issues/4177)). - _Previously on redirect error we were returning directly a standard json error response. Now on redirect error we'll redirect to a generic OAuth2 failure screen (similar to the success one) and will attempt to auto close the OAuth2 popup._ - _The SDKs are also updated to handle the OAuth2 redirect error and it will be returned as Promise rejection of the `authWithOAuth2()` call._ - -- Exposed `$apis.gzip()` and `$apis.bodyLimit(bytes)` middlewares to the JSVM. - -- Added `TestMailer.SentMessages` field that holds all sent test app emails until cleanup. - -- Optimized the cascade delete of records with multiple `relation` fields. - -- Updated the `serve` and `admin` commands error reporting. - -- Minor Admin UI improvements (reduced the min table row height, added option to duplicate fields, added new TinyMCE codesample plugin languages, hide the collection sync settings when the `Settings.Meta.HideControls` is enabled, etc.) - - -## v0.20.7 - -- Fixed the Admin UI auto indexes update when renaming fields with a common prefix ([#4160](https://github.com/pocketbase/pocketbase/issues/4160)). - - -## v0.20.6 - -- Fixed JSVM types generation for functions with omitted arg types ([#4145](https://github.com/pocketbase/pocketbase/issues/4145)). - -- Updated Go deps. - - -## v0.20.5 - -- Minor CSS fix for the Admin UI to prevent the searchbar within a popup from expanding too much and pushing the controls out of the visible area ([#4079](https://github.com/pocketbase/pocketbase/issues/4079#issuecomment-1876994116)). - - -## v0.20.4 - -- Small fix for a regression introduced with the recent `json` field changes that was causing View collection column expressions recognized as `json` to fail to resolve ([#4072](https://github.com/pocketbase/pocketbase/issues/4072)). - - -## v0.20.3 - -- Fixed the `json` field query comparisons to work correctly with plain JSON values like `null`, `bool` `number`, etc. ([#4068](https://github.com/pocketbase/pocketbase/issues/4068)). - Since there are plans in the future to allow custom SQLite builds and also in some situations it may be useful to be able to distinguish `NULL` from `''`, - for the `json` fields (and for any other future non-standard field) we no longer apply `COALESCE` by default, aka.: - ``` - Dataset: - 1) data: json(null) - 2) data: json('') - - For the filter "data = null" only 1) will resolve to TRUE. - For the filter "data = ''" only 2) will resolve to TRUE. - ``` - -- Minor Go tests improvements - - Sorted the record cascade delete references to ensure that the delete operation will preserve the order of the fired events when running the tests. - - Marked some of the tests as safe for parallel execution to speed up a little the GitHub action build times. - - -## v0.20.2 - -- Added `sleep(milliseconds)` JSVM binding. - _It works the same way as Go `time.Sleep()`, aka. it pauses the goroutine where the JSVM code is running._ - -- Fixed multi-line text paste in the Admin UI search bar ([#4022](https://github.com/pocketbase/pocketbase/discussions/4022)). - -- Fixed the monospace font loading in the Admin UI. 
- -- Fixed various reported docs and code comment typos. - - -## v0.20.1 - -- Added `--dev` flag and its accompanying `app.IsDev()` method (_in place of the previously removed `--debug`_) to assist during development ([#3918](https://github.com/pocketbase/pocketbase/discussions/3918)). - The `--dev` flag prints in the console "everything" and more specifically: - - the data DB SQL statements - - all `app.Logger().*` logs (debug, info, warning, error, etc.), no matter of the logs persistence settings in the Admin UI - -- Minor Admin UI fixes: - - Fixed the log `error` label text wrapping. - - Added the log `referer` (_when it is from a different source_) and `details` labels in the logs listing. - - Removed the blank current time entry from the logs chart because it was causing confusion when used with custom time ranges. - - Updated the SQL syntax highlighter and keywords autocompletion in the Admin UI to recognize `CAST(x as bool)` expressions. - -- Replaced the default API tests timeout with a new `ApiScenario.Timeout` option ([#3930](https://github.com/pocketbase/pocketbase/issues/3930)). - A negative or zero value means no tests timeout. - If a single API test takes more than 3s to complete it will have a log message visible when the test fails or when `go test -v` flag is used. - -- Added timestamp at the beginning of the generated JSVM types file to avoid creating it everytime with the app startup. - - -## v0.20.0 - -- Added `expand`, `filter`, `fields`, custom query and headers parameters support for the realtime subscriptions. - _Requires JS SDK v0.20.0+ or Dart SDK v0.17.0+._ - - ```js - // JS SDK v0.20.0 - pb.collection("example").subscribe("*", (e) => { - ... - }, { - expand: "someRelField", - filter: "status = 'active'", - fields: "id,expand.someRelField.*:excerpt(100)", - }) - ``` - - ```dart - // Dart SDK v0.17.0 - pb.collection("example").subscribe("*", (e) { - ... - }, - expand: "someRelField", - filter: "status = 'active'", - fields: "id,expand.someRelField.*:excerpt(100)", - ) - ``` - -- Generalized the logs to allow any kind of application logs, not just requests. - - The new `app.Logger()` implements the standard [`log/slog` interfaces](https://pkg.go.dev/log/slog) available with Go 1.21. - ``` - // Go: https://pocketbase.io/docs/go-logging/ - app.Logger().Info("Example message", "total", 123, "details", "lorem ipsum...") - - // JS: https://pocketbase.io/docs/js-logging/ - $app.logger().info("Example message", "total", 123, "details", "lorem ipsum...") - ``` - - For better performance and to minimize blocking on hot paths, logs are currently written with - debounce and on batches: - - 3 seconds after the last debounced log write - - when the batch threshold is reached (currently 200) - - right before app termination to attempt saving everything from the existing logs queue - - Some notable log related changes: - - - ⚠️ Bumped the minimum required Go version to 1.21. - - - ⚠️ Removed `_requests` table in favor of the generalized `_logs`. - _Note that existing logs will be deleted!_ - - - ⚠️ Renamed the following `Dao` log methods: - ```go - Dao.RequestQuery(...) -> Dao.LogQuery(...) - Dao.FindRequestById(...) -> Dao.FindLogById(...) - Dao.RequestsStats(...) -> Dao.LogsStats(...) - Dao.DeleteOldRequests(...) -> Dao.DeleteOldLogs(...) - Dao.SaveRequest(...) -> Dao.SaveLog(...) - ``` - - ⚠️ Removed `app.IsDebug()` and the `--debug` flag. - This was done to avoid the confusion with the new logger and its debug severity level. 
- If you want to store debug logs you can set `-4` as min log level from the Admin UI. - - - Refactored Admin UI Logs: - - Added new logs table listing. - - Added log settings option to toggle the IP logging for the activity logger. - - Added log settings option to specify a minimum log level. - - Added controls to export individual or bulk selected logs as json. - - Other minor improvements and fixes. - -- Added new `filesystem/System.Copy(src, dest)` method to copy existing files from one location to another. - _This is usually useful when duplicating records with `file` field(s) programmatically._ - -- Added `filesystem.NewFileFromUrl(ctx, url)` helper method to construct a `*filesystem.BytesReader` file from the specified url. - -- OAuth2 related additions: - - - Added new `PKCE()` and `SetPKCE(enable)` OAuth2 methods to indicate whether the PKCE flow is supported or not. - _The PKCE value is currently configurable from the UI only for the OIDC providers._ - _This was added to accommodate OIDC providers that may throw an error if unsupported PKCE params are submitted with the auth request (eg. LinkedIn; see [#3799](https://github.com/pocketbase/pocketbase/discussions/3799#discussioncomment-7640312))._ - - - Added new `displayName` field for each `listAuthMethods()` OAuth2 provider item. - _The value of the `displayName` property is currently configurable from the UI only for the OIDC providers._ - - - Added `expiry` field to the OAuth2 user response containing the _optional_ expiration time of the OAuth2 access token ([#3617](https://github.com/pocketbase/pocketbase/discussions/3617)). - - - Allow a single OAuth2 user to be used for authentication in multiple auth collection. - _⚠️ Because now you can have more than one external provider with `collectionId-provider-providerId` pair, `Dao.FindExternalAuthByProvider(provider, providerId)` method was removed in favour of the more generic `Dao.FindFirstExternalAuthByExpr(expr)`._ - -- Added `onlyVerified` auth collection option to globally disallow authentication requests for unverified users. - -- Added support for single line comments (ex. `// your comment`) in the API rules and filter expressions. - -- Added support for specifying a collection alias in `@collection.someCollection:alias.*`. - -- Soft-deprecated and renamed `app.Cache()` with `app.Store()`. - -- Minor JSVM updates and fixes: - - - Updated `$security.parseUnverifiedJWT(token)` and `$security.parseJWT(token, key)` to return the token payload result as plain object. - - - Added `$apis.requireGuestOnly()` middleware JSVM binding ([#3896](https://github.com/pocketbase/pocketbase/issues/3896)). - -- Use `IS NOT` instead of `!=` as not-equal SQL query operator to handle the cases when comparing with nullable columns or expressions (eg. `json_extract` over `json` field). - _Based on my local dataset I wasn't able to find a significant difference in the performance between the 2 operators, but if you stumble on a query that you think may be affected negatively by this, please report it and I'll test it further._ - -- Added `MaxSize` `json` field option to prevent storing large json data in the db ([#3790](https://github.com/pocketbase/pocketbase/issues/3790)). - _Existing `json` fields are updated with a system migration to have a ~2MB size limit (it can be adjusted from the Admin UI)._ - -- Fixed negative string number normalization support for the `json` field type. - -- Trigger the `app.OnTerminate()` hook on `app.Restart()` call. 
- _A new bool `IsRestart` field was also added to the `core.TerminateEvent` event._ - -- Fixed graceful shutdown handling and speed up a little the app termination time. - -- Limit the concurrent thumbs generation to avoid high CPU and memory usage in spiky scenarios ([#3794](https://github.com/pocketbase/pocketbase/pull/3794); thanks @t-muehlberger). - _Currently the max concurrent thumbs generation processes are limited to "total of logical process CPUs + 1"._ - _This is arbitrary chosen and may change in the future depending on the users feedback and usage patterns._ - _If you are experiencing OOM errors during large image thumb generations, especially in container environment, you can try defining the `GOMEMLIMIT=500MiB` env variable before starting the executable._ - -- Slightly speed up (~10%) the thumbs generation by changing from cubic (`CatmullRom`) to bilinear (`Linear`) resampling filter (_the quality difference is very little_). - -- Added a default red colored Stderr output in case of a console command error. - _You can now also silence individually custom commands errors using the `cobra.Command.SilenceErrors` field._ - -- Fixed links formatting in the autogenerated html->text mail body. - -- Removed incorrectly imported empty `local('')` font-face declarations. - - -## v0.19.4 - -- Fixed TinyMCE source code viewer textarea styles ([#3715](https://github.com/pocketbase/pocketbase/issues/3715)). - -- Fixed `text` field min/max validators to properly count multi-byte characters ([#3735](https://github.com/pocketbase/pocketbase/issues/3735)). - -- Allowed hyphens in `username` ([#3697](https://github.com/pocketbase/pocketbase/issues/3697)). - _More control over the system fields settings will be available in the future._ - -- Updated the JSVM generated types to use directly the value type instead of `* | undefined` union in functions/methods return declarations. - - -## v0.19.3 - -- Added the release notes to the console output of `./pocketbase update` ([#3685](https://github.com/pocketbase/pocketbase/discussions/3685)). - -- Added missing documentation for the JSVM `$mails.*` bindings. - -- Relaxed the OAuth2 redirect url validation to allow any string value ([#3689](https://github.com/pocketbase/pocketbase/pull/3689); thanks @sergeypdev). - _Note that the redirect url format is still bound to the accepted values by the specific OAuth2 provider._ - - -## v0.19.2 - -- Updated the JSVM generated types ([#3627](https://github.com/pocketbase/pocketbase/issues/3627), [#3662](https://github.com/pocketbase/pocketbase/issues/3662)). - - -## v0.19.1 - -- Fixed `tokenizer.Scan()/ScanAll()` to ignore the separators from the default trim cutset. - An option to return also the empty found tokens was also added via `Tokenizer.KeepEmptyTokens(true)`. - _This should fix the parsing of whitespace characters around view query column names when no quotes are used ([#3616](https://github.com/pocketbase/pocketbase/discussions/3616#discussioncomment-7398564))._ - -- Fixed the `:excerpt(max, withEllipsis?)` `fields` query param modifier to properly add space to the generated text fragment after block tags. - - -## v0.19.0 - -- Added Patreon OAuth2 provider ([#3323](https://github.com/pocketbase/pocketbase/pull/3323); thanks @ghostdevv). - -- Added mailcow OAuth2 provider ([#3364](https://github.com/pocketbase/pocketbase/pull/3364); thanks @thisni1s). 
- -- Added support for `:excerpt(max, withEllipsis?)` `fields` modifier that will return a short plain text version of any string value (html tags are stripped). - This could be used to minimize the downloaded json data when listing records with large `editor` html values. - ```js - await pb.collection("example").getList(1, 20, { - "fields": "*,description:excerpt(100)" - }) - ``` - -- Several Admin UI improvements: - - Count the total records separately to speed up the query execution for large datasets ([#3344](https://github.com/pocketbase/pocketbase/issues/3344)). - - Enclosed the listing scrolling area within the table so that the horizontal scrollbar and table header are always reachable ([#2505](https://github.com/pocketbase/pocketbase/issues/2505)). - - Allowed opening the record preview/update form via direct URL ([#2682](https://github.com/pocketbase/pocketbase/discussions/2682)). - - Reintroduced the local `date` field tooltip on hover. - - Speed up the listing loading times for records with large `editor` field values by initially fetching only a partial of the records data (the complete record data is loaded on record preview/update). - - Added "Media library" (collection images picker) support for the TinyMCE `editor` field. - - Added support to "pin" collections in the sidebar. - - Added support to manually resize the collections sidebar. - - More clear "Nonempty" field label style. - - Removed the legacy `.woff` and `.ttf` fonts and keep only `.woff2`. - -- Removed the explicit `Content-Type` charset from the realtime response due to compatibility issues with IIS ([#3461](https://github.com/pocketbase/pocketbase/issues/3461)). - _The `Connection:keep-alive` realtime response header was also removed as it is not really used with HTTP2 anyway._ - -- Added new JSVM bindings: - - `new Cookie({ ... })` constructor for creating `*http.Cookie` equivalent value. - - `new SubscriptionMessage({ ... })` constructor for creating a custom realtime subscription payload. - - Soft-deprecated `$os.exec()` in favour of `$os.cmd()` to make it more clear that the call only prepares the command and doesn't execute it. - -- ⚠️ Bumped the min required Go version to 1.19. - - -## v0.18.10 - -- Added global `raw` template function to allow outputting raw/verbatim HTML content in the JSVM templates ([#3476](https://github.com/pocketbase/pocketbase/discussions/3476)). - ``` - {{.description|raw}} - ``` - -- Trimmed view query semicolon and allowed single quotes for column aliases ([#3450](https://github.com/pocketbase/pocketbase/issues/3450#issuecomment-1748044641)). - _Single quotes are usually [not a valid identifier quote characters](https://www.sqlite.org/lang_keywords.html), but for resilience and compatibility reasons SQLite allows them in some contexts where only an identifier is expected._ - -- Bumped the GitHub action to use [min Go 1.21.2](https://github.com/golang/go/issues?q=milestone%3AGo1.21.2) (_the fixed issues are not critical as they are mostly related to the compiler/build tools_). - - -## v0.18.9 - -- Fixed empty thumbs directories not getting deleted on Windows after deleting a record img file ([#3382](https://github.com/pocketbase/pocketbase/issues/3382)). - -- Updated the generated JSVM typings to silent the TS warnings when trying to access a field/method in a Go->TS interface. - - -## v0.18.8 - -- Minor fix for the View collections API Preview and Admin UI listings incorrectly showing the `created` and `updated` fields as `N/A` when the view query doesn't have them. 
- - -## v0.18.7 - -- Fixed JS error in the Admin UI when listing records with invalid `relation` field value ([#3372](https://github.com/pocketbase/pocketbase/issues/3372)). - _This could happen usually only during custom SQL import scripts or when directly modifying the record field value without data validations._ - -- Updated Go deps and the generated JSVM types. - - -## v0.18.6 - -- Return the response headers and cookies in the `$http.send()` result ([#3310](https://github.com/pocketbase/pocketbase/discussions/3310)). - -- Added more descriptive internal error message for missing user/admin email on password reset requests. - -- Updated Go deps. - - -## v0.18.5 - -- Fixed minor Admin UI JS error in the auth collection options panel introduced with the change from v0.18.4. - - -## v0.18.4 - -- Added escape character (`\`) support in the Admin UI to allow using `select` field values with comma ([#2197](https://github.com/pocketbase/pocketbase/discussions/2197)). - - -## v0.18.3 - -- Exposed a global JSVM `readerToString(reader)` helper function to allow reading Go `io.Reader` values ([#3273](https://github.com/pocketbase/pocketbase/discussions/3273)). - -- Bumped the GitHub action to use [min Go 1.21.1](https://github.com/golang/go/issues?q=milestone%3AGo1.21.1+label%3ACherryPickApproved) for the prebuilt executable since it contains some minor `html/template` and `net/http` security fixes. - - -## v0.18.2 - -- Prevent breaking the record form in the Admin UI in case the browser's localStorage quota has been exceeded when uploading or storing large `editor` values ([#3265](https://github.com/pocketbase/pocketbase/issues/3265)). - -- Updated docs and missing JSVM typings. - -- Exposed additional crypto primitives under the `$security.*` JSVM namespace ([#3273](https://github.com/pocketbase/pocketbase/discussions/3273)): - ```js - // HMAC with SHA256 - $security.hs256("hello", "secret") - - // HMAC with SHA512 - $security.hs512("hello", "secret") - - // compare 2 strings with a constant time - $security.equal(hash1, hash2) - ``` - - -## v0.18.1 - -- Excluded the local temp dir from the backups ([#3261](https://github.com/pocketbase/pocketbase/issues/3261)). - - -## v0.18.0 - -- Simplified the `serve` command to accept domain name(s) as argument to reduce any additional manual hosts setup that sometimes previously was needed when deploying on production ([#3190](https://github.com/pocketbase/pocketbase/discussions/3190)). - ```sh - ./pocketbase serve yourdomain.com - ``` - -- Added `fields` wildcard (`*`) support. - -- Added option to upload a backup file from the Admin UI ([#2599](https://github.com/pocketbase/pocketbase/issues/2599)). - -- Registered a custom Deflate compressor to speedup (_nearly 2-3x_) the backups generation for the sake of a small zip size increase. - _Based on several local tests, `pb_data` of ~500MB (from which ~350MB+ are several hundred small files) results in a ~280MB zip generated for ~11s (previously it resulted in ~250MB zip but for ~35s)._ - -- Added the application name as part of the autogenerated backup name for easier identification ([#3066](https://github.com/pocketbase/pocketbase/issues/3066)). - -- Added new `SmtpConfig.LocalName` option to specify a custom domain name (or IP address) for the initial EHLO/HELO exchange ([#3097](https://github.com/pocketbase/pocketbase/discussions/3097)). 
- _This is usually required for verification purposes only by some SMTP providers, such as on-premise [Gmail SMTP-relay](https://support.google.com/a/answer/2956491)._ - -- Added `NoDecimal` `number` field option. - -- `editor` field improvements: - - Added new "Strip urls domain" option to allow controlling the default TinyMCE urls behavior (_default to `false` for new content_). - - Normalized pasted text while still preserving links, lists, tables, etc. formatting ([#3257](https://github.com/pocketbase/pocketbase/issues/3257)). - -- Added option to auto generate admin and auth record passwords from the Admin UI. - -- Added JSON validation and syntax highlight for the `json` field in the Admin UI ([#3191](https://github.com/pocketbase/pocketbase/issues/3191)). - -- Added datetime filter macros: - ``` - // all macros are UTC based - @second - @now second number (0-59) - @minute - @now minute number (0-59) - @hour - @now hour number (0-23) - @weekday - @now weekday number (0-6) - @day - @now day number - @month - @now month number - @year - @now year number - @todayStart - beginning of the current day as datetime string - @todayEnd - end of the current day as datetime string - @monthStart - beginning of the current month as datetime string - @monthEnd - end of the current month as datetime string - @yearStart - beginning of the current year as datetime string - @yearEnd - end of the current year as datetime string - ``` - -- Added cron expression macros ([#3132](https://github.com/pocketbase/pocketbase/issues/3132)): - ``` - @yearly - "0 0 1 1 *" - @annually - "0 0 1 1 *" - @monthly - "0 0 1 * *" - @weekly - "0 0 * * 0" - @daily - "0 0 * * *" - @midnight - "0 0 * * *" - @hourly - "0 * * * *" - ``` - -- ⚠️ Added offset argument `Dao.FindRecordsByFilter(collection, filter, sort, limit, offset, [params...])`. - _If you don't need an offset, you can set it to `0`._ - -- To minimize the footguns with `Dao.FindFirstRecordByFilter()` and `Dao.FindRecordsByFilter()`, the functions now supports an optional placeholder params argument that is safe to be populated with untrusted user input. - The placeholders are in the same format as when binding regular SQL parameters. - ```go - // unsanitized and untrusted filter variables - status := "..." - author := "..." - - app.Dao().FindFirstRecordByFilter("articles", "status={:status} && author={:author}", dbx.Params{ - "status": status, - "author": author, - }) - - app.Dao().FindRecordsByFilter("articles", "status={:status} && author={:author}", "-created", 10, 0, dbx.Params{ - "status": status, - "author": author, - }) - ``` - -- Added JSVM `$mails.*` binds for the corresponding Go [mails package](https://pkg.go.dev/github.com/pocketbase/pocketbase/mails) functions. - -- Added JSVM helper crypto primitives under the `$security.*` namespace: - ```js - $security.md5(text) - $security.sha256(text) - $security.sha512(text) - ``` - -- ⚠️ Deprecated `RelationOptions.DisplayFields` in favor of the new `SchemaField.Presentable` option to avoid the duplication when a single collection is referenced more than once and/or by multiple other collections. - -- ⚠️ Fill the `LastVerificationSentAt` and `LastResetSentAt` fields only after a successfull email send ([#3121](https://github.com/pocketbase/pocketbase/issues/3121)). - -- ⚠️ Skip API `fields` json transformations for non 20x responses ([#3176](https://github.com/pocketbase/pocketbase/issues/3176)). 
- -- ⚠️ Changes to `tests.ApiScenario` struct: - - - The `ApiScenario.AfterTestFunc` now receive as 3rd argument `*http.Response` pointer instead of `*echo.Echo` as the latter is not really useful in this context. - ```go - // old - AfterTestFunc: func(t *testing.T, app *tests.TestApp, e *echo.Echo) - - // new - AfterTestFunc: func(t *testing.T, app *tests.TestApp, res *http.Response) - ``` - - - The `ApiScenario.TestAppFactory` now accept the test instance as argument and no longer expect an error as return result ([#3025](https://github.com/pocketbase/pocketbase/discussions/3025#discussioncomment-6592272)). - ```go - // old - TestAppFactory: func() (*tests.TestApp, error) - - // new - TestAppFactory: func(t *testing.T) *tests.TestApp - ``` - _Returning a `nil` app instance from the factory results in test failure. You can enforce a custom test failure by calling `t.Fatal(err)` inside the factory._ - -- Bumped the min required TLS version to 1.2 in order to improve the cert reputation score. - -- Reduced the default JSVM prewarmed pool size to 25 to reduce the initial memory consumptions (_you can manually adjust the pool size with `--hooksPool=50` if you need to, but the default should suffice for most cases_). - -- Update `gocloud.dev` dependency to v0.34 and explicitly set the new `NoTempDir` fileblob option to prevent the cross-device link error introduced with v0.33. - -- Other minor Admin UI and docs improvements. - - -## v0.17.7 - -- Fixed the autogenerated `down` migrations to properly revert the old collection rules in case a change was made in `up` ([#3192](https://github.com/pocketbase/pocketbase/pull/3192); thanks @impact-merlinmarek). - _Existing `down` migrations can't be fixed but that should be ok as usually the `down` migrations are rarely used against prod environments since they can cause data loss and, while not ideal, the previous old behavior of always setting the rules to `null/nil` is safer than not updating the rules at all._ - -- Updated some Go deps. - - -## v0.17.6 - -- Fixed JSVM `require()` file path error when using Windows-style path delimiters ([#3163](https://github.com/pocketbase/pocketbase/issues/3163#issuecomment-1685034438)). - - -## v0.17.5 - -- Added quotes around the wrapped view query columns introduced with v0.17.4. - - -## v0.17.4 - -- Fixed Views record retrieval when numeric id is used ([#3110](https://github.com/pocketbase/pocketbase/issues/3110)). - _With this fix we also now properly recognize `CAST(... as TEXT)` and `CAST(... as BOOLEAN)` as `text` and `bool` fields._ - -- Fixed `relation` "Cascade delete" tooltip message ([#3098](https://github.com/pocketbase/pocketbase/issues/3098)). - -- Fixed jsvm error message prefix on failed migrations ([#3103](https://github.com/pocketbase/pocketbase/pull/3103); thanks @nzhenev). - -- Disabled the initial Admin UI admins counter cache when there are no initial admins to allow detecting externally created accounts (eg. with the `admin` command) ([#3106](https://github.com/pocketbase/pocketbase/issues/3106)). - -- Downgraded `google/go-cloud` dependency to v0.32.0 until v0.34.0 is released to prevent the `os.TempDir` `cross-device link` errors as too many users complained about it. - - -## v0.17.3 - -- Fixed Docker `cross-device link` error when creating `pb_data` backups on a local mounted volume ([#3089](https://github.com/pocketbase/pocketbase/issues/3089)). - -- Fixed the error messages for relation to views ([#3090](https://github.com/pocketbase/pocketbase/issues/3090)). 
- -- Always reserve space for the scrollbar to reduce the layout shifts in the Admin UI records listing due to the deprecated `overflow: overlay`. - -- Enabled lazy loading for the Admin UI thumb images. - - -## v0.17.2 - -- Soft-deprecated `$http.send({ data: object, ... })` in favour of `$http.send({ body: rawString, ... })` - to allow sending non-JSON body with the request ([#3058](https://github.com/pocketbase/pocketbase/discussions/3058)). - The existing `data` prop will still work, but it is recommended to use `body` instead (_to send JSON you can use `JSON.stringify(...)` as body value_). - -- Added `core.RealtimeConnectEvent.IdleTimeout` field to allow specifying a different realtime idle timeout duration per client basis ([#3054](https://github.com/pocketbase/pocketbase/discussions/3054)). - -- Fixed `apis.RequestData` deprecation log note ([#3068](https://github.com/pocketbase/pocketbase/pull/3068); thanks @gungjodi). - - -## v0.17.1 - -- Use relative path when redirecting to the OAuth2 providers page in the Admin UI to support subpath deployments ([#3026](https://github.com/pocketbase/pocketbase/pull/3026); thanks @sonyarianto). - -- Manually trigger the `OnBeforeServe` hook for `tests.ApiScenario` ([#3025](https://github.com/pocketbase/pocketbase/discussions/3025)). - -- Trigger the JSVM `cronAdd()` handler only on app `serve` to prevent unexpected (and eventually duplicated) cron handler calls when custom console commands are used ([#3024](https://github.com/pocketbase/pocketbase/discussions/3024#discussioncomment-6592703)). - -- The `console.log()` messages are now written to the `stdout` instead of `stderr`. - - -## v0.17.0 - -- New more detailed guides for using PocketBase as framework (both Go and JS). - _If you find any typos or issues with the docs please report them in https://github.com/pocketbase/site._ - -- Added new experimental JavaScript app hooks binding via [goja](https://github.com/dop251/goja). - They are available by default with the prebuilt executable if you create `*.pb.js` file(s) in the `pb_hooks` directory. - Lower your expectations because the integration comes with some limitations. For more details please check the [Extend with JavaScript](https://pocketbase.io/docs/js-overview/) guide. - Optionally, you can also enable the JS app hooks as part of a custom Go build for dynamic scripting but you need to register the `jsvm` plugin manually: - ```go - jsvm.MustRegister(app core.App, config jsvm.Config{}) - ``` - -- Added Instagram OAuth2 provider ([#2534](https://github.com/pocketbase/pocketbase/pull/2534); thanks @pnmcosta). - -- Added VK OAuth2 provider ([#2533](https://github.com/pocketbase/pocketbase/pull/2533); thanks @imperatrona). - -- Added Yandex OAuth2 provider ([#2762](https://github.com/pocketbase/pocketbase/pull/2762); thanks @imperatrona). - -- Added new fields to `core.ServeEvent`: - ```go - type ServeEvent struct { - App App - Router *echo.Echo - // new fields - Server *http.Server // allows adjusting the HTTP server config (global timeouts, TLS options, etc.) - CertManager *autocert.Manager // allows adjusting the autocert options (cache dir, host policy, etc.) - } - ``` - -- Added `record.ExpandedOne(rel)` and `record.ExpandedAll(rel)` helpers to retrieve casted single or multiple expand relations from the already loaded "expand" Record data. 
- -- Added rule and filter record `Dao` helpers: - ```go - app.Dao().FindRecordsByFilter("posts", "title ~ 'lorem ipsum' && visible = true", "-created", 10) - app.Dao().FindFirstRecordByFilter("posts", "slug='test' && active=true") - app.Dao().CanAccessRecord(record, requestInfo, rule) - ``` - -- Added `Dao.WithoutHooks()` helper to create a new `Dao` from the current one but without the create/update/delete hooks. - -- Use a default fetch function that will return all relations in case the `fetchFunc` argument of `Dao.ExpandRecord(record, expands, fetchFunc)` and `Dao.ExpandRecords(records, expands, fetchFunc)` is `nil`. - -- For convenience it is now possible to call `Dao.RecordQuery(collectionModelOrIdentifier)` with just the collection id or name. - In case an invalid collection id/name string is passed the query will be resolved with cancelled context error. - -- Refactored `apis.ApiError` validation errors serialization to allow `map[string]error` and `map[string]any` when generating the public safe formatted `ApiError.Data`. - -- Added support for wrapped API errors (_in case Go 1.20+ is used with multiple wrapped errors, the first `apis.ApiError` takes precedence_). - -- Added `?download=1` file query parameter to the file serving endpoint to force the browser to always download the file and not show its preview. - -- Added new utility `github.com/pocketbase/pocketbase/tools/template` subpackage to assist with rendering HTML templates using the standard Go `html/template` and `text/template` syntax. - -- Added `types.JsonMap.Get(k)` and `types.JsonMap.Set(k, v)` helpers for the cases where the type aliased direct map access is not allowed (eg. in [goja](https://pkg.go.dev/github.com/dop251/goja#hdr-Maps_with_methods)). - -- Soft-deprecated `security.NewToken()` in favor of `security.NewJWT()`. - -- `Hook.Add()` and `Hook.PreAdd` now returns a unique string identifier that could be used to remove the registered hook handler via `Hook.Remove(handlerId)`. - -- Changed the after* hooks to be called right before writing the user response, allowing users to return response errors from the after hooks. - There is also no longer need for returning explicitly `hook.StopPropagtion` when writing custom response body in a hook because we will skip the finalizer response body write if a response was already "committed". 
- -- ⚠️ Renamed `*Options{}` to `Config{}` for consistency and replaced the unnecessary pointers with their value equivalent to keep the applied configuration defaults isolated within their function calls: - ```go - old: pocketbase.NewWithConfig(config *pocketbase.Config) *pocketbase.PocketBase - new: pocketbase.NewWithConfig(config pocketbase.Config) *pocketbase.PocketBase - - old: core.NewBaseApp(config *core.BaseAppConfig) *core.BaseApp - new: core.NewBaseApp(config core.BaseAppConfig) *core.BaseApp - - old: apis.Serve(app core.App, options *apis.ServeOptions) error - new: apis.Serve(app core.App, config apis.ServeConfig) (*http.Server, error) - - old: jsvm.MustRegisterMigrations(app core.App, options *jsvm.MigrationsOptions) - new: jsvm.MustRegister(app core.App, config jsvm.Config) - - old: ghupdate.MustRegister(app core.App, rootCmd *cobra.Command, options *ghupdate.Options) - new: ghupdate.MustRegister(app core.App, rootCmd *cobra.Command, config ghupdate.Config) - - old: migratecmd.MustRegister(app core.App, rootCmd *cobra.Command, options *migratecmd.Options) - new: migratecmd.MustRegister(app core.App, rootCmd *cobra.Command, config migratecmd.Config) - ``` - -- ⚠️ Changed the type of `subscriptions.Message.Data` from `string` to `[]byte` because `Data` usually is a json bytes slice anyway. - -- ⚠️ Renamed `models.RequestData` to `models.RequestInfo` and soft-deprecated `apis.RequestData(c)` in favor of `apis.RequestInfo(c)` to avoid the stuttering with the `Data` field. - _The old `apis.RequestData()` method still works to minimize the breaking changes but it is recommended to replace it with `apis.RequestInfo(c)`._ - -- ⚠️ Changes to the List/Search APIs - - Added new query parameter `?skipTotal=1` to skip the `COUNT` query performed with the list/search actions ([#2965](https://github.com/pocketbase/pocketbase/discussions/2965)). - If `?skipTotal=1` is set, the response fields `totalItems` and `totalPages` will have `-1` value (this is to avoid having different JSON responses and to differentiate from the zero default). - With the latest JS SDK 0.16+ and Dart SDK v0.11+ versions `skipTotal=1` is set by default for the `getFirstListItem()` and `getFullList()` requests. - - - The count and regular select statements also now executes concurrently, meaning that we no longer perform normalization over the `page` parameter and in case the user - request a page that doesn't exist (eg. `?page=99999999`) we'll return empty `items` array. - - - Reverted the default `COUNT` column to `id` as there are some common situations where it can negatively impact the query performance. - Additionally, from this version we also set `PRAGMA temp_store = MEMORY` so that also helps with the temp B-TREE creation when `id` is used. - _There are still scenarios where `COUNT` queries with `rowid` executes faster, but the majority of the time when nested relations lookups are used it seems to have the opposite effect (at least based on the benchmarks dataset)._ - -- ⚠️ Disallowed relations to views **from non-view** collections ([#3000](https://github.com/pocketbase/pocketbase/issues/3000)). - The change was necessary because I wasn't able to find an efficient way to track view changes and the previous behavior could have too many unexpected side-effects (eg. view with computed ids). - There is a system migration that will convert the existing view `relation` fields to `json` (multiple) and `text` (single) fields. 
- This could be a breaking change if you have `relation` to view and use `expand` or some of the `relation` view fields as part of a collection rule. - -- ⚠️ Added an extra `action` argument to the `Dao` hooks to allow skipping the default persist behavior. - In preparation for the logs generalization, the `Dao.After*Func` methods now also allow returning an error. - -- Allowed `0` as `RelationOptions.MinSelect` value to avoid the ambiguity between 0 and non-filled input value ([#2817](https://github.com/pocketbase/pocketbase/discussions/2817)). - -- Fixed zero-default value not being used if the field is not explicitly set when manually creating records ([#2992](https://github.com/pocketbase/pocketbase/issues/2992)). - Additionally, `record.Get(field)` will now always return normalized value (the same as in the json serialization) for consistency and to avoid ambiguities with what is stored in the related DB table. - The schema fields columns `DEFAULT` definition was also updated for new collections to ensure that `NULL` values can't be accidentally inserted. - -- Fixed `migrate down` not returning the correct `lastAppliedMigrations()` when the stored migration applied time is in seconds. - -- Fixed realtime delete event to be called after the record was deleted from the DB (_including transactions and cascade delete operations_). - -- Other minor fixes and improvements (typos and grammar fixes, updated dependencies, removed unnecessary 404 error check in the Admin UI, etc.). - - -## v0.16.10 - -- Added multiple valued fields (`relation`, `select`, `file`) normalizations to ensure that the zero-default value of a newly created multiple field is applied for already existing data ([#2930](https://github.com/pocketbase/pocketbase/issues/2930)). - - -## v0.16.9 - -- Register the `eagerRequestInfoCache` middleware only for the internal `api` group routes to avoid conflicts with custom route handlers ([#2914](https://github.com/pocketbase/pocketbase/issues/2914)). - - -## v0.16.8 - -- Fixed unique validator detailed error message not being returned when camelCase field name is used ([#2868](https://github.com/pocketbase/pocketbase/issues/2868)). - -- Updated the index parser to allow no space between the table name and the columns list ([#2864](https://github.com/pocketbase/pocketbase/discussions/2864#discussioncomment-6373736)). - -- Updated go deps. - - -## v0.16.7 - -- Minor optimization for the list/search queries to use `rowid` with the `COUNT` statement when available. - _This eliminates the temp B-TREE step when executing the query and for large datasets (eg. 150k) it could have 10x improvement (from ~580ms to ~60ms)._ - - -## v0.16.6 - -- Fixed collection index column sort normalization in the Admin UI ([#2681](https://github.com/pocketbase/pocketbase/pull/2681); thanks @SimonLoir). - -- Removed unnecessary admins count in `apis.RequireAdminAuthOnlyIfAny()` middleware ([#2726](https://github.com/pocketbase/pocketbase/pull/2726); thanks @svekko). - -- Fixed `multipart/form-data` request bind not populating map array values ([#2763](https://github.com/pocketbase/pocketbase/discussions/2763#discussioncomment-6278902)). - -- Upgraded npm and Go dependencies. - - -## v0.16.5 - -- Fixed the Admin UI serialization of implicit relation display fields ([#2675](https://github.com/pocketbase/pocketbase/issues/2675)). - -- Reset the Admin UI sort in case the active sort collection field is renamed or deleted. 
- - -## v0.16.4 - -- Fixed the selfupdate command not working on Windows due to missing `.exe` in the extracted binary path ([#2589](https://github.com/pocketbase/pocketbase/discussions/2589)). - _Note that the command on Windows will work from v0.16.4+ onwards, meaning that you still will have to update manually one more time to v0.16.4._ - -- Added `int64`, `int32`, `uint`, `uint64` and `uint32` support when scanning `types.DateTime` ([#2602](https://github.com/pocketbase/pocketbase/discussions/2602)) - -- Updated dependencies. - - -## v0.16.3 - -- Fixed schema fields sort not working on Safari/Gnome Web ([#2567](https://github.com/pocketbase/pocketbase/issues/2567)). - -- Fixed default `PRAGMA`s not being applied for new connections ([#2570](https://github.com/pocketbase/pocketbase/discussions/2570)). - - -## v0.16.2 - -- Fixed backups archive not excluding the local `backups` directory on Windows ([#2548](https://github.com/pocketbase/pocketbase/discussions/2548#discussioncomment-5979712)). - -- Changed file field to not use `dataTransfer.effectAllowed` when dropping files since it is not reliable and consistent across different OS and browsers ([#2541](https://github.com/pocketbase/pocketbase/issues/2541)). - -- Auto register the initial generated snapshot migration to prevent incorrectly reapplying the snapshot on Docker restart ([#2551](https://github.com/pocketbase/pocketbase/discussions/2551)). - -- Fixed missing view id field error message typo. - - -## v0.16.1 - -- Fixed backup restore not working in a container environment when `pb_data` is mounted as volume ([#2519](https://github.com/pocketbase/pocketbase/issues/2519)). - -- Fixed Dart SDK realtime API preview example ([#2523](https://github.com/pocketbase/pocketbase/pull/2523); thanks @xFrann). - -- Fixed typo in the backups create panel ([#2526](https://github.com/pocketbase/pocketbase/pull/2526); thanks @dschissler). - -- Removed unnecessary slice length check in `list.ExistInSlice` ([#2527](https://github.com/pocketbase/pocketbase/pull/2527); thanks @KunalSin9h). - -- Avoid mutating the cached request data on OAuth2 user create ([#2535](https://github.com/pocketbase/pocketbase/discussions/2535)). - -- Fixed Export Collections "Download as JSON" ([#2540](https://github.com/pocketbase/pocketbase/issues/2540)). - -- Fixed file field drag and drop not working in Firefox and Safari ([#2541](https://github.com/pocketbase/pocketbase/issues/2541)). - - -## v0.16.0 - -- Added automated backups (_+ cron rotation_) APIs and UI for the `pb_data` directory. - The backups can be also initialized programmatically using `app.CreateBackup("backup.zip")`. - There is also experimental restore method - `app.RestoreBackup("backup.zip")` (_currently works only on UNIX systems as it relies on execve_). - The backups can be stored locally or in external S3 storage (_it has its own configuration, separate from the file uploads storage filesystem_). - -- Added option to limit the returned API fields using the `?fields` query parameter. - The "fields picker" is applied for `SearchResult.Items` and every other JSON response. 
For example: - ```js - // original: {"id": "RECORD_ID", "name": "abc", "description": "...something very big...", "items": ["id1", "id2"], "expand": {"items": [{"id": "id1", "name": "test1"}, {"id": "id2", "name": "test2"}]}} - // output: {"name": "abc", "expand": {"items": [{"name": "test1"}, {"name": "test2"}]}} - const result = await pb.collection("example").getOne("RECORD_ID", { - expand: "items", - fields: "name,expand.items.name", - }) - ``` - -- Added new `./pocketbase update` command to selfupdate the prebuilt executable (with option to generate a backup of your `pb_data`). - -- Added new `./pocketbase admin` console command: - ```sh - // creates new admin account - ./pocketbase admin create test@example.com 123456890 - - // changes the password of an existing admin account - ./pocketbase admin update test@example.com 0987654321 - - // deletes single admin account (if exists) - ./pocketbase admin delete test@example.com - ``` - -- Added `apis.Serve(app, options)` helper to allow starting the API server programmatically. - -- Updated the schema fields Admin UI for "tidier" fields visualization. - -- Updated the logs "real" user IP to check for `Fly-Client-IP` header and changed the `X-Forward-For` header to use the first non-empty leftmost-ish IP as it the closest to the "real IP". - -- Added new `tools/archive` helper subpackage for managing archives (_currently works only with zip_). - -- Added new `tools/cron` helper subpackage for scheduling task using cron-like syntax (_this eventually may get exported in the future in a separate repo_). - -- Added new `Filesystem.List(prefix)` helper to retrieve a flat list with all files under the provided prefix. - -- Added new `App.NewBackupsFilesystem()` helper to create a dedicated filesystem abstraction for managing app data backups. - -- Added new `App.OnTerminate()` hook (_executed right before app termination, eg. on `SIGTERM` signal_). - -- Added `accept` file field attribute with the field MIME types ([#2466](https://github.com/pocketbase/pocketbase/pull/2466); thanks @Nikhil1920). - -- Added support for multiple files sort in the Admin UI ([#2445](https://github.com/pocketbase/pocketbase/issues/2445)). - -- Added support for multiple relations sort in the Admin UI. - -- Added `meta.isNew` to the OAuth2 auth JSON response to indicate a newly OAuth2 created PocketBase user. 
diff --git a/core/pb/README.md b/core/pb/README.md deleted file mode 100755 index f4efbef..0000000 --- a/core/pb/README.md +++ /dev/null @@ -1,9 +0,0 @@ -download https://pocketbase.io/docs/ - -```bash -cd pb -xattr -d com.apple.quarantine pocketbase # for Macos -./pocketbase migrate up # for first run -./pocketbase --dev admin create test@example.com 123467890 # If you don't have an initial account, please use this command to create it -./pocketbase serve -``` \ No newline at end of file diff --git a/core/pb/pb_hooks/main.pb.js b/core/pb/pb_hooks/main.pb.js deleted file mode 100644 index 7f585e8..0000000 --- a/core/pb/pb_hooks/main.pb.js +++ /dev/null @@ -1,74 +0,0 @@ -routerAdd( - "POST", - "/save", - (c) => { - const data = $apis.requestInfo(c).data - // console.log(data) - - let dir = $os.getenv("PROJECT_DIR") - if (dir) { - dir = dir + "/" - } - // console.log(dir) - - const collection = $app.dao().findCollectionByNameOrId("documents") - const record = new Record(collection) - const form = new RecordUpsertForm($app, record) - - // or form.loadRequest(request, "") - form.loadData({ - workflow: data.workflow, - insight: data.insight, - task: data.task, - }) - - // console.log(dir + data.file) - const f1 = $filesystem.fileFromPath(dir + data.file) - form.addFiles("files", f1) - - form.submit() - - return c.json(200, record) - }, - $apis.requireRecordAuth() -) - -routerAdd( - "GET", - "/insight_dates", - (c) => { - let result = arrayOf( - new DynamicModel({ - created: "", - }) - ) - - $app.dao().db().newQuery("SELECT DISTINCT DATE(created) as created FROM insights").all(result) - - return c.json( - 200, - result.map((r) => r.created) - ) - }, - $apis.requireAdminAuth() -) - -routerAdd( - "GET", - "/article_dates", - (c) => { - let result = arrayOf( - new DynamicModel({ - created: "", - }) - ) - - $app.dao().db().newQuery("SELECT DISTINCT DATE(created) as created FROM articles").all(result) - - return c.json( - 200, - result.map((r) => r.created) - ) - }, - $apis.requireAdminAuth() -) diff --git a/core/pb/pb_migrations/1712449900_created_article_translation.js b/core/pb/pb_migrations/1712449900_created_article_translation.js deleted file mode 100644 index e968bfe..0000000 --- a/core/pb/pb_migrations/1712449900_created_article_translation.js +++ /dev/null @@ -1,55 +0,0 @@ -/// -migrate((db) => { - const collection = new Collection({ - "id": "bc3g5s66bcq1qjp", - "created": "2024-04-07 00:31:40.644Z", - "updated": "2024-04-07 00:31:40.644Z", - "name": "article_translation", - "type": "base", - "system": false, - "schema": [ - { - "system": false, - "id": "t2jqr7cs", - "name": "title", - "type": "text", - "required": false, - "presentable": false, - "unique": false, - "options": { - "min": null, - "max": null, - "pattern": "" - } - }, - { - "system": false, - "id": "dr9kt3dn", - "name": "abstract", - "type": "text", - "required": false, - "presentable": false, - "unique": false, - "options": { - "min": null, - "max": null, - "pattern": "" - } - } - ], - "indexes": [], - "listRule": null, - "viewRule": null, - "createRule": null, - "updateRule": null, - "deleteRule": null, - "options": {} - }); - - return Dao(db).saveCollection(collection); -}, (db) => { - const dao = new Dao(db); - const collection = dao.findCollectionByNameOrId("bc3g5s66bcq1qjp"); - - return dao.deleteCollection(collection); -}) diff --git a/core/pb/pb_migrations/1712450012_created_articles.js b/core/pb/pb_migrations/1712450012_created_articles.js deleted file mode 100644 index 3a3048b..0000000 --- 
a/core/pb/pb_migrations/1712450012_created_articles.js +++ /dev/null @@ -1,154 +0,0 @@ -/// -migrate((db) => { - const collection = new Collection({ - "id": "lft7642skuqmry7", - "created": "2024-04-07 00:33:32.746Z", - "updated": "2024-04-07 00:33:32.746Z", - "name": "articles", - "type": "base", - "system": false, - "schema": [ - { - "system": false, - "id": "yttga2xi", - "name": "title", - "type": "text", - "required": true, - "presentable": false, - "unique": false, - "options": { - "min": null, - "max": null, - "pattern": "" - } - }, - { - "system": false, - "id": "99dnnabt", - "name": "url", - "type": "url", - "required": true, - "presentable": false, - "unique": false, - "options": { - "exceptDomains": [], - "onlyDomains": [] - } - }, - { - "system": false, - "id": "itplfdwh", - "name": "abstract", - "type": "text", - "required": false, - "presentable": false, - "unique": false, - "options": { - "min": null, - "max": null, - "pattern": "" - } - }, - { - "system": false, - "id": "iorna912", - "name": "content", - "type": "text", - "required": true, - "presentable": false, - "unique": false, - "options": { - "min": null, - "max": null, - "pattern": "" - } - }, - { - "system": false, - "id": "judmyhfm", - "name": "publish_time", - "type": "number", - "required": false, - "presentable": false, - "unique": false, - "options": { - "min": null, - "max": null, - "noDecimal": false - } - }, - { - "system": false, - "id": "um6thjt5", - "name": "author", - "type": "text", - "required": false, - "presentable": false, - "unique": false, - "options": { - "min": null, - "max": null, - "pattern": "" - } - }, - { - "system": false, - "id": "kvzodbm3", - "name": "images", - "type": "json", - "required": false, - "presentable": false, - "unique": false, - "options": { - "maxSize": 2000000 - } - }, - { - "system": false, - "id": "eviha2ho", - "name": "snapshot", - "type": "file", - "required": false, - "presentable": false, - "unique": false, - "options": { - "mimeTypes": [], - "thumbs": [], - "maxSelect": 1, - "maxSize": 5242880, - "protected": false - } - }, - { - "system": false, - "id": "tukuros5", - "name": "translation_result", - "type": "relation", - "required": false, - "presentable": false, - "unique": false, - "options": { - "collectionId": "bc3g5s66bcq1qjp", - "cascadeDelete": false, - "minSelect": null, - "maxSelect": 1, - "displayFields": null - } - } - ], - "indexes": [], - "listRule": null, - "viewRule": null, - "createRule": null, - "updateRule": null, - "deleteRule": null, - "options": {} - }); - - return Dao(db).saveCollection(collection); -}, (db) => { - const dao = new Dao(db); - const collection = dao.findCollectionByNameOrId("lft7642skuqmry7"); - - return dao.deleteCollection(collection); -}) diff --git a/core/pb/pb_migrations/1712450207_updated_article_translation.js b/core/pb/pb_migrations/1712450207_updated_article_translation.js deleted file mode 100644 index 09c03b7..0000000 --- a/core/pb/pb_migrations/1712450207_updated_article_translation.js +++ /dev/null @@ -1,52 +0,0 @@ -/// -migrate((db) => { - const dao = new Dao(db) - const collection = dao.findCollectionByNameOrId("bc3g5s66bcq1qjp") - - // add - collection.schema.addField(new SchemaField({ - "system": false, - "id": "tmwf6icx", - "name": "raw", - "type": "relation", - "required": false, - "presentable": false, - "unique": false, - "options": { - "collectionId": "lft7642skuqmry7", - "cascadeDelete": false, - "minSelect": null, - "maxSelect": 1, - "displayFields": null - } - })) - - // add - 
collection.schema.addField(new SchemaField({ - "system": false, - "id": "hsckiykq", - "name": "content", - "type": "text", - "required": false, - "presentable": false, - "unique": false, - "options": { - "min": null, - "max": null, - "pattern": "" - } - })) - - return dao.saveCollection(collection) -}, (db) => { - const dao = new Dao(db) - const collection = dao.findCollectionByNameOrId("bc3g5s66bcq1qjp") - - // remove - collection.schema.removeField("tmwf6icx") - - // remove - collection.schema.removeField("hsckiykq") - - return dao.saveCollection(collection) -}) diff --git a/core/pb/pb_migrations/1712450442_created_insights.js b/core/pb/pb_migrations/1712450442_created_insights.js deleted file mode 100644 index 0ddac56..0000000 --- a/core/pb/pb_migrations/1712450442_created_insights.js +++ /dev/null @@ -1,73 +0,0 @@ -/// -migrate((db) => { - const collection = new Collection({ - "id": "h3c6pqhnrfo4oyf", - "created": "2024-04-07 00:40:42.781Z", - "updated": "2024-04-07 00:40:42.781Z", - "name": "insights", - "type": "base", - "system": false, - "schema": [ - { - "system": false, - "id": "5hp4ulnc", - "name": "content", - "type": "text", - "required": false, - "presentable": false, - "unique": false, - "options": { - "min": null, - "max": null, - "pattern": "" - } - }, - { - "system": false, - "id": "gsozubhx", - "name": "articles", - "type": "relation", - "required": false, - "presentable": false, - "unique": false, - "options": { - "collectionId": "lft7642skuqmry7", - "cascadeDelete": false, - "minSelect": null, - "maxSelect": null, - "displayFields": null - } - }, - { - "system": false, - "id": "iiwkyzr2", - "name": "docx", - "type": "file", - "required": false, - "presentable": false, - "unique": false, - "options": { - "mimeTypes": [], - "thumbs": [], - "maxSelect": 1, - "maxSize": 5242880, - "protected": false - } - } - ], - "indexes": [], - "listRule": null, - "viewRule": null, - "createRule": null, - "updateRule": null, - "deleteRule": null, - "options": {} - }); - - return Dao(db).saveCollection(collection); -}, (db) => { - const dao = new Dao(db); - const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf"); - - return dao.deleteCollection(collection); -}) diff --git a/core/pb/pb_migrations/1713322324_created_sites.js b/core/pb/pb_migrations/1713322324_created_sites.js deleted file mode 100644 index 2672a1b..0000000 --- a/core/pb/pb_migrations/1713322324_created_sites.js +++ /dev/null @@ -1,54 +0,0 @@ -/// -migrate((db) => { - const collection = new Collection({ - "id": "sma08jpi5rkoxnh", - "created": "2024-04-17 02:52:04.291Z", - "updated": "2024-04-17 02:52:04.291Z", - "name": "sites", - "type": "base", - "system": false, - "schema": [ - { - "system": false, - "id": "6qo4l7og", - "name": "url", - "type": "url", - "required": false, - "presentable": false, - "unique": false, - "options": { - "exceptDomains": null, - "onlyDomains": null - } - }, - { - "system": false, - "id": "lgr1quwi", - "name": "per_hours", - "type": "number", - "required": false, - "presentable": false, - "unique": false, - "options": { - "min": 1, - "max": 24, - "noDecimal": false - } - } - ], - "indexes": [], - "listRule": null, - "viewRule": null, - "createRule": null, - "updateRule": null, - "deleteRule": null, - "options": {} - }); - - return Dao(db).saveCollection(collection); -}, (db) => { - const dao = new Dao(db); - const collection = dao.findCollectionByNameOrId("sma08jpi5rkoxnh"); - - return dao.deleteCollection(collection); -}) diff --git a/core/pb/pb_migrations/1713328405_updated_sites.js 
b/core/pb/pb_migrations/1713328405_updated_sites.js deleted file mode 100644 index f1f8417..0000000 --- a/core/pb/pb_migrations/1713328405_updated_sites.js +++ /dev/null @@ -1,74 +0,0 @@ -/// -migrate((db) => { - const dao = new Dao(db) - const collection = dao.findCollectionByNameOrId("sma08jpi5rkoxnh") - - // update - collection.schema.addField(new SchemaField({ - "system": false, - "id": "6qo4l7og", - "name": "url", - "type": "url", - "required": true, - "presentable": false, - "unique": false, - "options": { - "exceptDomains": null, - "onlyDomains": null - } - })) - - // update - collection.schema.addField(new SchemaField({ - "system": false, - "id": "lgr1quwi", - "name": "per_hours", - "type": "number", - "required": true, - "presentable": false, - "unique": false, - "options": { - "min": 1, - "max": 24, - "noDecimal": false - } - })) - - return dao.saveCollection(collection) -}, (db) => { - const dao = new Dao(db) - const collection = dao.findCollectionByNameOrId("sma08jpi5rkoxnh") - - // update - collection.schema.addField(new SchemaField({ - "system": false, - "id": "6qo4l7og", - "name": "url", - "type": "url", - "required": false, - "presentable": false, - "unique": false, - "options": { - "exceptDomains": null, - "onlyDomains": null - } - })) - - // update - collection.schema.addField(new SchemaField({ - "system": false, - "id": "lgr1quwi", - "name": "per_hours", - "type": "number", - "required": false, - "presentable": false, - "unique": false, - "options": { - "min": 1, - "max": 24, - "noDecimal": false - } - })) - - return dao.saveCollection(collection) -}) diff --git a/core/pb/pb_migrations/1713329959_updated_sites.js b/core/pb/pb_migrations/1713329959_updated_sites.js deleted file mode 100644 index a49e806..0000000 --- a/core/pb/pb_migrations/1713329959_updated_sites.js +++ /dev/null @@ -1,27 +0,0 @@ -/// -migrate((db) => { - const dao = new Dao(db) - const collection = dao.findCollectionByNameOrId("sma08jpi5rkoxnh") - - // add - collection.schema.addField(new SchemaField({ - "system": false, - "id": "8x8n2a47", - "name": "activated", - "type": "bool", - "required": false, - "presentable": false, - "unique": false, - "options": {} - })) - - return dao.saveCollection(collection) -}, (db) => { - const dao = new Dao(db) - const collection = dao.findCollectionByNameOrId("sma08jpi5rkoxnh") - - // remove - collection.schema.removeField("8x8n2a47") - - return dao.saveCollection(collection) -}) diff --git a/core/pb/pb_migrations/1714803585_updated_articles.js b/core/pb/pb_migrations/1714803585_updated_articles.js deleted file mode 100644 index 453e21f..0000000 --- a/core/pb/pb_migrations/1714803585_updated_articles.js +++ /dev/null @@ -1,44 +0,0 @@ -/// -migrate((db) => { - const dao = new Dao(db) - const collection = dao.findCollectionByNameOrId("lft7642skuqmry7") - - // update - collection.schema.addField(new SchemaField({ - "system": false, - "id": "iorna912", - "name": "content", - "type": "text", - "required": false, - "presentable": false, - "unique": false, - "options": { - "min": null, - "max": null, - "pattern": "" - } - })) - - return dao.saveCollection(collection) -}, (db) => { - const dao = new Dao(db) - const collection = dao.findCollectionByNameOrId("lft7642skuqmry7") - - // update - collection.schema.addField(new SchemaField({ - "system": false, - "id": "iorna912", - "name": "content", - "type": "text", - "required": true, - "presentable": false, - "unique": false, - "options": { - "min": null, - "max": null, - "pattern": "" - } - })) - - return 
dao.saveCollection(collection) -}) diff --git a/core/pb/pb_migrations/1714835361_updated_insights.js b/core/pb/pb_migrations/1714835361_updated_insights.js deleted file mode 100644 index eb29b5b..0000000 --- a/core/pb/pb_migrations/1714835361_updated_insights.js +++ /dev/null @@ -1,31 +0,0 @@ -/// -migrate((db) => { - const dao = new Dao(db) - const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf") - - // add - collection.schema.addField(new SchemaField({ - "system": false, - "id": "d13734ez", - "name": "tag", - "type": "text", - "required": false, - "presentable": false, - "unique": false, - "options": { - "min": null, - "max": null, - "pattern": "" - } - })) - - return dao.saveCollection(collection) -}, (db) => { - const dao = new Dao(db) - const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf") - - // remove - collection.schema.removeField("d13734ez") - - return dao.saveCollection(collection) -}) diff --git a/core/pb/pb_migrations/1714955881_updated_articles.js b/core/pb/pb_migrations/1714955881_updated_articles.js deleted file mode 100644 index 1989cb4..0000000 --- a/core/pb/pb_migrations/1714955881_updated_articles.js +++ /dev/null @@ -1,31 +0,0 @@ -/// -migrate((db) => { - const dao = new Dao(db) - const collection = dao.findCollectionByNameOrId("lft7642skuqmry7") - - // add - collection.schema.addField(new SchemaField({ - "system": false, - "id": "pwy2iz0b", - "name": "source", - "type": "text", - "required": false, - "presentable": false, - "unique": false, - "options": { - "min": null, - "max": null, - "pattern": "" - } - })) - - return dao.saveCollection(collection) -}, (db) => { - const dao = new Dao(db) - const collection = dao.findCollectionByNameOrId("lft7642skuqmry7") - - // remove - collection.schema.removeField("pwy2iz0b") - - return dao.saveCollection(collection) -}) diff --git a/core/pb/pb_migrations/1715823361_created_tags.js b/core/pb/pb_migrations/1715823361_created_tags.js deleted file mode 100644 index d252a58..0000000 --- a/core/pb/pb_migrations/1715823361_created_tags.js +++ /dev/null @@ -1,51 +0,0 @@ -/// -migrate((db) => { - const collection = new Collection({ - "id": "nvf6k0yoiclmytu", - "created": "2024-05-16 01:36:01.108Z", - "updated": "2024-05-16 01:36:01.108Z", - "name": "tags", - "type": "base", - "system": false, - "schema": [ - { - "system": false, - "id": "0th8uax4", - "name": "name", - "type": "text", - "required": false, - "presentable": false, - "unique": false, - "options": { - "min": null, - "max": null, - "pattern": "" - } - }, - { - "system": false, - "id": "l6mm7m90", - "name": "activated", - "type": "bool", - "required": false, - "presentable": false, - "unique": false, - "options": {} - } - ], - "indexes": [], - "listRule": null, - "viewRule": null, - "createRule": null, - "updateRule": null, - "deleteRule": null, - "options": {} - }); - - return Dao(db).saveCollection(collection); -}, (db) => { - const dao = new Dao(db); - const collection = dao.findCollectionByNameOrId("nvf6k0yoiclmytu"); - - return dao.deleteCollection(collection); -}) diff --git a/core/pb/pb_migrations/1715824265_updated_insights.js b/core/pb/pb_migrations/1715824265_updated_insights.js deleted file mode 100644 index dd7d152..0000000 --- a/core/pb/pb_migrations/1715824265_updated_insights.js +++ /dev/null @@ -1,52 +0,0 @@ -/// -migrate((db) => { - const dao = new Dao(db) - const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf") - - // remove - collection.schema.removeField("d13734ez") - - // add - collection.schema.addField(new 
SchemaField({ - "system": false, - "id": "j65p3jji", - "name": "tag", - "type": "relation", - "required": false, - "presentable": false, - "unique": false, - "options": { - "collectionId": "nvf6k0yoiclmytu", - "cascadeDelete": false, - "minSelect": null, - "maxSelect": null, - "displayFields": null - } - })) - - return dao.saveCollection(collection) -}, (db) => { - const dao = new Dao(db) - const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf") - - // add - collection.schema.addField(new SchemaField({ - "system": false, - "id": "d13734ez", - "name": "tag", - "type": "text", - "required": false, - "presentable": false, - "unique": false, - "options": { - "min": null, - "max": null, - "pattern": "" - } - })) - - // remove - collection.schema.removeField("j65p3jji") - - return dao.saveCollection(collection) -}) diff --git a/core/pb/pb_migrations/1715852342_updated_insights.js b/core/pb/pb_migrations/1715852342_updated_insights.js deleted file mode 100644 index 6a6f8c2..0000000 --- a/core/pb/pb_migrations/1715852342_updated_insights.js +++ /dev/null @@ -1,16 +0,0 @@ -/// -migrate((db) => { - const dao = new Dao(db) - const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf") - - collection.listRule = "@request.auth.id != \"\" && @request.auth.tag:each ?~ tag:each" - - return dao.saveCollection(collection) -}, (db) => { - const dao = new Dao(db) - const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf") - - collection.listRule = null - - return dao.saveCollection(collection) -}) diff --git a/core/pb/pb_migrations/1715852638_updated_insights.js b/core/pb/pb_migrations/1715852638_updated_insights.js deleted file mode 100644 index 42efa86..0000000 --- a/core/pb/pb_migrations/1715852638_updated_insights.js +++ /dev/null @@ -1,16 +0,0 @@ -/// -migrate((db) => { - const dao = new Dao(db) - const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf") - - collection.viewRule = "@request.auth.id != \"\" && @request.auth.tag:each ?~ tag:each" - - return dao.saveCollection(collection) -}, (db) => { - const dao = new Dao(db) - const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf") - - collection.viewRule = null - - return dao.saveCollection(collection) -}) diff --git a/core/pb/pb_migrations/1715852847_updated_users.js b/core/pb/pb_migrations/1715852847_updated_users.js deleted file mode 100644 index bfe64a3..0000000 --- a/core/pb/pb_migrations/1715852847_updated_users.js +++ /dev/null @@ -1,33 +0,0 @@ -/// -migrate((db) => { - const dao = new Dao(db) - const collection = dao.findCollectionByNameOrId("_pb_users_auth_") - - // add - collection.schema.addField(new SchemaField({ - "system": false, - "id": "8d9woe75", - "name": "tag", - "type": "relation", - "required": false, - "presentable": false, - "unique": false, - "options": { - "collectionId": "nvf6k0yoiclmytu", - "cascadeDelete": false, - "minSelect": null, - "maxSelect": null, - "displayFields": null - } - })) - - return dao.saveCollection(collection) -}, (db) => { - const dao = new Dao(db) - const collection = dao.findCollectionByNameOrId("_pb_users_auth_") - - // remove - collection.schema.removeField("8d9woe75") - - return dao.saveCollection(collection) -}) diff --git a/core/pb/pb_migrations/1715852924_updated_articles.js b/core/pb/pb_migrations/1715852924_updated_articles.js deleted file mode 100644 index ff0501c..0000000 --- a/core/pb/pb_migrations/1715852924_updated_articles.js +++ /dev/null @@ -1,33 +0,0 @@ -/// -migrate((db) => { - const dao = new Dao(db) - const collection = 
dao.findCollectionByNameOrId("lft7642skuqmry7") - - // add - collection.schema.addField(new SchemaField({ - "system": false, - "id": "famdh2fv", - "name": "tag", - "type": "relation", - "required": false, - "presentable": false, - "unique": false, - "options": { - "collectionId": "nvf6k0yoiclmytu", - "cascadeDelete": false, - "minSelect": null, - "maxSelect": null, - "displayFields": null - } - })) - - return dao.saveCollection(collection) -}, (db) => { - const dao = new Dao(db) - const collection = dao.findCollectionByNameOrId("lft7642skuqmry7") - - // remove - collection.schema.removeField("famdh2fv") - - return dao.saveCollection(collection) -}) diff --git a/core/pb/pb_migrations/1715852932_updated_articles.js b/core/pb/pb_migrations/1715852932_updated_articles.js deleted file mode 100644 index 29b0cca..0000000 --- a/core/pb/pb_migrations/1715852932_updated_articles.js +++ /dev/null @@ -1,18 +0,0 @@ -/// -migrate((db) => { - const dao = new Dao(db) - const collection = dao.findCollectionByNameOrId("lft7642skuqmry7") - - collection.listRule = "@request.auth.id != \"\" && @request.auth.tag:each ?~ tag:each" - collection.viewRule = "@request.auth.id != \"\" && @request.auth.tag:each ?~ tag:each" - - return dao.saveCollection(collection) -}, (db) => { - const dao = new Dao(db) - const collection = dao.findCollectionByNameOrId("lft7642skuqmry7") - - collection.listRule = null - collection.viewRule = null - - return dao.saveCollection(collection) -}) diff --git a/core/pb/pb_migrations/1715852952_updated_article_translation.js b/core/pb/pb_migrations/1715852952_updated_article_translation.js deleted file mode 100644 index f960931..0000000 --- a/core/pb/pb_migrations/1715852952_updated_article_translation.js +++ /dev/null @@ -1,33 +0,0 @@ -/// -migrate((db) => { - const dao = new Dao(db) - const collection = dao.findCollectionByNameOrId("bc3g5s66bcq1qjp") - - // add - collection.schema.addField(new SchemaField({ - "system": false, - "id": "lbxw5pra", - "name": "tag", - "type": "relation", - "required": false, - "presentable": false, - "unique": false, - "options": { - "collectionId": "nvf6k0yoiclmytu", - "cascadeDelete": false, - "minSelect": null, - "maxSelect": null, - "displayFields": null - } - })) - - return dao.saveCollection(collection) -}, (db) => { - const dao = new Dao(db) - const collection = dao.findCollectionByNameOrId("bc3g5s66bcq1qjp") - - // remove - collection.schema.removeField("lbxw5pra") - - return dao.saveCollection(collection) -}) diff --git a/core/pb/pb_migrations/1715852974_updated_article_translation.js b/core/pb/pb_migrations/1715852974_updated_article_translation.js deleted file mode 100644 index b597bea..0000000 --- a/core/pb/pb_migrations/1715852974_updated_article_translation.js +++ /dev/null @@ -1,18 +0,0 @@ -/// -migrate((db) => { - const dao = new Dao(db) - const collection = dao.findCollectionByNameOrId("bc3g5s66bcq1qjp") - - collection.listRule = "@request.auth.id != \"\" && @request.auth.tag:each ?~ tag:each" - collection.viewRule = "@request.auth.id != \"\" && @request.auth.tag:each ?~ tag:each" - - return dao.saveCollection(collection) -}, (db) => { - const dao = new Dao(db) - const collection = dao.findCollectionByNameOrId("bc3g5s66bcq1qjp") - - collection.listRule = null - collection.viewRule = null - - return dao.saveCollection(collection) -}) diff --git a/core/pb/pb_migrations/1716165809_updated_tags.js b/core/pb/pb_migrations/1716165809_updated_tags.js deleted file mode 100644 index 7a9baf6..0000000 --- 
a/core/pb/pb_migrations/1716165809_updated_tags.js +++ /dev/null @@ -1,44 +0,0 @@ -/// -migrate((db) => { - const dao = new Dao(db) - const collection = dao.findCollectionByNameOrId("nvf6k0yoiclmytu") - - // update - collection.schema.addField(new SchemaField({ - "system": false, - "id": "0th8uax4", - "name": "name", - "type": "text", - "required": true, - "presentable": false, - "unique": false, - "options": { - "min": null, - "max": null, - "pattern": "" - } - })) - - return dao.saveCollection(collection) -}, (db) => { - const dao = new Dao(db) - const collection = dao.findCollectionByNameOrId("nvf6k0yoiclmytu") - - // update - collection.schema.addField(new SchemaField({ - "system": false, - "id": "0th8uax4", - "name": "name", - "type": "text", - "required": false, - "presentable": false, - "unique": false, - "options": { - "min": null, - "max": null, - "pattern": "" - } - })) - - return dao.saveCollection(collection) -}) diff --git a/core/pb/pb_migrations/1716168332_updated_insights.js b/core/pb/pb_migrations/1716168332_updated_insights.js deleted file mode 100644 index aa03a18..0000000 --- a/core/pb/pb_migrations/1716168332_updated_insights.js +++ /dev/null @@ -1,48 +0,0 @@ -/// -migrate((db) => { - const dao = new Dao(db) - const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf") - - // update - collection.schema.addField(new SchemaField({ - "system": false, - "id": "j65p3jji", - "name": "tag", - "type": "relation", - "required": false, - "presentable": false, - "unique": false, - "options": { - "collectionId": "nvf6k0yoiclmytu", - "cascadeDelete": false, - "minSelect": null, - "maxSelect": 1, - "displayFields": null - } - })) - - return dao.saveCollection(collection) -}, (db) => { - const dao = new Dao(db) - const collection = dao.findCollectionByNameOrId("h3c6pqhnrfo4oyf") - - // update - collection.schema.addField(new SchemaField({ - "system": false, - "id": "j65p3jji", - "name": "tag", - "type": "relation", - "required": false, - "presentable": false, - "unique": false, - "options": { - "collectionId": "nvf6k0yoiclmytu", - "cascadeDelete": false, - "minSelect": null, - "maxSelect": null, - "displayFields": null - } - })) - - return dao.saveCollection(collection) -}) diff --git a/core/pb/pb_migrations/1717321896_updated_tags.js b/core/pb/pb_migrations/1717321896_updated_tags.js deleted file mode 100644 index 9ddbbf8..0000000 --- a/core/pb/pb_migrations/1717321896_updated_tags.js +++ /dev/null @@ -1,18 +0,0 @@ -/// -migrate((db) => { - const dao = new Dao(db) - const collection = dao.findCollectionByNameOrId("nvf6k0yoiclmytu") - - collection.listRule = "@request.auth.id != \"\"" - collection.viewRule = "@request.auth.id != \"\"" - - return dao.saveCollection(collection) -}, (db) => { - const dao = new Dao(db) - const collection = dao.findCollectionByNameOrId("nvf6k0yoiclmytu") - - collection.listRule = null - collection.viewRule = null - - return dao.saveCollection(collection) -}) diff --git a/core/pb/pb_migrations/1725263585_updated_tags.js b/core/pb/pb_migrations/1725263585_updated_tags.js deleted file mode 100644 index fb293e2..0000000 --- a/core/pb/pb_migrations/1725263585_updated_tags.js +++ /dev/null @@ -1,31 +0,0 @@ -/// -migrate((db) => { - const dao = new Dao(db) - const collection = dao.findCollectionByNameOrId("nvf6k0yoiclmytu") - - // add - collection.schema.addField(new SchemaField({ - "system": false, - "id": "vkgtujiz", - "name": "explaination", - "type": "text", - "required": false, - "presentable": false, - "unique": false, - "options": { - "min": null, - 
"max": null, - "pattern": "" - } - })) - - return dao.saveCollection(collection) -}, (db) => { - const dao = new Dao(db) - const collection = dao.findCollectionByNameOrId("nvf6k0yoiclmytu") - - // remove - collection.schema.removeField("vkgtujiz") - - return dao.saveCollection(collection) -}) diff --git a/core/requirements.txt b/core/requirements.txt index 167f0c7..4381b02 100644 --- a/core/requirements.txt +++ b/core/requirements.txt @@ -1,12 +1,8 @@ openai loguru -gne -jieba -httpx pocketbase pydantic -uvicorn json_repair==0.* beautifulsoup4 -fastapi -requests \ No newline at end of file +requests +crawlee[playwright] \ No newline at end of file diff --git a/core/run.sh b/core/run.sh new file mode 100755 index 0000000..d7668cf --- /dev/null +++ b/core/run.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +set -o allexport +source .env +set +o allexport + +if ! pgrep -x "pocketbase" > /dev/null; then + if ! netstat -tuln | grep ":8090" > /dev/null && ! lsof -i :8090 > /dev/null; then + echo "Starting PocketBase..." + ../pb/pocketbase serve --http=127.0.0.1:8090 & + else + echo "Port 8090 is already in use." + fi +else + echo "PocketBase is already running." +fi + +python general_process.py \ No newline at end of file diff --git a/core/run_task.sh b/core/run_task.sh new file mode 100755 index 0000000..db1f993 --- /dev/null +++ b/core/run_task.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +set -o allexport +source .env +set +o allexport + +if ! pgrep -x "pocketbase" > /dev/null; then + if ! netstat -tuln | grep ":8090" > /dev/null && ! lsof -i :8090 > /dev/null; then + echo "Starting PocketBase..." + ../pb/pocketbase serve --http=127.0.0.1:8090 & + else + echo "Port 8090 is already in use." + fi +else + echo "PocketBase is already running." +fi + +python tasks.py \ No newline at end of file diff --git a/core/scrapers/README.md b/core/scrapers/README.md deleted file mode 100644 index 3e04db0..0000000 --- a/core/scrapers/README.md +++ /dev/null @@ -1,56 +0,0 @@ -We provide a general page parser that can intelligently retrieve article lists from sources. For each article URL, it first attempts to use `gne` for parsing, and if that fails, it will try using `llm`. - -This solution allows scanning and extracting information from most general news and portal sources. - -**However, we strongly recommend that users develop custom parsers for specific sources tailored to their actual business scenarios for more ideal and efficient scanning.** - -We also provide a parser specifically for WeChat public articles (mp.weixin.qq.com). - -**If you are willing to contribute your custom source-specific parsers to this repository, we would greatly appreciate it!** - -## Custom Source Parser Development Specifications - -### Specifications - -**Remember It should be an asynchronous function** - -1. **The parser should be able to intelligently distinguish between article list pages and article detail pages.** -2. **The parser's input parameters should only include `url` and `logger`:** - - `url` is the complete address of the source (type `str`). - - `logger` is the logging object (please do not configure a separate logger for your custom source parser). -3. **The parser's output should include `flag` and `result`, formatted as `tuple[int, Union[set, dict]]`:** - - If the `url` is an article list page, `flag` returns `1`, and `result` returns a tuple of all article page URLs (`set`). 
- - If the `url` is an article page, `flag` returns `11`, and `result` returns all article details (`dict`), in the following format: - - ```python - {'url': str, 'title': str, 'author': str, 'publish_time': str, 'content': str, 'abstract': str, 'images': [str]} - ``` - - _Note: `title` and `content` cannot be empty._ - - **Note: `publish_time` should be in the format `"%Y%m%d"` (date only, no `-`). If the scraper cannot fetch it, use the current date.** - - - If parsing fails, `flag` returns `0`, and `result` returns an empty dictionary `{}`. - - _`pipeline` will try other parsing solutions (if any) upon receiving `flag` 0._ - - - If page retrieval fails (e.g., network issues), `flag` returns `-7`, and `result` returns an empty dictionary `{}`. - - _`pipeline` will not attempt to parse again in the same process upon receiving `flag` -7._ - -### Registration - -After writing your scraper, place the scraper program in this folder and register the scraper in `scraper_map` under `__init__.py`, similar to: - -```python -{'domain': 'crawler def name'} -``` - -It is recommended to use urllib.parse to get the domain: - -```python -from urllib.parse import urlparse - -parsed_url = urlparse("site's url") -domain = parsed_url.netloc -``` \ No newline at end of file diff --git a/core/scrapers/README_CN.md b/core/scrapers/README_CN.md deleted file mode 100644 index 3e39510..0000000 --- a/core/scrapers/README_CN.md +++ /dev/null @@ -1,56 +0,0 @@ -我们提供了一个通用页面解析器,该解析器可以智能获取信源文章列表。对于每个文章 URL,会先尝试使用 `gne` 进行解析,如果失败,再尝试使用 `llm` 进行解析。 - -通过这个方案,可以实现对大多数普通新闻类、门户类信源的扫描和信息提取。 - -**然而,我们依然强烈建议用户根据实际业务场景编写针对特定信源的专有解析器,以实现更理想且高效的扫描。** - -此外,我们提供了一个专门针对微信公众号文章(mp.weixin.qq.com)的解析器。 - -**如果您愿意将您撰写的特定信源专有解析器贡献至本代码仓库,我们将不胜感激!** - -## 专有信源解析器开发规范 - -### 规范 - -**记住:这应该是一个异步函数** - -1. **解析器应能智能区分文章列表页面和文章详情页面。** -2. **解析器入参只包括 `url` 和 `logger` 两项:** - - `url` 是信源完整地址(`str` 类型) - - `logger` 是日志对象(请勿为您的专有信源解析器单独配置 `logger`) -3. **解析器出参包括 `flag` 和 `result` 两项,格式为 `tuple[int, Union[set, dict]]`:** - - 如果 `url` 是文章列表页面,`flag` 返回 `1`,`result` 返回解析出的全部文章页面 URL 集合(`set`)。 - - 如果 `url` 是文章页面,`flag` 返回 `11`,`result` 返回解析出的全部文章详情(`dict`),格式如下: - - ```python - {'url': str, 'title': str, 'author': str, 'publish_time': str, 'content': str, 'abstract': str, 'images': [str]} - ``` - - _注意:`title` 和 `content` 两项不能为空。_ - - **注意:`publish_time` 格式为 `"%Y%m%d"`(仅日期,没有 `-`),如果爬虫抓不到可以用当天日期。** - - - 如果解析失败,`flag` 返回 `0`,`result` 返回空字典 `{}`。 - - _`pipeline` 收到 `flag` 0 会尝试其他解析方案(如有)。_ - - - 如果页面获取失败(如网络问题),`flag` 返回 `-7`,`result` 返回空字典 `{}`。 - - _`pipeline` 收到 `flag` -7, 同一进程内不会再次尝试解析。_ - -### 注册 - -写好爬虫后,将爬虫程序放在该文件夹,并在 `__init__.py` 下的 `scraper_map` 中注册爬虫,类似: - -```python -{'domain': 'crawler def name'} -``` - -建议使用 urllib.parse 获取 domain: - -```python -from urllib.parse import urlparse - -parsed_url = urlparse("site's url") -domain = parsed_url.netloc -``` \ No newline at end of file diff --git a/core/scrapers/README_de.md b/core/scrapers/README_de.md deleted file mode 100644 index 4200859..0000000 --- a/core/scrapers/README_de.md +++ /dev/null @@ -1,56 +0,0 @@ -Wir bieten einen allgemeinen Seitenparser an, der intelligent Artikellisten von Quellen abrufen kann. Für jede Artikel-URL wird zuerst versucht, `gne` zur Analyse zu verwenden. Falls dies fehlschlägt, wird `llm` als Alternative genutzt. - -Diese Lösung ermöglicht das Scannen und Extrahieren von Informationen aus den meisten allgemeinen Nachrichtenquellen und Portalen. 
- -**Wir empfehlen jedoch dringend, benutzerdefinierte Parser für spezifische Quellen zu entwickeln, die auf Ihre tatsächlichen Geschäftsszenarien abgestimmt sind, um eine idealere und effizientere Erfassung zu erreichen.** - -Wir stellen auch einen speziellen Parser für WeChat-Artikel (mp.weixin.qq.com) bereit. - -**Falls Sie bereit sind, Ihre speziell entwickelten Parser für bestimmte Quellen zu diesem Code-Repository beizutragen, wären wir Ihnen sehr dankbar!** - -## Entwicklungsspezifikationen für benutzerdefinierte Quellparser - -### Spezifikationen - -**Denken Sie daran: Es sollte eine asynchrone Funktion sein** - -1. **Der Parser sollte in der Lage sein, intelligent zwischen Artikel-Listen-Seiten und Artikel-Detailseiten zu unterscheiden.** -2. **Die Eingabeparameter des Parsers sollten nur `url` und `logger` umfassen:** - - `url` ist die vollständige Adresse der Quelle (Typ `str`). - - `logger` ist das Protokollierungsobjekt (bitte konfigurieren Sie keinen separaten Logger für Ihren benutzerdefinierten Quellparser). -3. **Die Ausgabe des Parsers sollte `flag` und `result` umfassen, im Format `tuple[int, Union[set, dict]]`:** - - Wenn die `url` eine Artikellisten-Seite ist, gibt `flag` `1` zurück, und `result` gibt eine satz aller Artikel-URLs (`set`) zurück. - - Wenn die `url` eine Artikelseite ist, gibt `flag` `11` zurück, und `result` gibt alle Artikeldetails (`dict`) zurück, im folgenden Format: - - ```python - {'url': str, 'title': str, 'author': str, 'publish_time': str, 'content': str, 'abstract': str, 'images': [str]} - ``` - - _Hinweis: `title` und `content` dürfen nicht leer sein._ - - **Hinweis: Das `publish_time`-Format muss `"%Y%m%d"` (nur Datum, ohne `-`) sein. Wenn der Scraper es nicht erfassen kann, verwenden Sie das aktuelle Datum.** - - - Wenn die Analyse fehlschlägt, gibt `flag` `0` zurück, und `result` gibt ein leeres Wörterbuch `{}` zurück. - - _Der `pipeline` versucht andere Analysemethoden (falls vorhanden), wenn `flag` 0 zurückgegeben wird._ - - - Wenn das Abrufen der Seite fehlschlägt (z. B. aufgrund von Netzwerkproblemen), gibt `flag` `-7` zurück, und `result` gibt ein leeres Wörterbuch `{}` zurück. - - _Der `pipeline` wird im gleichen Prozess keine weiteren Versuche zur Analyse unternehmen, wenn `flag` -7 zurückgegeben wird._ - -### Registrierung - -Nach dem Schreiben Ihres Scrapers platzieren Sie das Scraper-Programm in diesem Ordner und registrieren den Scraper in `scraper_map` in `__init__.py`, wie folgt: - -```python -{'domain': 'Crawler-Funktionsname'} -``` - -Es wird empfohlen, urllib.parse zur Ermittlung der domain zu verwenden: - -```python -from urllib.parse import urlparse - -parsed_url = urlparse("l'URL du site") -domain = parsed_url.netloc -``` \ No newline at end of file diff --git a/core/scrapers/README_fr.md b/core/scrapers/README_fr.md deleted file mode 100644 index d96583f..0000000 --- a/core/scrapers/README_fr.md +++ /dev/null @@ -1,56 +0,0 @@ -Nous proposons un analyseur de pages général capable de récupérer intelligemment les listes d'articles de sources d'information. Pour chaque URL d'article, il tente d'abord d'utiliser `gne` pour l'analyse, et en cas d'échec, il essaie d'utiliser `llm`. - -Cette solution permet de scanner et d'extraire des informations de la plupart des sources de nouvelles générales et des portails d'information. 
- -**Cependant, nous recommandons vivement aux utilisateurs de développer des analyseurs personnalisés pour des sources spécifiques en fonction de leurs scénarios d'affaires réels afin d'obtenir une analyse plus idéale et plus efficace.** - -Nous fournissons également un analyseur spécialement conçu pour les articles publics WeChat (mp.weixin.qq.com). - -**Si vous êtes disposé à contribuer vos analyseurs spécifiques à certaines sources à ce dépôt de code, nous vous en serions très reconnaissants !** - -## Spécifications pour le Développement d'Analyseurs Spécifiques - -### Spécifications - -**N'oubliez pas : il devrait s'agir d'une fonction asynchrone** - -1. **L'analyseur doit être capable de distinguer intelligemment entre les pages de liste d'articles et les pages de détail des articles.** -2. **Les paramètres d'entrée de l'analyseur doivent uniquement inclure `url` et `logger` :** - - `url` est l'adresse complète de la source (type `str`). - - `logger` est l'objet de journalisation (ne configurez pas de logger séparé pour votre analyseur spécifique). -3. **Les paramètres de sortie de l'analyseur doivent inclure `flag` et `result`, formatés comme `tuple[int, Union[set, dict]]` :** - - Si l'URL est une page de liste d'articles, `flag` renvoie `1` et `result` renvoie la set de toutes les URL des pages d'articles (`set`). - - Si l'URL est une page d'article, `flag` renvoie `11` et `result` renvoie tous les détails de l'article (`dict`), au format suivant : - - ```python - {'url': str, 'title': str, 'author': str, 'publish_time': str, 'content': str, 'abstract': str, 'images': [str]} - ``` - - _Remarque : `title` et `content` ne peuvent pas être vides._ - - **Remarque : `publish_time` doit être au format `"%Y%m%d"` (date uniquement, sans `-`). Si le scraper ne peut pas le récupérer, utilisez la date du jour.** - - - En cas d'échec de l'analyse, `flag` renvoie `0` et `result` renvoie un dictionnaire vide `{}`. - - _Le `pipeline` essaiera d'autres solutions d'analyse (si disponibles) après avoir reçu `flag` 0._ - - - En cas d'échec de la récupération de la page (par exemple, problème réseau), `flag` renvoie `-7` et `result` renvoie un dictionnaire vide `{}`. - - _Le `pipeline` n'essaiera pas de réanalyser dans le même processus après avoir reçu `flag` -7._ - -### Enregistrement - -Après avoir écrit votre scraper, placez le programme du scraper dans ce dossier et enregistrez le scraper dans `scraper_map` sous `__init__.py`, de manière similaire : - -```python -{'domain': 'nom de la fonction de crawler'} -``` - -Il est recommandé d'utiliser urllib.parse pour obtenir le domain : - -```python -from urllib.parse import urlparse - -parsed_url = urlparse("l'URL du site") -domain = parsed_url.netloc -``` \ No newline at end of file diff --git a/core/scrapers/README_jp.md b/core/scrapers/README_jp.md deleted file mode 100644 index d5c6bc0..0000000 --- a/core/scrapers/README_jp.md +++ /dev/null @@ -1,56 +0,0 @@ -汎用ページパーサーを提供しており、このパーサーは信頼できるソースから記事リストをインテリジェントに取得します。各記事URLに対して、まず `gne` を使用して解析を試み、失敗した場合は `llm` を使用して解析します。 - -このソリューションにより、ほとんどの一般的なニュースサイトやポータルサイトからの情報をスキャンして抽出することができます。 - -**しかし、より理想的かつ効率的なスキャンを実現するために、ユーザー自身のビジネスシナリオに応じた特定のソース専用のパーサーを開発することを強くお勧めします。** - -また、WeChat 公共アカウントの記事(mp.weixin.qq.com)に特化したパーサーも提供しています。 - -**特定のソース専用に開発したパーサーをこのリポジトリに貢献していただける場合は、大変感謝いたします!** - -## 特定ソースパーサー開発規範 - -### 規範 - -**覚えておいてください:それは非同期関数でなければなりません** - -1. **パーサーは、記事リストページと記事詳細ページをインテリジェントに区別できる必要があります。** -2. 
**パーサーの入力パラメーターは `url` と `logger` のみを含むべきです:** - - `url` はソースの完全なアドレス(`str` タイプ) - - `logger` はロギングオブジェクト(専用のロガーを構成しないでください) -3. **パーサーの出力は `flag` と `result` を含み、形式は `tuple[int, Union[set, dict]]`:** - - `url` が記事リストページの場合、`flag` は `1` を返し、`result` はすべての記事ページURLのコレクション(`set`)を返します。 - - `url` が記事ページの場合、`flag` は `11` を返し、`result` はすべての記事詳細(`dict`)を返します。形式は以下の通りです: - - ```python - {'url': str, 'title': str, 'author': str, 'publish_time': str, 'content': str, 'abstract': str, 'images': [str]} - ``` - - _注意:`title` と `content` は空であってはなりません。_ - - **注意:`publish_time` の形式は `"%Y%m%d"`(日付のみ、`-` はなし)である必要があります。スクレイパーが取得できない場合は、当日の日付を使用してください。** - - - 解析に失敗した場合、`flag` は `0` を返し、`result` は空の辞書 `{}` を返します。 - - _`pipeline` は `flag` 0 を受け取ると他の解析ソリューション(存在する場合)を試みます。_ - - - ページの取得に失敗した場合(例えば、ネットワークの問題)、`flag` は `-7` を返し、`result` は空の辞書 `{}` を返します。 - - _`pipeline` は `flag` -7 を受け取ると、同一プロセス内では再解析を試みません。_ - -### 登録 - -スクレイパーを作成したら、このフォルダにプログラムを配置し、`__init__.py` の `scraper_map` にスクレイパーを次のように登録してください: - -```python -{'domain': 'スクレイパー関数名'} -``` - -domain の取得には urllib.parse を使用することをお勧めします: - -```python -from urllib.parse import urlparse - -parsed_url = urlparse("l'URL du site") -domain = parsed_url.netloc -``` \ No newline at end of file diff --git a/core/scrapers/__init__.py b/core/scrapers/__init__.py deleted file mode 100644 index 8e7a776..0000000 --- a/core/scrapers/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .mp_crawler import mp_crawler - - -scraper_map = {'mp.weixin.qq.com': mp_crawler} diff --git a/core/scrapers/general_crawler.py b/core/scrapers/general_crawler.py deleted file mode 100644 index 46766e8..0000000 --- a/core/scrapers/general_crawler.py +++ /dev/null @@ -1,228 +0,0 @@ -# -*- coding: utf-8 -*- -# when you use this general crawler, remember followings -# When you receive flag -7, it means that the problem occurs in the HTML fetch process. -# When you receive flag 0, it means that the problem occurred during the content parsing process. -# when you receive flag 1, the result would be a tuple, means that the input url is possible a article_list page -# and the set contains the url of the articles. -# when you receive flag 11, you will get the dict contains the title, content, url, date, and the source of the article. 
- -from gne import GeneralNewsExtractor -import httpx -from bs4 import BeautifulSoup -from datetime import datetime -from urllib.parse import urlparse -from llms.openai_wrapper import openai_llm -# from llms.siliconflow_wrapper import sfa_llm -from bs4.element import Comment -from utils.general_utils import extract_and_convert_dates -import asyncio -import json_repair -import os -from typing import Union -from requests.compat import urljoin -from scrapers import scraper_map - - -model = os.environ.get('HTML_PARSE_MODEL', 'gpt-4o-mini-2024-07-18') -header = { - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'} -extractor = GeneralNewsExtractor() - - -def tag_visible(element: Comment) -> bool: - if element.parent.name in ["style", "script", "head", "title", "meta", "[document]"]: - return False - if isinstance(element, Comment): - return False - return True - - -def text_from_soup(soup: BeautifulSoup) -> str: - res = [] - texts = soup.find_all(string=True) - visible_texts = filter(tag_visible, texts) - for v in visible_texts: - res.append(v) - text = "\n".join(res) - return text.strip() - - -sys_info = '''Your task is to operate as an HTML content extractor, focusing on parsing a provided HTML segment. Your objective is to retrieve the following details directly from the raw text within the HTML, without summarizing or altering the content: - -- The document's title -- The complete main content, as it appears in the HTML, comprising all textual elements considered part of the core article body -- The publication time in its original format found within the HTML - -Ensure your response fits the following JSON structure, accurately reflecting the extracted data without modification: - -```json -{ - "title": "The Document's Exact Title", - "content": "All the unaltered primary text content from the article", - "publish_time": "Original Publication Time as per HTML" -} -``` - -It is essential that your output adheres strictly to this format, with each field filled based on the untouched information extracted directly from the HTML source.''' - - -async def general_crawler(url: str, logger) -> tuple[int, Union[set, dict]]: - """ - Return article information dict and flag, negative number is error, 0 is no result, 1 is for article_list page, - 11 is success - - main work flow: - (for weixin public account articles, which startswith mp.weixin.qq use mp_crawler) - first get the content with httpx - then judge is article list (return all article url and flag 1) or article detail page - then try to use gne to extract the information - when fail, try to use a llm to analysis the html - """ - - # 0. if there's a scraper for this domain, use it (such as mp.weixin.qq.com) - parsed_url = urlparse(url) - domain = parsed_url.netloc - base_url = f"{parsed_url.scheme}://{domain}" - if domain in scraper_map: - return await scraper_map[domain](url, logger) - - # 1. get the content with httpx - async with httpx.AsyncClient() as client: - for retry in range(2): - try: - response = await client.get(url, headers=header, timeout=30) - response.raise_for_status() - break - except Exception as e: - if retry < 1: - logger.info(f"can not reach\n{e}\nwaiting 1min") - await asyncio.sleep(60) - else: - logger.error(e) - return -7, {} - - # 2. 
judge is article list (return all article url and flag 1) or article detail page - page_source = response.text - if page_source: - text = page_source - else: - try: - text = response.content.decode('utf-8') - except UnicodeDecodeError: - try: - text = response.content.decode('gbk') - except Exception as e: - logger.error(f"can not decode html {e}") - return -7, {} - - soup = BeautifulSoup(text, "html.parser") - # Note: The scheme used here is very crude, - # it is recommended to write a separate parser for specific business scenarios - # Parse all URLs - if len(url) < 50: - urls = set() - for link in soup.find_all("a", href=True): - absolute_url = urljoin(base_url, link["href"]) - format_url = urlparse(absolute_url) - # only record same domain links - if not format_url.netloc or format_url.netloc != domain: - continue - # remove hash fragment - absolute_url = f"{format_url.scheme}://{format_url.netloc}{format_url.path}{format_url.params}{format_url.query}" - if absolute_url != url: - urls.add(absolute_url) - - if len(urls) > 24: - logger.info(f"{url} is more like an article list page, find {len(urls)} urls with the same netloc") - return 1, urls - - # 3. try to use gne to extract the information - try: - result = extractor.extract(text) - if 'meta' in result: - del result['meta'] - - if result['title'].startswith('服务器错误') or result['title'].startswith('您访问的页面') or result[ - 'title'].startswith('403') \ - or result['content'].startswith('This website uses cookies') or result['title'].startswith('出错了'): - logger.warning(f"can not get {url} from the Internet") - return -7, {} - - if len(result['title']) < 4 or len(result['content']) < 24: - logger.info(f"gne extract not good: {result}") - result = None - except Exception as e: - logger.info(f"gne extract error: {e}") - result = None - - # 4. try to use a llm to analysis the html - if not result: - html_text = text_from_soup(soup) - html_lines = html_text.split('\n') - html_lines = [line.strip() for line in html_lines if line.strip()] - html_text = "\n".join(html_lines) - if len(html_text) > 29999: - logger.info(f"{url} content too long for llm parsing") - return 0, {} - - if not html_text or html_text.startswith('服务器错误') or html_text.startswith( - '您访问的页面') or html_text.startswith('403') \ - or html_text.startswith('出错了'): - logger.warning(f"can not get {url} from the Internet") - return -7, {} - - messages = [ - {"role": "system", "content": sys_info}, - {"role": "user", "content": html_text} - ] - llm_output = openai_llm(messages, model=model, logger=logger, temperature=0.01) - result = json_repair.repair_json(llm_output, return_objects=True) - logger.debug(f"decoded_object: {result}") - - if not isinstance(result, dict): - logger.debug("failed to parse from llm output") - return 0, {} - - if 'title' not in result or 'content' not in result: - logger.debug("llm parsed result not good") - return 0, {} - - # Extract the picture link, it will be empty if it cannot be extracted. - image_links = [] - images = soup.find_all("img") - for img in images: - try: - image_links.append(urljoin(base_url, img["src"])) - except KeyError: - continue - result["images"] = image_links - - # Extract the author information, if it cannot be extracted, it will be empty. - author_element = soup.find("meta", {"name": "author"}) - if author_element: - result["author"] = author_element["content"] - else: - result["author"] = "" - - # 5. 
post process - date_str = extract_and_convert_dates(result.get('publish_time', '')) - if date_str: - result['publish_time'] = date_str - else: - result['publish_time'] = datetime.strftime(datetime.today(), "%Y%m%d") - - from_site = domain.replace('www.', '') - from_site = from_site.split('.')[0] - result['content'] = f"[from {from_site}] {result['content']}" - - try: - meta_description = soup.find("meta", {"name": "description"}) - if meta_description: - result['abstract'] = f"[from {from_site}] {meta_description['content'].strip()}" - else: - result['abstract'] = '' - except Exception: - result['abstract'] = '' - - result['url'] = url - return 11, result diff --git a/core/scrapers/mp_crawler.py b/core/scrapers/mp_crawler.py deleted file mode 100644 index f7d666c..0000000 --- a/core/scrapers/mp_crawler.py +++ /dev/null @@ -1,129 +0,0 @@ -# -*- coding: utf-8 -*- - -from typing import Union -import httpx -from bs4 import BeautifulSoup -from datetime import datetime -import re -import asyncio - - -header = { - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'} - - -async def mp_crawler(url: str, logger) -> tuple[int, Union[set, dict]]: - if not url.startswith('https://mp.weixin.qq.com') and not url.startswith('http://mp.weixin.qq.com'): - logger.warning(f'{url} is not a mp url, you should not use this function') - return -5, {} - - url = url.replace("http://", "https://", 1) - - async with httpx.AsyncClient() as client: - for retry in range(2): - try: - response = await client.get(url, headers=header, timeout=30) - response.raise_for_status() - break - except Exception as e: - if retry < 1: - logger.info(f"{e}\nwaiting 1min") - await asyncio.sleep(60) - else: - logger.warning(e) - return -7, {} - - soup = BeautifulSoup(response.text, 'html.parser') - - if url.startswith('https://mp.weixin.qq.com/mp/appmsgalbum'): - # 文章目录 - urls = {li.attrs['data-link'].replace("http://", "https://", 1) for li in soup.find_all('li', class_='album__list-item')} - simple_urls = set() - for url in urls: - cut_off_point = url.find('chksm=') - if cut_off_point != -1: - url = url[:cut_off_point - 1] - simple_urls.add(url) - return 1, simple_urls - - # Get the original release date first - pattern = r"var createTime = '(\d{4}-\d{2}-\d{2}) \d{2}:\d{2}'" - match = re.search(pattern, response.text) - - if match: - date_only = match.group(1) - publish_time = date_only.replace('-', '') - else: - publish_time = datetime.strftime(datetime.today(), "%Y%m%d") - - # Get description content from < meta > tag - try: - meta_description = soup.find('meta', attrs={'name': 'description'}) - summary = meta_description['content'].strip() if meta_description else '' - # card_info = soup.find('div', id='img-content') - # Parse the required content from the < div > tag - rich_media_title = soup.find('h1', id='activity-name').text.strip() \ - if soup.find('h1', id='activity-name') \ - else soup.find('h1', class_='rich_media_title').text.strip() - profile_nickname = soup.find('div', class_='wx_follow_nickname').text.strip() - except Exception as e: - logger.warning(f"not mp format: {url}\n{e}") - # For mp.weixin.qq.com types, mp_crawler won't work, and most likely neither will the other two - return -7, {} - - if not rich_media_title or not profile_nickname: - logger.warning(f"failed to analysis {url}, no title or profile_nickname") - return -7, {} - - # Parse text and image links within the content interval - # Todo This scheme is 
compatible with picture sharing MP articles, but the pictures of the content cannot be obtained, - # because the structure of this part is completely different, and a separate analysis scheme needs to be written - # (but the proportion of this type of article is not high). - texts = [] - images = set() - content_area = soup.find('div', id='js_content') - if content_area: - # 提取文本 - for section in content_area.find_all(['section', 'p'], recursive=False): # 遍历顶级section - text = section.get_text(separator=' ', strip=True) - if text and text not in texts: - texts.append(text) - - for img in content_area.find_all('img', class_='rich_pages wxw-img'): - img_src = img.get('data-src') or img.get('src') - if img_src: - images.add(img_src) - cleaned_texts = [t for t in texts if t.strip()] - content = '\n'.join(cleaned_texts) - else: - logger.warning(f"failed to analysis contents {url}") - return 0, {} - if content: - content = f"[from {profile_nickname}]{content}" - else: - # If the content does not have it, but the summary has it, it means that it is an mp of the picture sharing type. - # At this time, you can use the summary as the content. - content = f"[from {profile_nickname}]{summary}" - - # Get links to images in meta property = "og: image" and meta property = "twitter: image" - og_image = soup.find('meta', property='og:image') - twitter_image = soup.find('meta', property='twitter:image') - if og_image: - images.add(og_image['content']) - if twitter_image: - images.add(twitter_image['content']) - - if rich_media_title == summary or not summary: - abstract = '' - else: - abstract = f"[from {profile_nickname}]{rich_media_title}——{summary}" - - return 11, { - 'title': rich_media_title, - 'author': profile_nickname, - 'publish_time': publish_time, - 'abstract': abstract, - 'content': content, - 'images': list(images), - 'url': url, - } diff --git a/core/scripts/start_backend.sh b/core/scripts/start_backend.sh deleted file mode 100644 index 9963617..0000000 --- a/core/scripts/start_backend.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -set -o allexport -source ../.env -set +o allexport -exec uvicorn backend:app --reload --host localhost --port 8077 \ No newline at end of file diff --git a/core/scripts/start_pb.sh b/core/scripts/start_pb.sh deleted file mode 100644 index c577224..0000000 --- a/core/scripts/start_pb.sh +++ /dev/null @@ -1 +0,0 @@ -pb/pocketbase serve \ No newline at end of file diff --git a/core/scripts/start_tasks.sh b/core/scripts/start_tasks.sh deleted file mode 100644 index 164f3ca..0000000 --- a/core/scripts/start_tasks.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -set -o allexport -source ../.env -set +o allexport -exec python tasks.py \ No newline at end of file diff --git a/core/tasks.py b/core/tasks.py index f8248db..f214a59 100644 --- a/core/tasks.py +++ b/core/tasks.py @@ -1,26 +1,25 @@ import asyncio -from insights import pipeline, pb, logger +from general_process import crawler, pb, wiseflow_logger counter = 1 -async def process_site(site, counter): - if not site['per_hours'] or not site['url']: - return - if counter % site['per_hours'] == 0: - logger.info(f"applying {site['url']}") - await pipeline(site['url'].rstrip('/')) - - async def schedule_pipeline(interval): global counter while True: + wiseflow_logger.info(f'task execute loop {counter}') sites = pb.read('sites', filter='activated=True') - logger.info(f'task execute loop {counter}') - await asyncio.gather(*[process_site(site, counter) for site in sites]) + todo_urls = set() + for site in sites: + if not 
site['per_hours'] or not site['url']: + continue + if counter % site['per_hours'] == 0: + wiseflow_logger.info(f"applying {site['url']}") + todo_urls.add(site['url'].rstrip('/')) counter += 1 - logger.info(f'task execute loop finished, work after {interval} seconds') + await crawler.run(list(todo_urls)) + wiseflow_logger.info(f'task execute loop finished, work after {interval} seconds') await asyncio.sleep(interval) diff --git a/core/utils/general_utils.py b/core/utils/general_utils.py index 4840ed1..2e04e98 100644 --- a/core/utils/general_utils.py +++ b/core/utils/general_utils.py @@ -1,7 +1,8 @@ from urllib.parse import urlparse import os import re -import jieba +# import jieba +from loguru import logger def isURL(string): @@ -71,36 +72,28 @@ def extract_and_convert_dates(input_string): if matches: break if matches: - return ''.join(matches[0]) + return '-'.join(matches[0]) return None -def get_logger_level() -> str: - level_map = { - 'silly': 'CRITICAL', - 'verbose': 'DEBUG', - 'info': 'INFO', - 'warn': 'WARNING', - 'error': 'ERROR', - } - level: str = os.environ.get('WS_LOG', 'info').lower() - if level not in level_map: - raise ValueError( - 'WiseFlow LOG should support the values of `silly`, ' - '`verbose`, `info`, `warn`, `error`' - ) - return level_map.get(level, 'info') - +def get_logger(logger_name: str, logger_file_path: str): + level = 'DEBUG' if os.environ.get("VERBOSE", "").lower() in ["true", "1"] else 'INFO' + logger_file = os.path.join(logger_file_path, f"{logger_name}.log") + if not os.path.exists(logger_file_path): + os.makedirs(logger_file_path) + logger.add(logger_file, level=level, backtrace=True, diagnose=True, rotation="50 MB") + return logger +""" def compare_phrase_with_list(target_phrase, phrase_list, threshold): - """ + Compare the similarity of a target phrase to each phrase in the phrase list. : Param target_phrase: target phrase (str) : Param phrase_list: list of str : param threshold: similarity threshold (float) : Return: list of phrases that satisfy the similarity condition (list of str) - """ + if not target_phrase: return [] # The target phrase is empty, and the empty list is returned directly. @@ -112,3 +105,4 @@ def compare_phrase_with_list(target_phrase, phrase_list, threshold): if len(target_tokens & tokens) / min(len(target_tokens), len(tokens)) > threshold] return similar_phrases +""" \ No newline at end of file diff --git a/core/utils/pb_api.py b/core/utils/pb_api.py index 0346efe..7a07160 100644 --- a/core/utils/pb_api.py +++ b/core/utils/pb_api.py @@ -29,7 +29,8 @@ class PbTalker: def read(self, collection_name: str, fields: Optional[List[str]] = None, filter: str = '', skiptotal: bool = True) -> list: results = [] - for i in range(1, 10): + i = 1 + while True: try: res = self.client.collection(collection_name).get_list(i, 500, {"filter": filter, @@ -44,6 +45,7 @@ class PbTalker: for _res in res.items: attributes = vars(_res) results.append(attributes) + i += 1 return results def add(self, collection_name: str, body: Dict) -> str: diff --git a/dashboard/README.md b/dashboard/README.md index 644c128..687c586 100644 --- a/dashboard/README.md +++ b/dashboard/README.md @@ -1,71 +1,12 @@ -**Included Web Dashboard Example**: This is optional. If you only use the data processing functions or have your own downstream task program, you can ignore everything in this folder! 
+**预计在 V0.3.9 版本提供完整的用户侧api,目前这里只是参考** -## Main Features +API 并不直接与 Core 关联,api 也是针对数据存储(包含用户设置存储)进行操作,所以这里并不影响你直接使用 core。 -1.Daily Insights Display -2.Daily Article Display -3.Appending Search for Specific Hot Topics (using Sogou engine) -4.Generating Word Reports for Specific Hot Topics +初始版本 API 预计包含: -**Note: The code here cannot be used directly. It is adapted to an older version of the backend. You need to study the latest backend code in the `core` folder and make changes, especially in parts related to database integration!** - ------------------------------------------------------------------ - -附带的web Dashboard 示例,并非必须,如果你只是使用数据处理功能,或者你有自己的下游任务程序,可以忽略这个文件夹内的一切! - -## 主要功能 - -1. 每日insights展示 -2. 每日文章展示 -3. 指定热点追加搜索(使用sougou引擎) -4. 指定热点生成word报告 - -**注意:这里的代码并不能直接使用,它适配的是旧版本的后端程序,你需要研究core文件夹下的最新后端代码,进行更改,尤其是跟数据库对接的部分!** - ------------------------------------------------------------------ - -**付属のWebダッシュボードのサンプル**:これは必須ではありません。データ処理機能のみを使用する場合、または独自の下流タスクプログラムを持っている場合は、このフォルダ内のすべてを無視できます! - -## 主な機能 - -1. 毎日のインサイト表示 - -2. 毎日の記事表示 - -3. 特定のホットトピックの追加検索(Sogouエンジンを使用) - -4. 特定のホットトピックのWordレポートの生成 - -**注意:ここにあるコードは直接使用できません。古いバージョンのバックエンドに適合しています。`core`フォルダ内の最新のバックエンドコードを調べ、特にデータベースとの連携部分について変更を行う必要があります!** - ------------------------------------------------------------------ - -**Exemple de tableau de bord Web inclus** : Ceci est facultatif. Si vous n'utilisez que les fonctions de traitement des données ou si vous avez votre propre programme de tâches en aval, vous pouvez ignorer tout ce qui se trouve dans ce dossier ! - -## Fonctions principales - -1. Affichage des insights quotidiens - -2. Affichage des articles quotidiens - -3. Recherche supplémentaire pour des sujets populaires spécifiques (en utilisant le moteur Sogou) - -4. Génération de rapports Word pour des sujets populaires spécifiques - -**Remarque : Le code ici ne peut pas être utilisé directement. Il est adapté à une version plus ancienne du backend. Vous devez étudier le code backend le plus récent dans le dossier `core` et apporter des modifications, en particulier dans les parties relatives à l'intégration de la base de données !** - ------------------------------------------------------------------ - -**Beispiel eines enthaltenen Web-Dashboards**: Dies ist optional. Wenn Sie nur die Datenverarbeitungsfunktionen verwenden oder Ihr eigenes Downstream-Aufgabenprogramm haben, können Sie alles in diesem Ordner ignorieren! - -## Hauptfunktionen - -1. Tägliche Einblicke anzeigen - -2. Tägliche Artikel anzeigen - -3. Angehängte Suche nach spezifischen Hot Topics (unter Verwendung der Sogou-Suchmaschine) - -4. Erstellen von Word-Berichten für spezifische Hot Topics - -**Hinweis: Der Code hier kann nicht direkt verwendet werden. Er ist an eine ältere Version des Backends angepasst. 
Sie müssen den neuesten Backend-Code im `core`-Ordner studieren und Änderungen vornehmen, insbesondere in den Teilen, die die Datenbankintegration betreffen!** +- 信源的增删改查; +- 兴趣点的增删改查; +- insights 的读取和查找; +- 文章的读取和查找; +- 简单的报告生成功能; +- 原始资料的翻译等。 \ No newline at end of file diff --git a/dashboard/__init__.py b/dashboard/__init__.py index ced14f9..059593d 100644 --- a/dashboard/__init__.py +++ b/dashboard/__init__.py @@ -22,7 +22,7 @@ class BackendService: def report(self, insight_id: str, topics: list[str], comment: str) -> dict: logger.debug(f'got new report request insight_id {insight_id}') - insight = pb.read('insights', filter=f'id="{insight_id}"') + insight = pb.read('agents', filter=f'id="{insight_id}"') if not insight: logger.error(f'insight {insight_id} not found') return self.build_out(-2, 'insight not found') @@ -52,7 +52,7 @@ class BackendService: if flag: file = open(docx_file, 'rb') - message = pb.upload('insights', insight_id, 'docx', f'{insight_id}.docx', file) + message = pb.upload('agents', insight_id, 'docx', f'{insight_id}.docx', file) file.close() if message: logger.debug(f'report success finish and update to: {message}') @@ -143,7 +143,7 @@ class BackendService: def more_search(self, insight_id: str) -> dict: logger.debug(f'got search request for insight: {insight_id}') - insight = pb.read('insights', filter=f'id="{insight_id}"') + insight = pb.read('agents', filter=f'id="{insight_id}"') if not insight: logger.error(f'insight {insight_id} not found') return self.build_out(-2, 'insight not found') @@ -169,7 +169,7 @@ class BackendService: with open(os.path.join(self.cache_url, 'cache_articles.json'), 'a', encoding='utf-8') as f: json.dump(item, f, ensure_ascii=False, indent=4) - message = pb.update(collection_name='insights', id=insight_id, body={'articles': article_ids}) + message = pb.update(collection_name='agents', id=insight_id, body={'articles': article_ids}) if message: logger.debug(f'insight search success finish and update to: {message}') return self.build_out(11, insight_id) diff --git a/core/backend.py b/dashboard/backend.py similarity index 76% rename from core/backend.py rename to dashboard/backend.py index 298d55d..8eb3105 100644 --- a/core/backend.py +++ b/dashboard/backend.py @@ -2,9 +2,12 @@ from fastapi import FastAPI, BackgroundTasks from pydantic import BaseModel from typing import Literal, Optional from fastapi.middleware.cors import CORSMiddleware -from insights import message_manager +# backend的操作也应该是针对 pb 操作的,即添加信源、兴趣点等都应该存入 pb,而不是另起一个进程实例 +# 当然也可以放弃 pb,但那是另一个问题,数据和设置的管理应该是一套 +# 简单说用户侧(api dashboard等)和 core侧 不应该直接对接,都应该通过底层的data infrastructure 进行 + class Request(BaseModel): """ Input model diff --git a/env_sample b/env_sample index db77214..23313d8 100755 --- a/env_sample +++ b/env_sample @@ -1,10 +1,10 @@ export LLM_API_KEY="" -export LLM_API_BASE="https://api.siliconflow.cn/v1" ##for local model services or calling non-OpenAI services with openai_wrapper -##strongly recommended to use the following model provided by siliconflow (consider both effect and price) -export GET_INFO_MODEL="THUDM/glm-4-9b-chat" ## -export REWRITE_MODEL="Qwen/Qwen2-7B-Instruct" -export HTML_PARSE_MODEL="aQwen/Qwen2-7B-Instruct" +export LLM_API_BASE="https://api.siliconflow.cn/v1" +export PB_API_AUTH="test@example.com|1234567890" ##your pb superuser account and password + +##belowing is optional, go as you need +#export VERBOSE="true" ##for detail log info. If not need, remove this item. 
+#export PRIMARY_MODEL="Qwen/Qwen2.5-14B-Instruct" +#export SECONDARY_MODEL="THUDM/glm-4-9b-chat" export PROJECT_DIR="work_dir" -export PB_API_AUTH="test@example.com|1234567890" -# export "PB_API_BASE"="" ##only use if your pb not run on 127.0.0.1:8090 -export WS_LOG="verbose" ##for detail log info. If not need, just delete this item. \ No newline at end of file +#export PB_API_BASE="" ##only use if your pb not run on 127.0.0.1:8090 \ No newline at end of file diff --git a/core/pb/LICENSE.md b/pb/LICENSE.md similarity index 100% rename from core/pb/LICENSE.md rename to pb/LICENSE.md diff --git a/pb/README.md b/pb/README.md new file mode 100755 index 0000000..4cdf40c --- /dev/null +++ b/pb/README.md @@ -0,0 +1,9 @@ +download https://github.com/pocketbase/pocketbase/releases/download/v0.23.4/ + +```bash +cd pb +xattr -d com.apple.quarantine pocketbase # for Macos +./pocketbase migrate up # for first run +./pocketbase --dev admin create test@example.com 1234567890 # If you don't have an initial account, please use this command to create it +./pocketbase serve +``` \ No newline at end of file diff --git a/pb/pb_migrations/1733234529_created_focus_points.js b/pb/pb_migrations/1733234529_created_focus_points.js new file mode 100644 index 0000000..eb6b8c2 --- /dev/null +++ b/pb/pb_migrations/1733234529_created_focus_points.js @@ -0,0 +1,94 @@ +/// +migrate((app) => { + const collection = new Collection({ + "createRule": null, + "deleteRule": null, + "fields": [ + { + "autogeneratePattern": "[a-z0-9]{15}", + "hidden": false, + "id": "text3208210256", + "max": 15, + "min": 15, + "name": "id", + "pattern": "^[a-z0-9]+$", + "presentable": false, + "primaryKey": true, + "required": true, + "system": true, + "type": "text" + }, + { + "autogeneratePattern": "", + "hidden": false, + "id": "text2695655862", + "max": 0, + "min": 0, + "name": "focuspoint", + "pattern": "", + "presentable": false, + "primaryKey": false, + "required": false, + "system": false, + "type": "text" + }, + { + "autogeneratePattern": "", + "hidden": false, + "id": "text2284106510", + "max": 0, + "min": 0, + "name": "explanation", + "pattern": "", + "presentable": false, + "primaryKey": false, + "required": false, + "system": false, + "type": "text" + }, + { + "hidden": false, + "id": "bool806155165", + "name": "activated", + "presentable": false, + "required": false, + "system": false, + "type": "bool" + }, + { + "hidden": false, + "id": "autodate2990389176", + "name": "created", + "onCreate": true, + "onUpdate": false, + "presentable": false, + "system": false, + "type": "autodate" + }, + { + "hidden": false, + "id": "autodate3332085495", + "name": "updated", + "onCreate": true, + "onUpdate": true, + "presentable": false, + "system": false, + "type": "autodate" + } + ], + "id": "pbc_3385864241", + "indexes": [], + "listRule": null, + "name": "focus_points", + "system": false, + "type": "base", + "updateRule": null, + "viewRule": null + }); + + return app.save(collection); +}, (app) => { + const collection = app.findCollectionByNameOrId("pbc_3385864241"); + + return app.delete(collection); +}) diff --git a/pb/pb_migrations/1733234644_updated_focus_points.js b/pb/pb_migrations/1733234644_updated_focus_points.js new file mode 100644 index 0000000..1fce9b3 --- /dev/null +++ b/pb/pb_migrations/1733234644_updated_focus_points.js @@ -0,0 +1,42 @@ +/// +migrate((app) => { + const collection = app.findCollectionByNameOrId("pbc_3385864241") + + // update field + collection.fields.addAt(1, new Field({ + "autogeneratePattern": "", + "hidden": 
false, + "id": "text2695655862", + "max": 0, + "min": 0, + "name": "focuspoint", + "pattern": "", + "presentable": false, + "primaryKey": false, + "required": true, + "system": false, + "type": "text" + })) + + return app.save(collection) +}, (app) => { + const collection = app.findCollectionByNameOrId("pbc_3385864241") + + // update field + collection.fields.addAt(1, new Field({ + "autogeneratePattern": "", + "hidden": false, + "id": "text2695655862", + "max": 0, + "min": 0, + "name": "focuspoint", + "pattern": "", + "presentable": false, + "primaryKey": false, + "required": false, + "system": false, + "type": "text" + })) + + return app.save(collection) +}) diff --git a/pb/pb_migrations/1733465276_created_sites.js b/pb/pb_migrations/1733465276_created_sites.js new file mode 100644 index 0000000..164ff9b --- /dev/null +++ b/pb/pb_migrations/1733465276_created_sites.js @@ -0,0 +1,89 @@ +/// +migrate((app) => { + const collection = new Collection({ + "createRule": null, + "deleteRule": null, + "fields": [ + { + "autogeneratePattern": "[a-z0-9]{15}", + "hidden": false, + "id": "text3208210256", + "max": 15, + "min": 15, + "name": "id", + "pattern": "^[a-z0-9]+$", + "presentable": false, + "primaryKey": true, + "required": true, + "system": true, + "type": "text" + }, + { + "exceptDomains": [], + "hidden": false, + "id": "url4101391790", + "name": "url", + "onlyDomains": [], + "presentable": false, + "required": true, + "system": false, + "type": "url" + }, + { + "hidden": false, + "id": "number1152796692", + "max": null, + "min": null, + "name": "per_hours", + "onlyInt": false, + "presentable": false, + "required": false, + "system": false, + "type": "number" + }, + { + "hidden": false, + "id": "bool806155165", + "name": "activated", + "presentable": false, + "required": false, + "system": false, + "type": "bool" + }, + { + "hidden": false, + "id": "autodate2990389176", + "name": "created", + "onCreate": true, + "onUpdate": false, + "presentable": false, + "system": false, + "type": "autodate" + }, + { + "hidden": false, + "id": "autodate3332085495", + "name": "updated", + "onCreate": true, + "onUpdate": true, + "presentable": false, + "system": false, + "type": "autodate" + } + ], + "id": "pbc_2001081480", + "indexes": [], + "listRule": null, + "name": "sites", + "system": false, + "type": "base", + "updateRule": null, + "viewRule": null + }); + + return app.save(collection); +}, (app) => { + const collection = app.findCollectionByNameOrId("pbc_2001081480"); + + return app.delete(collection); +}) diff --git a/pb/pb_migrations/1733465563_created_infos.js b/pb/pb_migrations/1733465563_created_infos.js new file mode 100644 index 0000000..4ef607a --- /dev/null +++ b/pb/pb_migrations/1733465563_created_infos.js @@ -0,0 +1,98 @@ +/// +migrate((app) => { + const collection = new Collection({ + "createRule": null, + "deleteRule": null, + "fields": [ + { + "autogeneratePattern": "[a-z0-9]{15}", + "hidden": false, + "id": "text3208210256", + "max": 15, + "min": 15, + "name": "id", + "pattern": "^[a-z0-9]+$", + "presentable": false, + "primaryKey": true, + "required": true, + "system": true, + "type": "text" + }, + { + "autogeneratePattern": "", + "hidden": false, + "id": "text4274335913", + "max": 0, + "min": 0, + "name": "content", + "pattern": "", + "presentable": false, + "primaryKey": false, + "required": true, + "system": false, + "type": "text" + }, + { + "cascadeDelete": false, + "collectionId": "pbc_3385864241", + "hidden": false, + "id": "relation59357059", + "maxSelect": 1, + "minSelect": 
0, + "name": "tag", + "presentable": false, + "required": false, + "system": false, + "type": "relation" + }, + { + "hidden": false, + "id": "file3291445124", + "maxSelect": 1, + "maxSize": 0, + "mimeTypes": [], + "name": "report", + "presentable": false, + "protected": false, + "required": false, + "system": false, + "thumbs": [], + "type": "file" + }, + { + "hidden": false, + "id": "autodate2990389176", + "name": "created", + "onCreate": true, + "onUpdate": false, + "presentable": false, + "system": false, + "type": "autodate" + }, + { + "hidden": false, + "id": "autodate3332085495", + "name": "updated", + "onCreate": true, + "onUpdate": true, + "presentable": false, + "system": false, + "type": "autodate" + } + ], + "id": "pbc_629947526", + "indexes": [], + "listRule": null, + "name": "infos", + "system": false, + "type": "base", + "updateRule": null, + "viewRule": null + }); + + return app.save(collection); +}, (app) => { + const collection = app.findCollectionByNameOrId("pbc_629947526"); + + return app.delete(collection); +}) diff --git a/pb/pb_migrations/1733753289_updated_infos.js b/pb/pb_migrations/1733753289_updated_infos.js new file mode 100644 index 0000000..2e98dbb --- /dev/null +++ b/pb/pb_migrations/1733753289_updated_infos.js @@ -0,0 +1,39 @@ +/// +migrate((app) => { + const collection = app.findCollectionByNameOrId("pbc_629947526") + + // add field + collection.fields.addAt(4, new Field({ + "exceptDomains": [], + "hidden": false, + "id": "url4101391790", + "name": "url", + "onlyDomains": [], + "presentable": false, + "required": true, + "system": false, + "type": "url" + })) + + // add field + collection.fields.addAt(5, new Field({ + "hidden": false, + "id": "file1486429761", + "maxSelect": 1, + "maxSize": 0, + "mimeTypes": [], + "name": "screenshot", + "presentable": false, + "protected": false, + "required": false, + "system": false, + "thumbs": [], + "type": "file" + })) + + return app.save(collection) +}, (app) => { + const collection = app.findCollectionByNameOrId("pbc_629947526") + + return app.save(collection) +}) diff --git a/pb/pb_migrations/1733753354_updated_focus_points.js b/pb/pb_migrations/1733753354_updated_focus_points.js new file mode 100644 index 0000000..9f64fc8 --- /dev/null +++ b/pb/pb_migrations/1733753354_updated_focus_points.js @@ -0,0 +1,42 @@ +/// +migrate((app) => { + const collection = app.findCollectionByNameOrId("pbc_3385864241") + + // update field + collection.fields.addAt(1, new Field({ + "autogeneratePattern": "", + "hidden": false, + "id": "text2695655862", + "max": 0, + "min": 0, + "name": "focuspoint", + "pattern": "", + "presentable": true, + "primaryKey": false, + "required": true, + "system": false, + "type": "text" + })) + + return app.save(collection) +}, (app) => { + const collection = app.findCollectionByNameOrId("pbc_3385864241") + + // update field + collection.fields.addAt(1, new Field({ + "autogeneratePattern": "", + "hidden": false, + "id": "text2695655862", + "max": 0, + "min": 0, + "name": "focuspoint", + "pattern": "", + "presentable": false, + "primaryKey": false, + "required": true, + "system": false, + "type": "text" + })) + + return app.save(collection) +}) diff --git a/version b/version index 937cd78..03f2afa 100644 --- a/version +++ b/version @@ -1 +1 @@ -v0.3.1 +v0.3.5