diff --git a/README.md b/README.md
index 16cc5ebeb..52caafc30 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@
# SurfSense
-While tools like NotebookLM and Perplexity are impressive and highly effective for conducting research on any topic/query, SurfSense elevates this capability by integrating with your personal knowledge base. It is a highly customizable AI research agent, connected to external sources such as Search Engines (Tavily, LinkUp), Slack, Linear, Jira, ClickUp, Confluence, Gmail, Notion, YouTube, GitHub, Discord, Airtable, Google Calendar, Luma and more to come.
+While tools like NotebookLM and Perplexity are impressive and highly effective for conducting research on any topic/query, SurfSense elevates this capability by integrating with your personal knowledge base. It is a highly customizable AI research agent, connected to external sources such as Search Engines (SearxNG, Tavily, LinkUp), Slack, Linear, Jira, ClickUp, Confluence, Gmail, Notion, YouTube, GitHub, Discord, Airtable, Google Calendar, Luma and more to come.
@@ -62,6 +62,7 @@ Open source and easy to deploy locally.
### ℹ️ **External Sources**
- Search Engines (Tavily, LinkUp)
+- SearxNG (self-hosted instances)
- Slack
- Linear
- Jira
@@ -304,4 +305,3 @@ For detailed contribution guidelines, please see our [CONTRIBUTING.md](CONTRIBUT
---
---
-
diff --git a/docs/chinese-llm-setup.md b/docs/chinese-llm-setup.md
new file mode 100644
index 000000000..2a184608f
--- /dev/null
+++ b/docs/chinese-llm-setup.md
@@ -0,0 +1,323 @@
+# Chinese LLM Setup Guide
+
+This guide helps you configure and use Chinese LLM providers in SurfSense.
+
+---
+
+## 📋 Supported Providers
+
+SurfSense now supports the following Chinese LLM providers:
+
+- ✅ **DeepSeek** - high-performance models from DeepSeek AI
+- ✅ **Alibaba Qwen** - Tongyi Qianwen models on Alibaba Cloud
+- ✅ **Moonshot Kimi** - Kimi models from Moonshot AI
+- ✅ **Zhipu GLM** - GLM-series models from Zhipu AI
+
+---
+
+## 🚀 Quick Start
+
+### General Configuration Steps
+
+1. Log in to the SurfSense Dashboard
+2. Go to **Settings** → **API Keys** (or **LLM Configurations**)
+3. Click **Add New Configuration**
+4. Select your Chinese LLM provider from the **Provider** dropdown
+5. Fill in the required fields (see the per-provider details below)
+6. Click **Save**
+
+---
+
+## 1️⃣ DeepSeek Configuration
+
+### Getting an API Key
+
+1. Visit the [DeepSeek Open Platform](https://platform.deepseek.com/)
+2. Register and log in
+3. Open the **API Keys** page
+4. Click **Create New API Key**
+5. Copy the generated API key (format: `sk-xxx`)
+
+### Configuration in SurfSense
+
+| Field | Value | Notes |
+|------|-----|------|
+| **Configuration Name** | `DeepSeek Chat` | Configuration name (your choice) |
+| **Provider** | `DEEPSEEK` | Select DeepSeek |
+| **Model Name** | `deepseek-chat` | Recommended model; other option: `deepseek-coder` |
+| **API Key** | `sk-xxx...` | Your DeepSeek API key |
+| **API Base URL** | `https://api.deepseek.com` | DeepSeek API endpoint |
+| **Parameters** | _(leave empty)_ | Use default parameters |
+
+### Example Configuration
+
+```
+Configuration Name: DeepSeek Chat
+Provider: DEEPSEEK
+Model Name: deepseek-chat
+API Key: sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
+API Base URL: https://api.deepseek.com
+```
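+
+If you want to sanity-check the key and endpoint outside SurfSense, a minimal LiteLLM call against the same OpenAI-compatible API might look like the sketch below (illustrative only; SurfSense's internal routing may differ slightly, and the environment variable name is just an example):
+
+```python
+import os
+
+import litellm
+
+# DeepSeek exposes an OpenAI-compatible API, so LiteLLM can reach it via the
+# "openai/" route with a custom api_base. Keep the key in an environment variable.
+response = litellm.completion(
+    model="openai/deepseek-chat",
+    api_base="https://api.deepseek.com",
+    api_key=os.environ["DEEPSEEK_API_KEY"],  # hypothetical variable name
+    messages=[{"role": "user", "content": "Reply with one short sentence."}],
+)
+print(response.choices[0].message.content)
+```
+
+The same pattern applies to the other OpenAI-compatible providers in this guide; only the model name, base URL, and key change.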
+
+### Available Models
+
+- **deepseek-chat**: general-purpose chat model (recommended)
+- **deepseek-coder**: code-focused model
+
+### Pricing
+- See the [DeepSeek pricing page](https://platform.deepseek.com/pricing) for current prices
+
+---
+
+## 2️⃣ Alibaba Qwen Configuration
+
+### Getting an API Key
+
+1. Visit the [Alibaba Cloud Model Studio (DashScope)](https://dashscope.aliyun.com/)
+2. Log in with your Alibaba Cloud account
+3. Enable the DashScope service
+4. Open **API-KEY Management**
+5. Create and copy an API key
+
+### Configuration in SurfSense
+
+| Field | Value | Notes |
+|------|-----|------|
+| **Configuration Name** | `Qwen Max` | Configuration name (your choice) |
+| **Provider** | `ALIBABA_QWEN` | Select Alibaba Qwen |
+| **Model Name** | `qwen-max` | Recommended model; other options: `qwen-plus`, `qwen-turbo` |
+| **API Key** | `sk-xxx...` | Your DashScope API key |
+| **API Base URL** | `https://dashscope.aliyuncs.com/compatible-mode/v1` | Alibaba Cloud API endpoint |
+| **Parameters** | _(leave empty)_ | Use default parameters |
+
+### Example Configuration
+
+```
+Configuration Name: Qwen Max
+Provider: ALIBABA_QWEN
+Model Name: qwen-max
+API Key: sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
+API Base URL: https://dashscope.aliyuncs.com/compatible-mode/v1
+```
+
+### Available Models
+
+- **qwen-max**: strongest performance, suited to complex tasks
+- **qwen-plus**: good balance of cost and performance for everyday use (recommended)
+- **qwen-turbo**: fastest, suited to simple tasks
+
+### Pricing
+- See [Alibaba Cloud Model Studio billing](https://help.aliyun.com/zh/model-studio/getting-started/billing) for current prices
+
+---
+
+## 3️⃣ Moonshot Kimi Configuration
+
+### Getting an API Key
+
+1. Visit the [Moonshot AI Open Platform](https://platform.moonshot.cn/)
+2. Register and log in
+3. Open **API Key Management**
+4. Create a new API key
+5. Copy the API key
+
+### Configuration in SurfSense
+
+| Field | Value | Notes |
+|------|-----|------|
+| **Configuration Name** | `Kimi` | Configuration name (your choice) |
+| **Provider** | `MOONSHOT` | Select Moonshot Kimi |
+| **Model Name** | `moonshot-v1-32k` | Recommended model; other options: `moonshot-v1-8k`, `moonshot-v1-128k` |
+| **API Key** | `sk-xxx...` | Your Moonshot API key |
+| **API Base URL** | `https://api.moonshot.cn/v1` | Moonshot API endpoint |
+| **Parameters** | _(leave empty)_ | Use default parameters |
+
+### Example Configuration
+
+```
+Configuration Name: Kimi 32K
+Provider: MOONSHOT
+Model Name: moonshot-v1-32k
+API Key: sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
+API Base URL: https://api.moonshot.cn/v1
+```
+
+### Available Models
+
+- **moonshot-v1-8k**: 8K context (base)
+- **moonshot-v1-32k**: 32K context (recommended)
+- **moonshot-v1-128k**: 128K context (for long documents)
+
+### Pricing
+- See [Moonshot AI pricing](https://platform.moonshot.cn/pricing) for current prices
+
+---
+
+## 4️⃣ Zhipu GLM Configuration
+
+### Getting an API Key
+
+1. Visit the [Zhipu AI Open Platform](https://open.bigmodel.cn/)
+2. Register and log in
+3. Open **API Management**
+4. Create a new API key
+5. Copy the API key
+
+### Configuration in SurfSense
+
+| Field | Value | Notes |
+|------|-----|------|
+| **Configuration Name** | `GLM-4` | Configuration name (your choice) |
+| **Provider** | `ZHIPU` | Select Zhipu AI |
+| **Model Name** | `glm-4` | Recommended model; other options: `glm-4-flash`, `glm-3-turbo` |
+| **API Key** | `xxx.yyy...` | Your Zhipu API key |
+| **API Base URL** | `https://open.bigmodel.cn/api/paas/v4` | Zhipu API endpoint |
+| **Parameters** | _(leave empty)_ | Use default parameters |
+
+### Example Configuration
+
+```
+Configuration Name: GLM-4
+Provider: ZHIPU
+Model Name: glm-4
+API Key: xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.xxxxxxxxxxxxxxxx
+API Base URL: https://open.bigmodel.cn/api/paas/v4
+```
+
+### Available Models
+
+- **glm-4**: flagship GLM-4 model (recommended)
+- **glm-4-flash**: fast-inference variant
+- **glm-3-turbo**: cost-effective variant
+
+### Pricing
+- See [Zhipu AI pricing](https://open.bigmodel.cn/pricing) for current prices
+
+---
+
+## ⚙️ Advanced Configuration
+
+### Custom Parameters
+
+You can add custom parameters to the **Parameters** field as JSON:
+
+```json
+{
+ "temperature": 0.7,
+ "max_tokens": 2000,
+ "top_p": 0.9
+}
+```
+
+### Common Parameters
+
+| Parameter | Description | Default | Range |
+|------|------|--------|------|
+| `temperature` | Controls output randomness; higher values are more random | 0.7 | 0.0 - 1.0 |
+| `max_tokens` | Maximum number of output tokens | Model default | 1 - model limit |
+| `top_p` | Nucleus sampling parameter | 1.0 | 0.0 - 1.0 |
+
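+These values are passed through to LiteLLM as additional request parameters. Conceptually, the JSON above ends up as extra keyword arguments on the completion call, roughly like the sketch below (illustrative, not the exact SurfSense code path):
+
+```python
+import litellm
+
+# Parameters entered in the UI (stored as additional LiteLLM parameters on the config).
+litellm_params = {"temperature": 0.7, "max_tokens": 2000, "top_p": 0.9}
+
+response = litellm.completion(
+    model="openai/qwen-plus",  # example model; any configured provider works the same way
+    api_base="https://dashscope.aliyuncs.com/compatible-mode/v1",
+    api_key="sk-...",
+    messages=[{"role": "user", "content": "Summarize this document."}],
+    **litellm_params,  # temperature, max_tokens and top_p applied per request
+)
+```
+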
+---
+
+## 🔧 Troubleshooting
+
+### Common Issues
+
+#### 1. **Error: "Invalid API Key"**
+- ✅ Check that the API key was copied correctly (no extra whitespace)
+- ✅ Confirm the API key has been activated
+- ✅ Check that your account has sufficient balance
+
+#### 2. **Error: "Connection timeout"**
+- ✅ Confirm the API Base URL is correct
+- ✅ Check your network connection
+- ✅ Confirm your firewall allows outbound access
+
+#### 3. **Error: "Model not found"**
+- ✅ Confirm the model name is spelled correctly
+- ✅ Check that the model is enabled for your account
+- ✅ Check the available model names in the sections above
+
+#### 4. **Document processing stuck (IN_PROGRESS)**
+- ✅ Check the model name for stray whitespace
+- ✅ Confirm the API key is valid and has remaining quota
+- ✅ Inspect the backend logs: `docker compose logs backend`
+
+### Viewing Logs
+
+```bash
+# View the backend logs
+docker compose logs backend --tail 100
+
+# Follow the logs in real time
+docker compose logs -f backend
+
+# Search for errors
+docker compose logs backend | grep -i "error"
+```
+
+---
+
+## 💡 Best Practices
+
+### 1. Model Selection Suggestions
+
+| Task Type | Recommended Models | Notes |
+|---------|---------|------|
+| **Document summarization** | Qwen-Plus, GLM-4 | Balance of performance and cost |
+| **Code analysis** | DeepSeek-Coder | Code-focused |
+| **Long-document processing** | Kimi 128K | Very long context |
+| **Fast responses** | Qwen-Turbo, GLM-4-Flash | Speed first |
+
+### 2. Cost Optimization
+
+- 🎯 **Long Context LLM**: use Qwen-Plus or GLM-4 (document summarization)
+- ⚡ **Fast LLM**: use Qwen-Turbo or GLM-4-Flash (quick conversations)
+- 🧠 **Strategic LLM**: use Qwen-Max or DeepSeek-Chat (complex reasoning)
+
+### 3. API Key Security
+
+- ❌ Do not hard-code API keys in public code
+- ✅ Rotate API keys regularly
+- ✅ Create separate keys for different purposes
+- ✅ Set sensible usage limits
+
+---
+
+## 📚 Resources
+
+### Official Documentation
+
+- [DeepSeek documentation](https://platform.deepseek.com/docs)
+- [Alibaba Cloud Model Studio documentation](https://help.aliyun.com/zh/model-studio/)
+- [Moonshot AI documentation](https://platform.moonshot.cn/docs)
+- [Zhipu AI documentation](https://open.bigmodel.cn/dev/api)
+
+### SurfSense Documentation
+
+- [Installation guide](../README.md)
+- [Contribution guide](../CONTRIBUTING.md)
+- [Deployment guide](../DEPLOYMENT_GUIDE.md)
+
+---
+
+## 🆘 Need Help?
+
+If you run into problems, you can get help through the following channels:
+
+- 💬 [GitHub Issues](https://github.com/MODSetter/SurfSense/issues)
+- 💬 [Discord Community](https://discord.gg/ejRNvftDp9)
+- 📧 Email: [project maintainer email]
+
+---
+
+## 🔄 Changelog
+
+- **2025-01-12**: Initial version with DeepSeek, Qwen, Kimi, and GLM support
+
+---
+
+**Happy coding with Chinese LLMs! 🚀**
+
diff --git a/surfsense_backend/.env.example b/surfsense_backend/.env.example
index 1f2b89795..ad6c9d0c9 100644
--- a/surfsense_backend/.env.example
+++ b/surfsense_backend/.env.example
@@ -31,12 +31,13 @@ TTS_SERVICE_API_KEY=
# OPTIONAL: TTS Provider API Base
TTS_SERVICE_API_BASE=
-# LiteLLM STT Provider: https://docs.litellm.ai/docs/audio_transcription#supported-providers
-STT_SERVICE=openai/whisper-1
-# Respective STT Service API
-STT_SERVICE_API_KEY=""
-# OPTIONAL: STT Provider API Base
-STT_SERVICE_API_BASE=
+# STT Service Configuration
+# For local Faster-Whisper: local/MODEL_SIZE (tiny, base, small, medium, large-v3)
+STT_SERVICE=local/base
+# For LiteLLM STT Provider: https://docs.litellm.ai/docs/audio_transcription#supported-providers
+# STT_SERVICE=openai/whisper-1
+# STT_SERVICE_API_KEY=""
+# STT_SERVICE_API_BASE=
FIRECRAWL_API_KEY=fcr-01J0000000000000000000000
diff --git a/surfsense_backend/alembic/env.py b/surfsense_backend/alembic/env.py
index fd9740ee2..bd8c20356 100644
--- a/surfsense_backend/alembic/env.py
+++ b/surfsense_backend/alembic/env.py
@@ -20,6 +20,11 @@ from app.db import Base # Assuming your Base is defined in app.db
# access to the values within the .ini file in use.
config = context.config
+# Override SQLAlchemy URL from environment variables when available
+database_url = os.getenv("DATABASE_URL")
+if database_url:
+ config.set_main_option("sqlalchemy.url", database_url)
+
# Interpret the config file for Python logging.
# This line sets up loggers basically.
if config.config_file_name is not None:
diff --git a/surfsense_backend/alembic/versions/23_associate_connectors_with_search_spaces.py b/surfsense_backend/alembic/versions/23_associate_connectors_with_search_spaces.py
index 20e9d7840..a693b9ec5 100644
--- a/surfsense_backend/alembic/versions/23_associate_connectors_with_search_spaces.py
+++ b/surfsense_backend/alembic/versions/23_associate_connectors_with_search_spaces.py
@@ -2,7 +2,6 @@
Revision ID: '23'
Revises: '22'
-Create Date: 2025-01-10 12:00:00.000000
"""
diff --git a/surfsense_backend/alembic/versions/24_fix_null_chat_types.py b/surfsense_backend/alembic/versions/24_fix_null_chat_types.py
index 35313d27b..e0d371f1e 100644
--- a/surfsense_backend/alembic/versions/24_fix_null_chat_types.py
+++ b/surfsense_backend/alembic/versions/24_fix_null_chat_types.py
@@ -2,7 +2,6 @@
Revision ID: 24
Revises: 23
-Create Date: 2025-01-10 14:00:00.000000
"""
diff --git a/surfsense_backend/alembic/versions/25_migrate_llm_configs_to_search_spaces.py b/surfsense_backend/alembic/versions/25_migrate_llm_configs_to_search_spaces.py
index 116a3c687..c9966599c 100644
--- a/surfsense_backend/alembic/versions/25_migrate_llm_configs_to_search_spaces.py
+++ b/surfsense_backend/alembic/versions/25_migrate_llm_configs_to_search_spaces.py
@@ -2,7 +2,6 @@
Revision ID: 25
Revises: 24
-Create Date: 2025-01-10 14:00:00.000000
Changes:
1. Migrate llm_configs from user association to search_space association
diff --git a/surfsense_backend/alembic/versions/26_add_language_column_to_llm_configs.py b/surfsense_backend/alembic/versions/26_add_language_column_to_llm_configs.py
new file mode 100644
index 000000000..e5cdc37d7
--- /dev/null
+++ b/surfsense_backend/alembic/versions/26_add_language_column_to_llm_configs.py
@@ -0,0 +1,69 @@
+"""Add language column to llm_configs
+
+Revision ID: 26
+Revises: 25
+
+Changes:
+1. Add language column to llm_configs table with default value of 'English'
+"""
+
+from collections.abc import Sequence
+
+import sqlalchemy as sa
+
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision: str = "26"
+down_revision: str | None = "25"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+def upgrade() -> None:
+ """Add language column to llm_configs table."""
+
+ from sqlalchemy import inspect
+
+ conn = op.get_bind()
+ inspector = inspect(conn)
+
+ # Get existing columns
+ llm_config_columns = [col["name"] for col in inspector.get_columns("llm_configs")]
+
+ # Add language column if it doesn't exist
+ if "language" not in llm_config_columns:
+ op.add_column(
+ "llm_configs",
+ sa.Column(
+ "language",
+ sa.String(length=50),
+ nullable=True,
+ server_default="English",
+ ),
+ )
+
+ # Update existing rows to have 'English' as default
+ op.execute(
+ """
+ UPDATE llm_configs
+ SET language = 'English'
+ WHERE language IS NULL
+ """
+ )
+
+
+def downgrade() -> None:
+ """Remove language column from llm_configs table."""
+
+ from sqlalchemy import inspect
+
+ conn = op.get_bind()
+ inspector = inspect(conn)
+
+ # Get existing columns
+ llm_config_columns = [col["name"] for col in inspector.get_columns("llm_configs")]
+
+ # Drop language column if it exists
+ if "language" in llm_config_columns:
+ op.drop_column("llm_configs", "language")
diff --git a/surfsense_backend/alembic/versions/27_add_searxng_connector_enum.py b/surfsense_backend/alembic/versions/27_add_searxng_connector_enum.py
new file mode 100644
index 000000000..7834a8671
--- /dev/null
+++ b/surfsense_backend/alembic/versions/27_add_searxng_connector_enum.py
@@ -0,0 +1,41 @@
+"""Add SearxNG connector enum value
+
+Revision ID: 27
+Revises: 26
+Create Date: 2025-01-18 00:00:00.000000
+
+"""
+
+from collections.abc import Sequence
+
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision: str = "27"
+down_revision: str | None = "26"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+def upgrade() -> None:
+ """Safely add SEARXNG_API to searchsourceconnectortype enum."""
+ op.execute(
+ """
+ DO $$
+ BEGIN
+ IF NOT EXISTS (
+ SELECT 1 FROM pg_type t
+ JOIN pg_enum e ON t.oid = e.enumtypid
+ WHERE t.typname = 'searchsourceconnectortype' AND e.enumlabel = 'SEARXNG_API'
+ ) THEN
+ ALTER TYPE searchsourceconnectortype ADD VALUE 'SEARXNG_API';
+ END IF;
+ END
+ $$;
+ """
+ )
+
+
+def downgrade() -> None:
+ """Downgrade not supported for enum edits."""
+ pass
diff --git a/surfsense_backend/alembic/versions/28_add_chinese_litellmprovider_enum.py b/surfsense_backend/alembic/versions/28_add_chinese_litellmprovider_enum.py
new file mode 100644
index 000000000..af36c2ea1
--- /dev/null
+++ b/surfsense_backend/alembic/versions/28_add_chinese_litellmprovider_enum.py
@@ -0,0 +1,107 @@
+"""Add Chinese LLM providers to LiteLLMProvider enum
+
+Revision ID: 28
+Revises: 27
+"""
+
+from collections.abc import Sequence
+
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision: str = "28"
+down_revision: str | None = "27"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+def upgrade() -> None:
+ """
+ Add Chinese LLM providers to LiteLLMProvider enum.
+
+ Adds support for:
+ - DEEPSEEK: DeepSeek AI models
+ - ALIBABA_QWEN: Alibaba Qwen models
+ - MOONSHOT: Moonshot AI models
+ - ZHIPU: Zhipu AI models
+ """
+
+ # Add DEEPSEEK to the enum if it doesn't already exist
+ op.execute(
+ """
+ DO $$
+ BEGIN
+ IF NOT EXISTS (
+ SELECT 1 FROM pg_enum
+ WHERE enumtypid = 'litellmprovider'::regtype
+ AND enumlabel = 'DEEPSEEK'
+ ) THEN
+ ALTER TYPE litellmprovider ADD VALUE 'DEEPSEEK';
+ END IF;
+ END$$;
+ """
+ )
+
+ # Add ALIBABA_QWEN to the enum if it doesn't already exist
+ op.execute(
+ """
+ DO $$
+ BEGIN
+ IF NOT EXISTS (
+ SELECT 1 FROM pg_enum
+ WHERE enumtypid = 'litellmprovider'::regtype
+ AND enumlabel = 'ALIBABA_QWEN'
+ ) THEN
+ ALTER TYPE litellmprovider ADD VALUE 'ALIBABA_QWEN';
+ END IF;
+ END$$;
+ """
+ )
+
+ # Add MOONSHOT to the enum if it doesn't already exist
+ op.execute(
+ """
+ DO $$
+ BEGIN
+ IF NOT EXISTS (
+ SELECT 1 FROM pg_enum
+ WHERE enumtypid = 'litellmprovider'::regtype
+ AND enumlabel = 'MOONSHOT'
+ ) THEN
+ ALTER TYPE litellmprovider ADD VALUE 'MOONSHOT';
+ END IF;
+ END$$;
+ """
+ )
+
+ # Add ZHIPU to the enum if it doesn't already exist
+ op.execute(
+ """
+ DO $$
+ BEGIN
+ IF NOT EXISTS (
+ SELECT 1 FROM pg_enum
+ WHERE enumtypid = 'litellmprovider'::regtype
+ AND enumlabel = 'ZHIPU'
+ ) THEN
+ ALTER TYPE litellmprovider ADD VALUE 'ZHIPU';
+ END IF;
+ END$$;
+ """
+ )
+
+
+def downgrade() -> None:
+ """
+ Remove Chinese LLM providers from LiteLLMProvider enum.
+
+ Note: PostgreSQL doesn't support removing enum values directly.
+ This would require recreating the enum type and updating all dependent objects.
+ For safety, this downgrade is a no-op.
+
+ """
+ # PostgreSQL doesn't support removing enum values directly
+ # This would require a complex migration recreating the enum
+ pass
diff --git a/surfsense_backend/alembic/versions/29_add_unique_identifier_hash_to_documents.py b/surfsense_backend/alembic/versions/29_add_unique_identifier_hash_to_documents.py
new file mode 100644
index 000000000..cf3486473
--- /dev/null
+++ b/surfsense_backend/alembic/versions/29_add_unique_identifier_hash_to_documents.py
@@ -0,0 +1,54 @@
+"""Add unique_identifier_hash column to documents table
+
+Revision ID: 29
+Revises: 28
+"""
+
+from collections.abc import Sequence
+
+import sqlalchemy as sa
+from sqlalchemy import inspect
+
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision: str = "29"
+down_revision: str | None = "28"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+def upgrade() -> None:
+ bind = op.get_bind()
+ inspector = inspect(bind)
+ columns = [col["name"] for col in inspector.get_columns("documents")]
+
+ # Only add the column if it doesn't already exist
+ if "unique_identifier_hash" not in columns:
+ op.add_column(
+ "documents",
+ sa.Column("unique_identifier_hash", sa.String(), nullable=True),
+ )
+ op.create_index(
+ op.f("ix_documents_unique_identifier_hash"),
+ "documents",
+ ["unique_identifier_hash"],
+ unique=False,
+ )
+ op.create_unique_constraint(
+ op.f("uq_documents_unique_identifier_hash"),
+ "documents",
+ ["unique_identifier_hash"],
+ )
+ else:
+ print(
+ "Column 'unique_identifier_hash' already exists. Skipping column creation."
+ )
+
+
+def downgrade() -> None:
+ op.drop_constraint(
+ op.f("uq_documents_unique_identifier_hash"), "documents", type_="unique"
+ )
+ op.drop_index(op.f("ix_documents_unique_identifier_hash"), table_name="documents")
+ op.drop_column("documents", "unique_identifier_hash")
diff --git a/surfsense_backend/app/agents/researcher/configuration.py b/surfsense_backend/app/agents/researcher/configuration.py
index 3e81a59c0..24d8c819e 100644
--- a/surfsense_backend/app/agents/researcher/configuration.py
+++ b/surfsense_backend/app/agents/researcher/configuration.py
@@ -37,6 +37,7 @@ class Configuration:
search_mode: SearchMode
research_mode: ResearchMode
document_ids_to_add_in_context: list[int]
+ language: str | None = None
@classmethod
def from_runnable_config(
diff --git a/surfsense_backend/app/agents/researcher/nodes.py b/surfsense_backend/app/agents/researcher/nodes.py
index c1ad4890f..39ffc1218 100644
--- a/surfsense_backend/app/agents/researcher/nodes.py
+++ b/surfsense_backend/app/agents/researcher/nodes.py
@@ -71,9 +71,7 @@ def extract_sources_from_documents(
source = {
"id": doc.get("chunk_id", source_id_counter),
"title": document_info.get("title", "Untitled Document"),
- "description": doc.get("content", "")[:100] + "..."
- if len(doc.get("content", "")) > 100
- else doc.get("content", ""),
+ "description": doc.get("content", "").strip(),
"url": metadata.get("url", metadata.get("page_url", "")),
}
@@ -204,11 +202,7 @@ async def fetch_documents_by_ids(
title += f" ({issue_state})"
# Create description
- description = (
- doc.content[:100] + "..."
- if len(doc.content) > 100
- else doc.content
- )
+ description = doc.content
if comment_count:
description += f" | Comments: {comment_count}"
@@ -229,11 +223,7 @@ async def fetch_documents_by_ids(
if message_date:
title += f" ({message_date})"
- description = (
- doc.content[:100] + "..."
- if len(doc.content) > 100
- else doc.content
- )
+ description = doc.content
url = (
f"https://slack.com/app_redirect?channel={channel_id}"
if channel_id
@@ -246,11 +236,7 @@ async def fetch_documents_by_ids(
page_id = metadata.get("page_id", "")
title = f"Notion: {page_title}"
- description = (
- doc.content[:100] + "..."
- if len(doc.content) > 100
- else doc.content
- )
+ description = doc.content
url = (
f"https://notion.so/{page_id.replace('-', '')}"
if page_id
@@ -261,11 +247,7 @@ async def fetch_documents_by_ids(
title = f"GitHub: {doc.title}"
description = metadata.get(
"description",
- (
- doc.content[:100] + "..."
- if len(doc.content) > 100
- else doc.content
- ),
+ (doc.content),
)
url = metadata.get("url", "")
@@ -281,11 +263,7 @@ async def fetch_documents_by_ids(
description = metadata.get(
"description",
- (
- doc.content[:100] + "..."
- if len(doc.content) > 100
- else doc.content
- ),
+ (doc.content),
)
url = (
f"https://www.youtube.com/watch?v={video_id}"
@@ -304,11 +282,7 @@ async def fetch_documents_by_ids(
if message_date:
title += f" ({message_date})"
- description = (
- doc.content[:100] + "..."
- if len(doc.content) > 100
- else doc.content
- )
+ description = doc.content
if guild_id and channel_id:
url = f"https://discord.com/channels/{guild_id}/{channel_id}"
@@ -329,11 +303,7 @@ async def fetch_documents_by_ids(
if status:
title += f" ({status})"
- description = (
- doc.content[:100] + "..."
- if len(doc.content) > 100
- else doc.content
- )
+ description = doc.content
if priority:
description += f" | Priority: {priority}"
if issue_type:
@@ -395,11 +365,7 @@ async def fetch_documents_by_ids(
except Exception:
pass
- description = (
- doc.content[:100] + "..."
- if len(doc.content) > 100
- else doc.content
- )
+ description = doc.content
if location:
description += f" | Location: {location}"
if calendar_id and calendar_id != "primary":
@@ -437,11 +403,8 @@ async def fetch_documents_by_ids(
except Exception:
pass
- description = (
- doc.content[:100] + "..."
- if len(doc.content) > 100
- else doc.content
- )
+ description = doc.content
+
if location_name:
description += f" | Venue: {location_name}"
elif meeting_url:
@@ -466,11 +429,7 @@ async def fetch_documents_by_ids(
)
title += f" (visited: {formatted_date})"
- description = (
- doc.content[:100] + "..."
- if len(doc.content) > 100
- else doc.content
- )
+ description = doc.content
url = webpage_url
elif doc_type == "CRAWLED_URL":
@@ -479,11 +438,7 @@ async def fetch_documents_by_ids(
"og:description",
metadata.get(
"ogDescription",
- (
- doc.content[:100] + "..."
- if len(doc.content) > 100
- else doc.content
- ),
+ (doc.content),
),
)
url = metadata.get("url", "")
@@ -509,11 +464,8 @@ async def fetch_documents_by_ids(
else: # FILE and other types
title = doc.title
- description = (
- doc.content[:100] + "..."
- if len(doc.content) > 100
- else doc.content
- )
+ description = doc.content
+
url = metadata.get("url", "")
# Create source entry
@@ -598,6 +550,7 @@ async def write_answer_outline(
num_sections = configuration.num_sections
user_id = configuration.user_id
search_space_id = configuration.search_space_id
+ language = configuration.language # Get language from configuration
writer(
{
@@ -648,7 +601,7 @@ async def write_answer_outline(
# Create messages for the LLM
messages = [
- SystemMessage(content=get_answer_outline_system_prompt()),
+ SystemMessage(content=get_answer_outline_system_prompt(language=language)),
HumanMessage(content=human_message_content),
]
@@ -1052,6 +1005,30 @@ async def fetch_relevant_documents(
}
)
+ elif connector == "SEARXNG_API":
+ (
+ source_object,
+ searx_chunks,
+ ) = await connector_service.search_searxng(
+ user_query=reformulated_query,
+ user_id=user_id,
+ search_space_id=search_space_id,
+ top_k=top_k,
+ )
+
+ if source_object:
+ all_sources.append(source_object)
+ all_raw_documents.extend(searx_chunks)
+
+ if streaming_service and writer:
+ writer(
+ {
+ "yield_value": streaming_service.format_terminal_info_delta(
+ f"🌐 Found {len(searx_chunks)} SearxNG results related to your query"
+ )
+ }
+ )
+
elif connector == "LINKUP_API":
linkup_mode = "standard"
@@ -2047,6 +2024,7 @@ async def handle_qna_workflow(
"relevant_documents": all_documents, # Use combined documents
"user_id": configuration.user_id,
"search_space_id": configuration.search_space_id,
+ "language": configuration.language,
}
}
diff --git a/surfsense_backend/app/agents/researcher/prompts.py b/surfsense_backend/app/agents/researcher/prompts.py
index 44b218913..825772a24 100644
--- a/surfsense_backend/app/agents/researcher/prompts.py
+++ b/surfsense_backend/app/agents/researcher/prompts.py
@@ -1,9 +1,18 @@
import datetime
-def get_answer_outline_system_prompt():
+def _build_language_instruction(language: str | None = None):
+ if language:
+ return f"\n\nIMPORTANT: Please respond in {language} language. All your responses, explanations, and analysis should be written in {language}."
+ return ""
+
+
+def get_answer_outline_system_prompt(language: str | None = None) -> str:
+ language_instruction = _build_language_instruction(language)
+
return f"""
Today's date: {datetime.datetime.now().strftime("%Y-%m-%d")}
+{language_instruction}
You are an expert research assistant specializing in structuring information. Your task is to create a detailed and logical research outline based on the user's query. This outline will serve as the blueprint for generating a comprehensive research report.
diff --git a/surfsense_backend/app/agents/researcher/qna_agent/configuration.py b/surfsense_backend/app/agents/researcher/qna_agent/configuration.py
index 5a4529e0d..ea107a575 100644
--- a/surfsense_backend/app/agents/researcher/qna_agent/configuration.py
+++ b/surfsense_backend/app/agents/researcher/qna_agent/configuration.py
@@ -20,6 +20,7 @@ class Configuration:
] # Documents provided directly to the agent for answering
user_id: str # User identifier
search_space_id: int # Search space identifier
+ language: str | None = None # Language for responses
@classmethod
def from_runnable_config(
diff --git a/surfsense_backend/app/agents/researcher/qna_agent/nodes.py b/surfsense_backend/app/agents/researcher/qna_agent/nodes.py
index fd6861efb..c4e79d685 100644
--- a/surfsense_backend/app/agents/researcher/qna_agent/nodes.py
+++ b/surfsense_backend/app/agents/researcher/qna_agent/nodes.py
@@ -102,7 +102,7 @@ async def answer_question(state: State, config: RunnableConfig) -> dict[str, Any
user_query = configuration.user_query
user_id = configuration.user_id
search_space_id = configuration.search_space_id
-
+ language = configuration.language
# Get user's fast LLM
llm = await get_user_fast_llm(state.db_session, user_id, search_space_id)
if not llm:
@@ -127,7 +127,9 @@ async def answer_question(state: State, config: RunnableConfig) -> dict[str, Any
"""
# Use initial system prompt for token calculation
- initial_system_prompt = get_qna_citation_system_prompt(chat_history_str)
+ initial_system_prompt = get_qna_citation_system_prompt(
+ chat_history_str, language
+ )
base_messages = [
SystemMessage(content=initial_system_prompt),
HumanMessage(content=base_human_message_template),
@@ -146,9 +148,9 @@ async def answer_question(state: State, config: RunnableConfig) -> dict[str, Any
# Choose system prompt based on final document availability
system_prompt = (
- get_qna_citation_system_prompt(chat_history_str)
+ get_qna_citation_system_prompt(chat_history_str, language)
if has_documents
- else get_qna_no_documents_system_prompt(chat_history_str)
+ else get_qna_no_documents_system_prompt(chat_history_str, language)
)
# Generate documents section
diff --git a/surfsense_backend/app/agents/researcher/qna_agent/prompts.py b/surfsense_backend/app/agents/researcher/qna_agent/prompts.py
index 212788804..9c35f90cc 100644
--- a/surfsense_backend/app/agents/researcher/qna_agent/prompts.py
+++ b/surfsense_backend/app/agents/researcher/qna_agent/prompts.py
@@ -1,7 +1,11 @@
import datetime
+from ..prompts import _build_language_instruction
-def get_qna_citation_system_prompt(chat_history: str | None = None):
+
+def get_qna_citation_system_prompt(
+ chat_history: str | None = None, language: str | None = None
+):
chat_history_section = (
f"""
@@ -16,9 +20,11 @@ NO CHAT HISTORY PROVIDED
"""
)
+ # Add language instruction if specified
+ language_instruction = _build_language_instruction(language)
return f"""
Today's date: {datetime.datetime.now().strftime("%Y-%m-%d")}
-You are SurfSense, an advanced AI research assistant that provides detailed, well-researched answers to user questions by synthesizing information from multiple personal knowledge sources.
+You are SurfSense, an advanced AI research assistant that provides detailed, well-researched answers to user questions by synthesizing information from multiple personal knowledge sources.{language_instruction}
{chat_history_section}
- EXTENSION: "Web content saved via SurfSense browser extension" (personal browsing history)
@@ -149,7 +155,9 @@ Make sure your response:
"""
-def get_qna_no_documents_system_prompt(chat_history: str | None = None):
+def get_qna_no_documents_system_prompt(
+ chat_history: str | None = None, language: str | None = None
+):
chat_history_section = (
f"""
@@ -164,9 +172,12 @@ NO CHAT HISTORY PROVIDED
"""
)
+ # Add language instruction if specified
+ language_instruction = _build_language_instruction(language)
+
return f"""
Today's date: {datetime.datetime.now().strftime("%Y-%m-%d")}
-You are SurfSense, an advanced AI research assistant that provides helpful, detailed answers to user questions in a conversational manner.
+You are SurfSense, an advanced AI research assistant that provides helpful, detailed answers to user questions in a conversational manner.{language_instruction}
{chat_history_section}
The user has asked a question but there are no specific documents from their personal knowledge base available to answer it. You should provide a helpful response based on:
diff --git a/surfsense_backend/app/agents/researcher/sub_section_writer/prompts.py b/surfsense_backend/app/agents/researcher/sub_section_writer/prompts.py
index c3d487671..3c34eb474 100644
--- a/surfsense_backend/app/agents/researcher/sub_section_writer/prompts.py
+++ b/surfsense_backend/app/agents/researcher/sub_section_writer/prompts.py
@@ -1,7 +1,11 @@
import datetime
+from ..prompts import _build_language_instruction
-def get_citation_system_prompt(chat_history: str | None = None):
+
+def get_citation_system_prompt(
+ chat_history: str | None = None, language: str | None = None
+):
chat_history_section = (
f"""
@@ -16,9 +20,12 @@ NO CHAT HISTORY PROVIDED
"""
)
+ # Add language instruction if specified
+ language_instruction = _build_language_instruction(language)
+
return f"""
Today's date: {datetime.datetime.now().strftime("%Y-%m-%d")}
-You are SurfSense, an advanced AI research assistant that synthesizes information from multiple knowledge sources to provide comprehensive, well-cited answers to user queries.
+You are SurfSense, an advanced AI research assistant that synthesizes information from multiple knowledge sources to provide comprehensive, well-cited answers to user queries.{language_instruction}
{chat_history_section}
- EXTENSION: "Web content saved via SurfSense browser extension" (personal browsing history)
@@ -156,7 +163,9 @@ Make sure your response:
"""
-def get_no_documents_system_prompt(chat_history: str | None = None):
+def get_no_documents_system_prompt(
+ chat_history: str | None = None, language: str | None = None
+):
chat_history_section = (
f"""
@@ -171,9 +180,12 @@ NO CHAT HISTORY PROVIDED
"""
)
+ # Add language instruction if specified
+ language_instruction = _build_language_instruction(language)
+
return f"""
Today's date: {datetime.datetime.now().strftime("%Y-%m-%d")}
-You are SurfSense, an advanced AI research assistant that helps users create well-structured content for their documents and research.
+You are SurfSense, an advanced AI research assistant that helps users create well-structured content for their documents and research.{language_instruction}
{chat_history_section}
You are writing content for a specific sub-section of a document. No specific documents from the user's personal knowledge base are available, so you should create content based on:
diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py
index 6a30839e9..acd1017e4 100644
--- a/surfsense_backend/app/config/__init__.py
+++ b/surfsense_backend/app/config/__init__.py
@@ -102,7 +102,7 @@ class Config:
TTS_SERVICE_API_BASE = os.getenv("TTS_SERVICE_API_BASE")
TTS_SERVICE_API_KEY = os.getenv("TTS_SERVICE_API_KEY")
- # Litellm STT Configuration
+ # STT Configuration
STT_SERVICE = os.getenv("STT_SERVICE")
STT_SERVICE_API_BASE = os.getenv("STT_SERVICE_API_BASE")
STT_SERVICE_API_KEY = os.getenv("STT_SERVICE_API_KEY")
diff --git a/surfsense_backend/app/connectors/slack_history.py b/surfsense_backend/app/connectors/slack_history.py
index 982dc8a67..36160c30b 100644
--- a/surfsense_backend/app/connectors/slack_history.py
+++ b/surfsense_backend/app/connectors/slack_history.py
@@ -379,43 +379,3 @@ class SlackHistory:
formatted["user_name"] = "Unknown"
return formatted
-
-
-# Example usage (uncomment to use):
-"""
-if __name__ == "__main__":
- # Set your token here or via environment variable
- token = os.environ.get("SLACK_API_TOKEN", "xoxb-your-token-here")
-
- slack = SlackHistory(token)
-
- # Get all channels
- try:
- channels = slack.get_all_channels()
- print("Available channels:")
- for name, channel_id in sorted(channels.items()):
- print(f"- {name}: {channel_id}")
-
- # Example: Get history for a specific channel and date range
- channel_id = channels.get("general")
- if channel_id:
- messages, error = slack.get_history_by_date_range(
- channel_id=channel_id,
- start_date="2023-01-01",
- end_date="2023-01-31",
- limit=500
- )
-
- if error:
- print(f"Error: {error}")
- else:
- print(f"\nRetrieved {len(messages)} messages from #general")
-
- # Print formatted messages
- for msg in messages[:10]: # Show first 10 messages
- formatted = slack.format_message(msg, include_user_info=True)
- print(f"[{formatted['datetime']}] {formatted['user_name']}: {formatted['text']}")
-
- except Exception as e:
- print(f"Error: {e}")
-"""
diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py
index 02d209d39..ee3a3e079 100644
--- a/surfsense_backend/app/db.py
+++ b/surfsense_backend/app/db.py
@@ -56,6 +56,7 @@ class DocumentType(str, Enum):
class SearchSourceConnectorType(str, Enum):
SERPER_API = "SERPER_API" # NOT IMPLEMENTED YET : DON'T REMEMBER WHY : MOST PROBABLY BECAUSE WE NEED TO CRAWL THE RESULTS RETURNED BY IT
TAVILY_API = "TAVILY_API"
+ SEARXNG_API = "SEARXNG_API"
LINKUP_API = "LINKUP_API"
SLACK_CONNECTOR = "SLACK_CONNECTOR"
NOTION_CONNECTOR = "NOTION_CONNECTOR"
@@ -80,6 +81,11 @@ class ChatType(str, Enum):
class LiteLLMProvider(str, Enum):
+ """
+ Enum for LLM providers supported by LiteLLM.
+ """
+
OPENAI = "OPENAI"
ANTHROPIC = "ANTHROPIC"
GROQ = "GROQ"
@@ -103,6 +109,11 @@ class LiteLLMProvider(str, Enum):
ALEPH_ALPHA = "ALEPH_ALPHA"
PETALS = "PETALS"
COMETAPI = "COMETAPI"
+ # Chinese LLM Providers (OpenAI-compatible)
+ DEEPSEEK = "DEEPSEEK"
+ ALIBABA_QWEN = "ALIBABA_QWEN"
+ MOONSHOT = "MOONSHOT"
+ ZHIPU = "ZHIPU"
CUSTOM = "CUSTOM"
@@ -165,6 +176,7 @@ class Document(BaseModel, TimestampMixin):
content = Column(Text, nullable=False)
content_hash = Column(String, nullable=False, index=True, unique=True)
+ unique_identifier_hash = Column(String, nullable=True, index=True, unique=True)
embedding = Column(Vector(config.embedding_model_instance.dimension))
search_space_id = Column(
@@ -298,6 +310,8 @@ class LLMConfig(BaseModel, TimestampMixin):
api_key = Column(String, nullable=False)
api_base = Column(String(500), nullable=True)
+ language = Column(String(50), nullable=True, default="English")
+
# For any other parameters that litellm supports
litellm_params = Column(JSON, nullable=True, default={})
diff --git a/surfsense_backend/app/routes/chats_routes.py b/surfsense_backend/app/routes/chats_routes.py
index e4d02686f..e003dc260 100644
--- a/surfsense_backend/app/routes/chats_routes.py
+++ b/surfsense_backend/app/routes/chats_routes.py
@@ -4,8 +4,9 @@ from langchain.schema import AIMessage, HumanMessage
from sqlalchemy.exc import IntegrityError, OperationalError
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
+from sqlalchemy.orm import selectinload
-from app.db import Chat, SearchSpace, User, get_async_session
+from app.db import Chat, SearchSpace, User, UserSearchSpacePreference, get_async_session
from app.schemas import (
AISDKChatRequest,
ChatCreate,
@@ -53,10 +54,51 @@ async def handle_chat_data(
request_data.get("document_ids_to_add_in_context")
)
search_mode_str = validate_search_mode(request_data.get("search_mode"))
+ # print("RESQUEST DATA:", request_data)
+ # print("SELECTED CONNECTORS:", selected_connectors)
# Check if the search space belongs to the current user
try:
await check_ownership(session, SearchSpace, search_space_id, user)
+ language_result = await session.execute(
+ select(UserSearchSpacePreference)
+ .options(
+ selectinload(UserSearchSpacePreference.search_space).selectinload(
+ SearchSpace.llm_configs
+ ),
+ selectinload(UserSearchSpacePreference.long_context_llm),
+ selectinload(UserSearchSpacePreference.fast_llm),
+ selectinload(UserSearchSpacePreference.strategic_llm),
+ )
+ .filter(
+ UserSearchSpacePreference.search_space_id == search_space_id,
+ UserSearchSpacePreference.user_id == user.id,
+ )
+ )
+ user_preference = language_result.scalars().first()
+ # print("UserSearchSpacePreference:", user_preference)
+
+ language = None
+ if (
+ user_preference
+ and user_preference.search_space
+ and user_preference.search_space.llm_configs
+ ):
+ llm_configs = user_preference.search_space.llm_configs
+
+ for preferred_llm in [
+ user_preference.fast_llm,
+ user_preference.long_context_llm,
+ user_preference.strategic_llm,
+ ]:
+ if preferred_llm and getattr(preferred_llm, "language", None):
+ language = preferred_llm.language
+ break
+
+ if not language:
+ first_llm_config = llm_configs[0]
+ language = getattr(first_llm_config, "language", None)
+
except HTTPException:
raise HTTPException(
status_code=403, detail="You don't have access to this search space"
@@ -80,6 +122,7 @@ async def handle_chat_data(
langchain_chat_history,
search_mode_str,
document_ids_to_add_in_context,
+ language,
)
)
diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py
index dd7b56033..08a352e75 100644
--- a/surfsense_backend/app/routes/documents_routes.py
+++ b/surfsense_backend/app/routes/documents_routes.py
@@ -784,25 +784,68 @@ async def process_file_in_background(
{"file_type": "audio", "processing_stage": "starting_transcription"},
)
- # Open the audio file for transcription
- with open(file_path, "rb") as audio_file:
- # Use LiteLLM for audio transcription
- if app_config.STT_SERVICE_API_BASE:
- transcription_response = await atranscription(
- model=app_config.STT_SERVICE,
- file=audio_file,
- api_base=app_config.STT_SERVICE_API_BASE,
- api_key=app_config.STT_SERVICE_API_KEY,
+ # Determine STT service type
+ stt_service_type = (
+ "local"
+ if app_config.STT_SERVICE
+ and app_config.STT_SERVICE.startswith("local/")
+ else "external"
+ )
+
+ # Check if using local STT service
+ if stt_service_type == "local":
+ # Use local Faster-Whisper for transcription
+ from app.services.stt_service import stt_service
+
+ try:
+ result = stt_service.transcribe_file(file_path)
+ transcribed_text = result.get("text", "")
+
+ if not transcribed_text:
+ raise ValueError("Transcription returned empty text")
+
+ # Add metadata about the transcription
+ transcribed_text = (
+ f"# Transcription of {filename}\n\n{transcribed_text}"
)
- else:
+ except Exception as e:
+ raise HTTPException(
+ status_code=422,
+ detail=f"Failed to transcribe audio file {filename}: {e!s}",
+ ) from e
+
+ await task_logger.log_task_progress(
+ log_entry,
+ f"Local STT transcription completed: {filename}",
+ {
+ "processing_stage": "local_transcription_complete",
+ "language": result.get("language"),
+ "confidence": result.get("language_probability"),
+ "duration": result.get("duration"),
+ },
+ )
+ else:
+ # Use LiteLLM for audio transcription
+ with open(file_path, "rb") as audio_file:
+ transcription_kwargs = {
+ "model": app_config.STT_SERVICE,
+ "file": audio_file,
+ "api_key": app_config.STT_SERVICE_API_KEY,
+ }
+ if app_config.STT_SERVICE_API_BASE:
+ transcription_kwargs["api_base"] = (
+ app_config.STT_SERVICE_API_BASE
+ )
+
transcription_response = await atranscription(
- model=app_config.STT_SERVICE,
- api_key=app_config.STT_SERVICE_API_KEY,
- file=audio_file,
+ **transcription_kwargs
)
- # Extract the transcribed text
- transcribed_text = transcription_response.get("text", "")
+ # Extract the transcribed text
+ transcribed_text = transcription_response.get("text", "")
+
+ if not transcribed_text:
+ raise ValueError("Transcription returned empty text")
# Add metadata about the transcription
transcribed_text = (
@@ -839,6 +882,7 @@ async def process_file_in_background(
"content_hash": result.content_hash,
"file_type": "audio",
"transcript_length": len(transcribed_text),
+ "stt_service": stt_service_type,
},
)
else:
@@ -1070,6 +1114,7 @@ async def process_file_in_background(
},
)
except Exception as e:
+ await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Failed to process file: {filename}",
diff --git a/surfsense_backend/app/routes/llm_config_routes.py b/surfsense_backend/app/routes/llm_config_routes.py
index 63d540d2c..ec8ea5846 100644
--- a/surfsense_backend/app/routes/llm_config_routes.py
+++ b/surfsense_backend/app/routes/llm_config_routes.py
@@ -300,6 +300,9 @@ async def update_user_llm_preferences(
# Validate that all provided LLM config IDs belong to the search space
update_data = preferences.model_dump(exclude_unset=True)
+ # Store language from configs to validate consistency
+ languages = set()
+
for _key, llm_config_id in update_data.items():
if llm_config_id is not None:
# Verify the LLM config belongs to the search space
@@ -316,6 +319,16 @@ async def update_user_llm_preferences(
detail=f"LLM configuration {llm_config_id} not found in this search space",
)
+ # Collect language for consistency check
+ languages.add(llm_config.language)
+
+ # Check if all selected LLM configs have the same language
+ if len(languages) > 1:
+ raise HTTPException(
+ status_code=400,
+ detail="All selected LLM configurations must have the same language setting",
+ )
+
# Update user preferences
for key, value in update_data.items():
setattr(preference, key, value)
diff --git a/surfsense_backend/app/schemas/llm_config.py b/surfsense_backend/app/schemas/llm_config.py
index 8beb65347..285c15665 100644
--- a/surfsense_backend/app/schemas/llm_config.py
+++ b/surfsense_backend/app/schemas/llm_config.py
@@ -26,6 +26,9 @@ class LLMConfigBase(BaseModel):
litellm_params: dict[str, Any] | None = Field(
default=None, description="Additional LiteLLM parameters"
)
+ language: str | None = Field(
+ default="English", max_length=50, description="Language for the LLM"
+ )
class LLMConfigCreate(LLMConfigBase):
@@ -49,6 +52,9 @@ class LLMConfigUpdate(BaseModel):
api_base: str | None = Field(
None, max_length=500, description="Optional API base URL"
)
+ language: str | None = Field(
+ None, max_length=50, description="Language for the LLM"
+ )
litellm_params: dict[str, Any] | None = Field(
None, description="Additional LiteLLM parameters"
)
diff --git a/surfsense_backend/app/services/connector_service.py b/surfsense_backend/app/services/connector_service.py
index f67246117..467354a17 100644
--- a/surfsense_backend/app/services/connector_service.py
+++ b/surfsense_backend/app/services/connector_service.py
@@ -1,6 +1,8 @@
import asyncio
from typing import Any
+from urllib.parse import urljoin
+import httpx
from linkup import LinkupClient
from sqlalchemy import func
from sqlalchemy.ext.asyncio import AsyncSession
@@ -113,7 +115,7 @@ class ConnectorService:
"title": document.get("title", "Untitled Document"),
"description": metadata.get(
"og:description",
- metadata.get("ogDescription", chunk.get("content", "")[:100]),
+ metadata.get("ogDescription", chunk.get("content", "")),
),
"url": metadata.get("url", ""),
}
@@ -187,7 +189,7 @@ class ConnectorService:
"title": document.get("title", "Untitled Document"),
"description": metadata.get(
"og:description",
- metadata.get("ogDescription", chunk.get("content", "")[:100]),
+ metadata.get("ogDescription", chunk.get("content", "")),
),
"url": metadata.get("url", ""),
}
@@ -328,7 +330,7 @@ class ConnectorService:
source = {
"id": self.source_id_counter,
"title": result.get("title", "Tavily Result"),
- "description": result.get("content", "")[:100],
+ "description": result.get("content", ""),
"url": result.get("url", ""),
}
sources_list.append(source)
@@ -372,6 +374,192 @@ class ConnectorService:
"sources": [],
}, []
+ async def search_searxng(
+ self,
+ user_query: str,
+ user_id: str,
+ search_space_id: int,
+ top_k: int = 20,
+ ) -> tuple:
+ """
+ Search using a configured SearxNG instance and return both sources and documents.
+ """
+ searx_connector = await self.get_connector_by_type(
+ user_id, SearchSourceConnectorType.SEARXNG_API, search_space_id
+ )
+
+ if not searx_connector:
+ return {
+ "id": 11,
+ "name": "SearxNG Search",
+ "type": "SEARXNG_API",
+ "sources": [],
+ }, []
+
+ config = searx_connector.config or {}
+ host = config.get("SEARXNG_HOST")
+
+ if not host:
+ print("SearxNG connector is missing SEARXNG_HOST configuration")
+ return {
+ "id": 11,
+ "name": "SearxNG Search",
+ "type": "SEARXNG_API",
+ "sources": [],
+ }, []
+
+ api_key = config.get("SEARXNG_API_KEY")
+ engines = config.get("SEARXNG_ENGINES")
+ categories = config.get("SEARXNG_CATEGORIES")
+ language = config.get("SEARXNG_LANGUAGE")
+ safesearch = config.get("SEARXNG_SAFESEARCH")
+
+ def _parse_bool(value: Any, default: bool = True) -> bool:
+ if isinstance(value, bool):
+ return value
+ if isinstance(value, str):
+ lowered = value.strip().lower()
+ if lowered in {"true", "1", "yes", "on"}:
+ return True
+ if lowered in {"false", "0", "no", "off"}:
+ return False
+ return default
+
+ verify_ssl = _parse_bool(config.get("SEARXNG_VERIFY_SSL", True))
+
+ safesearch_value: int | None = None
+ if isinstance(safesearch, str):
+ safesearch_clean = safesearch.strip()
+ if safesearch_clean.isdigit():
+ safesearch_value = int(safesearch_clean)
+ elif isinstance(safesearch, int | float):
+ safesearch_value = int(safesearch)
+
+ if safesearch_value is not None and not (0 <= safesearch_value <= 2):
+ safesearch_value = None
+
+ def _format_list(value: Any) -> str | None:
+ if value is None:
+ return None
+ if isinstance(value, str):
+ value = value.strip()
+ return value or None
+ if isinstance(value, list | tuple | set):
+ cleaned = [str(item).strip() for item in value if str(item).strip()]
+ return ",".join(cleaned) if cleaned else None
+ return str(value)
+
+ params: dict[str, Any] = {
+ "q": user_query,
+ "format": "json",
+ "language": language or "",
+ "limit": max(1, min(top_k, 50)),
+ }
+
+ engines_param = _format_list(engines)
+ if engines_param:
+ params["engines"] = engines_param
+
+ categories_param = _format_list(categories)
+ if categories_param:
+ params["categories"] = categories_param
+
+ if safesearch_value is not None:
+ params["safesearch"] = safesearch_value
+
+ if not params.get("language"):
+ params.pop("language")
+
+ headers = {"Accept": "application/json"}
+ if api_key:
+ headers["X-API-KEY"] = api_key
+
+ searx_endpoint = urljoin(host if host.endswith("/") else f"{host}/", "search")
+
+ try:
+ async with httpx.AsyncClient(timeout=20.0, verify=verify_ssl) as client:
+ response = await client.get(
+ searx_endpoint,
+ params=params,
+ headers=headers,
+ )
+ response.raise_for_status()
+ except httpx.HTTPError as exc:
+ print(f"Error searching with SearxNG: {exc!s}")
+ return {
+ "id": 11,
+ "name": "SearxNG Search",
+ "type": "SEARXNG_API",
+ "sources": [],
+ }, []
+
+ try:
+ data = response.json()
+ except ValueError:
+ print("Failed to decode JSON response from SearxNG")
+ return {
+ "id": 11,
+ "name": "SearxNG Search",
+ "type": "SEARXNG_API",
+ "sources": [],
+ }, []
+
+ searx_results = data.get("results", [])
+ if not searx_results:
+ return {
+ "id": 11,
+ "name": "SearxNG Search",
+ "type": "SEARXNG_API",
+ "sources": [],
+ }, []
+
+ sources_list: list[dict[str, Any]] = []
+ documents: list[dict[str, Any]] = []
+
+ async with self.counter_lock:
+ for result in searx_results:
+ description = result.get("content") or result.get("snippet") or ""
+
+ source = {
+ "id": self.source_id_counter,
+ "title": result.get("title", "SearxNG Result"),
+ "description": description,
+ "url": result.get("url", ""),
+ }
+ sources_list.append(source)
+
+ metadata = {
+ "url": result.get("url", ""),
+ "engines": result.get("engines", []),
+ "category": result.get("category"),
+ "source": "SEARXNG_API",
+ }
+
+ document = {
+ "chunk_id": self.source_id_counter,
+ "content": description or result.get("content", ""),
+ "score": result.get("score", 0.0),
+ "document": {
+ "id": self.source_id_counter,
+ "title": result.get("title", "SearxNG Result"),
+ "document_type": "SEARXNG_API",
+ "metadata": metadata,
+ },
+ }
+ documents.append(document)
+ self.source_id_counter += 1
+
+ result_object = {
+ "id": 11,
+ "name": "SearxNG Search",
+ "type": "SEARXNG_API",
+ "sources": sources_list,
+ }
+
+ return result_object, documents
+
async def search_slack(
self,
user_query: str,
@@ -433,9 +621,7 @@ class ConnectorService:
title += f" ({message_date})"
# Create a more descriptive description for Slack messages
- description = chunk.get("content", "")[:100]
- if len(description) == 100:
- description += "..."
+ description = chunk.get("content", "")
# For URL, we can use a placeholder or construct a URL to the Slack channel if available
url = ""
@@ -529,7 +715,7 @@ class ConnectorService:
title += f" (indexed: {indexed_at})"
# Create a more descriptive description for Notion pages
- description = chunk.get("content", "")[:100]
+ description = chunk.get("content", "")
if len(description) == 100:
description += "..."
@@ -641,7 +827,7 @@ class ConnectorService:
title += f" (visited: {visit_date})"
# Create a more descriptive description for extension data
- description = chunk.get("content", "")[:100]
+ description = chunk.get("content", "")
if len(description) == 100:
description += "..."
@@ -748,9 +934,7 @@ class ConnectorService:
title += f" - {channel_name}"
# Create a more descriptive description for YouTube videos
- description = metadata.get(
- "description", chunk.get("content", "")[:100]
- )
+ description = metadata.get("description", chunk.get("content", ""))
if len(description) == 100:
description += "..."
@@ -836,7 +1020,7 @@ class ConnectorService:
"title", "GitHub Document"
), # Use specific title if available
"description": metadata.get(
- "description", chunk.get("content", "")[:100]
+ "description", chunk.get("content", "")
), # Use description or content preview
"url": metadata.get("url", ""), # Use URL if available in metadata
}
@@ -922,7 +1106,7 @@ class ConnectorService:
title += f" ({issue_state})"
# Create a more descriptive description for Linear issues
- description = chunk.get("content", "")[:100]
+ description = chunk.get("content", "")
if len(description) == 100:
description += "..."
@@ -1034,7 +1218,7 @@ class ConnectorService:
title += f" ({status})"
# Create a more descriptive description for Jira issues
- description = chunk.get("content", "")[:100]
+ description = chunk.get("content", "")
if len(description) == 100:
description += "..."
@@ -1168,9 +1352,7 @@ class ConnectorService:
title += f" ({start_time})"
# Create a more descriptive description for calendar events
- description = chunk.get("content", "")[:100]
- if len(description) == 100:
- description += "..."
+ description = chunk.get("content", "")
# Add event info to description
info_parts = []
@@ -1385,9 +1567,7 @@ class ConnectorService:
title += f" (from {sender})"
# Create a more descriptive description for Gmail messages
- description = chunk.get("content", "")[:150]
- if len(description) == 150:
- description += "..."
+ description = chunk.get("content", "")
# Add message info to description
info_parts = []
@@ -1501,9 +1681,7 @@ class ConnectorService:
title += f" ({space_key})"
# Create a more descriptive description for Confluence pages
- description = chunk.get("content", "")[:100]
- if len(description) == 100:
- description += "..."
+ description = chunk.get("content", "")
# For URL, we can use a placeholder or construct a URL to the Confluence page if available
url = "" # TODO: Add base_url to metadata
@@ -1720,7 +1898,7 @@ class ConnectorService:
result.name if hasattr(result, "name") else "Linkup Result"
),
"description": (
- result.content[:100] if hasattr(result, "content") else ""
+ result.content if hasattr(result, "content") else ""
),
"url": result.url if hasattr(result, "url") else "",
}
@@ -1836,9 +2014,7 @@ class ConnectorService:
title += f" ({message_date})"
# Create a more descriptive description for Discord messages
- description = chunk.get("content", "")[:100]
- if len(description) == 100:
- description += "..."
+ description = chunk.get("content", "")
url = ""
guild_id = metadata.get("guild_id", "")
@@ -1955,10 +2131,7 @@ class ConnectorService:
except Exception:
title += f" ({start_time})"
- # Create a more descriptive description for Luma events
- description = chunk.get("content", "")[:150]
- if len(description) == 150:
- description += "..."
+ description = chunk.get("content", "")
# Add event info to description
info_parts = []
diff --git a/surfsense_backend/app/services/llm_service.py b/surfsense_backend/app/services/llm_service.py
index d9299549c..e322eb401 100644
--- a/surfsense_backend/app/services/llm_service.py
+++ b/surfsense_backend/app/services/llm_service.py
@@ -83,11 +83,11 @@ async def get_user_llm_instance(
)
return None
- # Build the model string for litellm
+ # Build the model string for litellm / 构建 LiteLLM 的模型字符串
if llm_config.custom_provider:
model_string = f"{llm_config.custom_provider}/{llm_config.model_name}"
else:
- # Map provider enum to litellm format
+ # Map provider enum to litellm format / 将提供商枚举映射为 LiteLLM 格式
provider_map = {
"OPENAI": "openai",
"ANTHROPIC": "anthropic",
@@ -99,6 +99,11 @@ async def get_user_llm_instance(
"AZURE_OPENAI": "azure",
"OPENROUTER": "openrouter",
"COMETAPI": "cometapi",
+ # Chinese LLM providers (OpenAI-compatible)
+ "DEEPSEEK": "openai", # DeepSeek uses OpenAI-compatible API
+ "ALIBABA_QWEN": "openai", # Qwen uses OpenAI-compatible API
+ "MOONSHOT": "openai", # Moonshot (Kimi) uses OpenAI-compatible API
+ "ZHIPU": "openai", # Zhipu (GLM) uses OpenAI-compatible API
# Add more mappings as needed
}
provider_prefix = provider_map.get(
diff --git a/surfsense_backend/app/services/streaming_service.py b/surfsense_backend/app/services/streaming_service.py
index 40bd430cd..98c0d3ac5 100644
--- a/surfsense_backend/app/services/streaming_service.py
+++ b/surfsense_backend/app/services/streaming_service.py
@@ -66,7 +66,7 @@ class StreamingService:
for source in group.get("sources", []):
node = {
"id": str(source.get("id", "")),
- "text": source.get("description", ""),
+ "text": source.get("description", "").strip(),
"url": source.get("url", ""),
"metadata": {
"title": source.get("title", ""),
diff --git a/surfsense_backend/app/services/stt_service.py b/surfsense_backend/app/services/stt_service.py
new file mode 100644
index 000000000..ea38480e8
--- /dev/null
+++ b/surfsense_backend/app/services/stt_service.py
@@ -0,0 +1,100 @@
+"""Local Speech-to-Text service using Faster-Whisper."""
+
+import os
+import tempfile
+from pathlib import Path
+
+from faster_whisper import WhisperModel
+
+from app.config import config
+
+
+class STTService:
+ """Local Speech-to-Text service using Faster-Whisper."""
+
+ def __init__(self):
+ """Initialize STT service with model from STT_SERVICE config."""
+ # Parse model from STT_SERVICE (e.g., "local/base" or "local/tiny")
+ stt_service = config.STT_SERVICE or "local/base"
+ if stt_service.startswith("local/"):
+ self.model_size = stt_service.split("/", 1)[1]
+ else:
+ self.model_size = "base" # fallback
+ self._model: WhisperModel | None = None
+
+ def _get_model(self) -> WhisperModel:
+ """Lazy load the Whisper model."""
+ if self._model is None:
+ # Use CPU with optimizations for better performance
+ self._model = WhisperModel(
+ self.model_size,
+ device="cpu",
+ compute_type="int8", # Quantization for faster CPU inference
+ num_workers=1, # Single worker for stability
+ )
+ return self._model
+
+ def transcribe_file(self, audio_path: str, language: str | None = None) -> dict:
+ """Transcribe audio file to text.
+
+ Args:
+ audio_path: Path to audio file
+ language: Optional language code (e.g., "en", "es")
+
+ Returns:
+ Dict with transcription text and metadata
+ """
+ model = self._get_model()
+
+ # Transcribe with optimized settings
+ segments, info = model.transcribe(
+ audio_path,
+ language=language,
+ beam_size=1, # Faster inference
+ best_of=1, # Single pass
+ temperature=0, # Deterministic output
+ vad_filter=True, # Voice activity detection
+ vad_parameters={"min_silence_duration_ms": 500},
+ )
+
+ # Combine all segments
+ text = " ".join(segment.text.strip() for segment in segments)
+
+ return {
+ "text": text,
+ "language": info.language,
+ "language_probability": info.language_probability,
+ "duration": info.duration,
+ }
+
+ def transcribe_bytes(
+ self,
+ audio_bytes: bytes,
+ filename: str = "audio.wav",
+ language: str | None = None,
+ ) -> dict:
+ """Transcribe audio from bytes.
+
+ Args:
+ audio_bytes: Audio file bytes
+ filename: Original filename for format detection
+ language: Optional language code
+
+ Returns:
+ Dict with transcription text and metadata
+ """
+ # Save bytes to temporary file
+ suffix = Path(filename).suffix or ".wav"
+ with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp_file:
+ tmp_file.write(audio_bytes)
+ tmp_path = tmp_file.name
+
+ try:
+ return self.transcribe_file(tmp_path, language)
+ finally:
+ # Clean up temp file
+ os.unlink(tmp_path)
+
+
+# Global STT service instance
+stt_service = STTService()
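+
+# Example usage (a rough sketch, not part of the service API; assumes a
+# "meeting.wav" file exists and faster-whisper is installed):
+#   with open("meeting.wav", "rb") as f:
+#       result = stt_service.transcribe_bytes(f.read(), filename="meeting.wav")
+#   print(result["text"], result["language"], result["duration"])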
diff --git a/surfsense_backend/app/services/task_logging_service.py b/surfsense_backend/app/services/task_logging_service.py
index 39316b71f..6ba9d0432 100644
--- a/surfsense_backend/app/services/task_logging_service.py
+++ b/surfsense_backend/app/services/task_logging_service.py
@@ -1,3 +1,4 @@
+import contextlib
import logging
from datetime import datetime
from typing import Any
@@ -73,6 +74,14 @@ class TaskLoggingService:
Returns:
Log: The updated log entry
"""
+ # Ensure session is in a valid state
+ if not self.session.is_active:
+ await self.session.rollback()
+
+ # Refresh log_entry to avoid expired state
+ with contextlib.suppress(Exception):
+ await self.session.refresh(log_entry)
+
# Update the existing log entry
log_entry.status = LogStatus.SUCCESS
log_entry.message = message
@@ -114,6 +123,14 @@ class TaskLoggingService:
Returns:
Log: The updated log entry
"""
+ # Ensure session is in a valid state
+ if not self.session.is_active:
+ await self.session.rollback()
+
+ # Refresh log_entry to avoid expired state
+ with contextlib.suppress(Exception):
+ await self.session.refresh(log_entry)
+
# Update the existing log entry
log_entry.status = LogStatus.FAILED
log_entry.level = LogLevel.ERROR
@@ -161,6 +178,14 @@ class TaskLoggingService:
Returns:
Log: The updated log entry
"""
+ # Ensure session is in a valid state
+ if not self.session.is_active:
+ await self.session.rollback()
+
+ # Refresh log_entry to avoid expired state
+ with contextlib.suppress(Exception):
+ await self.session.refresh(log_entry)
+
log_entry.message = progress_message
if progress_metadata:
diff --git a/surfsense_backend/app/tasks/connector_indexers/airtable_indexer.py b/surfsense_backend/app/tasks/connector_indexers/airtable_indexer.py
index 0cc21bb47..b670391eb 100644
--- a/surfsense_backend/app/tasks/connector_indexers/airtable_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/airtable_indexer.py
@@ -16,11 +16,12 @@ from app.utils.document_converters import (
create_document_chunks,
generate_content_hash,
generate_document_summary,
+ generate_unique_identifier_hash,
)
from .base import (
calculate_date_range,
- check_duplicate_document_by_hash,
+ check_document_by_unique_identifier,
get_connector_by_id,
logger,
update_connector_last_indexed,
@@ -240,25 +241,100 @@ async def index_airtable_records(
documents_skipped += 1
continue
+ record_id = record.get("id", "Unknown")
+
+ # Generate unique identifier hash for this Airtable record
+ unique_identifier_hash = generate_unique_identifier_hash(
+ DocumentType.AIRTABLE_CONNECTOR,
+ record_id,
+ search_space_id,
+ )
+
# Generate content hash
content_hash = generate_content_hash(
markdown_content, search_space_id
)
- # Check if document already exists
- existing_document_by_hash = (
- await check_duplicate_document_by_hash(
- session, content_hash
+ # Check if document with this unique identifier already exists
+ existing_document = (
+ await check_document_by_unique_identifier(
+ session, unique_identifier_hash
)
)
- if existing_document_by_hash:
- logger.info(
- f"Document with content hash {content_hash} already exists for message {record.get('id')}. Skipping processing."
- )
- documents_skipped += 1
- continue
+ if existing_document:
+ # Document exists - check if content has changed
+ if existing_document.content_hash == content_hash:
+ logger.info(
+ f"Document for Airtable record {record_id} unchanged. Skipping."
+ )
+ documents_skipped += 1
+ continue
+ else:
+ # Content has changed - update the existing document
+ logger.info(
+ f"Content changed for Airtable record {record_id}. Updating document."
+ )
+ # Generate document summary
+ user_llm = await get_user_long_context_llm(
+ session, user_id, search_space_id
+ )
+
+ if user_llm:
+ document_metadata = {
+ "record_id": record_id,
+ "created_time": record.get(
+ "CREATED_TIME()", ""
+ ),
+ "document_type": "Airtable Record",
+ "connector_type": "Airtable",
+ }
+ (
+ summary_content,
+ summary_embedding,
+ ) = await generate_document_summary(
+ markdown_content,
+ user_llm,
+ document_metadata,
+ )
+ else:
+ summary_content = (
+ f"Airtable Record: {record_id}\n\n"
+ )
+ summary_embedding = (
+ config.embedding_model_instance.embed(
+ summary_content
+ )
+ )
+
+ # Process chunks
+ chunks = await create_document_chunks(
+ markdown_content
+ )
+
+ # Update existing document
+ existing_document.title = (
+ f"Airtable Record: {record_id}"
+ )
+ existing_document.content = summary_content
+ existing_document.content_hash = content_hash
+ existing_document.embedding = summary_embedding
+ existing_document.document_metadata = {
+ "record_id": record_id,
+ "created_time": record.get(
+ "CREATED_TIME()", ""
+ ),
+ }
+ existing_document.chunks = chunks
+
+ documents_indexed += 1
+ logger.info(
+ f"Successfully updated Airtable record {record_id}"
+ )
+ continue
+
+ # Document doesn't exist - create new one
# Generate document summary
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
@@ -266,7 +342,7 @@ async def index_airtable_records(
if user_llm:
document_metadata = {
- "record_id": record.get("id", "Unknown"),
+ "record_id": record_id,
"created_time": record.get("CREATED_TIME()", ""),
"document_type": "Airtable Record",
"connector_type": "Airtable",
@@ -279,7 +355,7 @@ async def index_airtable_records(
)
else:
# Fallback to simple summary if no LLM configured
- summary_content = f"Airtable Record: {record.get('id', 'Unknown')}\n\n"
+ summary_content = f"Airtable Record: {record_id}\n\n"
summary_embedding = (
config.embedding_model_instance.embed(
summary_content
@@ -291,18 +367,19 @@ async def index_airtable_records(
# Create and store new document
logger.info(
- f"Creating new document for Airtable record: {record.get('id', 'Unknown')}"
+ f"Creating new document for Airtable record: {record_id}"
)
document = Document(
search_space_id=search_space_id,
- title=f"Airtable Record: {record.get('id', 'Unknown')}",
+ title=f"Airtable Record: {record_id}",
document_type=DocumentType.AIRTABLE_CONNECTOR,
document_metadata={
- "record_id": record.get("id", "Unknown"),
+ "record_id": record_id,
"created_time": record.get("CREATED_TIME()", ""),
},
content=summary_content,
content_hash=content_hash,
+ unique_identifier_hash=unique_identifier_hash,
embedding=summary_embedding,
chunks=chunks,
)
diff --git a/surfsense_backend/app/tasks/connector_indexers/base.py b/surfsense_backend/app/tasks/connector_indexers/base.py
index 6d6f823e1..052ae3f4a 100644
--- a/surfsense_backend/app/tasks/connector_indexers/base.py
+++ b/surfsense_backend/app/tasks/connector_indexers/base.py
@@ -37,6 +37,30 @@ async def check_duplicate_document_by_hash(
return existing_doc_result.scalars().first()
+async def check_document_by_unique_identifier(
+ session: AsyncSession, unique_identifier_hash: str
+) -> Document | None:
+ """
+ Check if a document with the given unique identifier hash already exists.
+ Eagerly loads chunks to avoid lazy loading issues during updates.
+
+ Args:
+ session: Database session
+ unique_identifier_hash: Hash of the unique identifier from the source system
+
+ Returns:
+ Existing document if found, None otherwise
+ """
+ from sqlalchemy.orm import selectinload
+
+ existing_doc_result = await session.execute(
+ select(Document)
+ .options(selectinload(Document.chunks))
+ .where(Document.unique_identifier_hash == unique_identifier_hash)
+ )
+ return existing_doc_result.scalars().first()
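+
+# Usage sketch (illustrative, mirroring how the connector indexers use it):
+#   uid_hash = generate_unique_identifier_hash(DocumentType.NOTION_CONNECTOR, page_id, search_space_id)
+#   existing = await check_document_by_unique_identifier(session, uid_hash)
+#   # then compare existing.content_hash to decide whether to skip, update, or create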
+
+
async def get_connector_by_id(
session: AsyncSession, connector_id: int, connector_type: SearchSourceConnectorType
) -> SearchSourceConnector | None:
diff --git a/surfsense_backend/app/tasks/connector_indexers/clickup_indexer.py b/surfsense_backend/app/tasks/connector_indexers/clickup_indexer.py
index 5ee7342fa..4c057946b 100644
--- a/surfsense_backend/app/tasks/connector_indexers/clickup_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/clickup_indexer.py
@@ -16,10 +16,11 @@ from app.utils.document_converters import (
create_document_chunks,
generate_content_hash,
generate_document_summary,
+ generate_unique_identifier_hash,
)
from .base import (
- check_duplicate_document_by_hash,
+ check_document_by_unique_identifier,
get_connector_by_id,
logger,
update_connector_last_indexed,
@@ -209,18 +210,92 @@ async def index_clickup_tasks(
documents_skipped += 1
continue
- # Hash for duplicates
- content_hash = generate_content_hash(task_content, search_space_id)
- existing_document_by_hash = await check_duplicate_document_by_hash(
- session, content_hash
+ # Generate unique identifier hash for this ClickUp task
+ unique_identifier_hash = generate_unique_identifier_hash(
+ DocumentType.CLICKUP_CONNECTOR, task_id, search_space_id
)
- if existing_document_by_hash:
- logger.info(
- f"Document with content hash {content_hash} already exists for task {task_name}. Skipping processing."
- )
- documents_skipped += 1
- continue
+ # Generate content hash
+ content_hash = generate_content_hash(task_content, search_space_id)
+
+ # Check if document with this unique identifier already exists
+ existing_document = await check_document_by_unique_identifier(
+ session, unique_identifier_hash
+ )
+
+ if existing_document:
+ # Document exists - check if content has changed
+ if existing_document.content_hash == content_hash:
+ logger.info(
+ f"Document for ClickUp task {task_name} unchanged. Skipping."
+ )
+ documents_skipped += 1
+ continue
+ else:
+ # Content has changed - update the existing document
+ logger.info(
+ f"Content changed for ClickUp task {task_name}. Updating document."
+ )
+
+ # Generate summary with metadata
+ user_llm = await get_user_long_context_llm(
+ session, user_id, search_space_id
+ )
+
+ if user_llm:
+ document_metadata = {
+ "task_id": task_id,
+ "task_name": task_name,
+ "task_status": task_status,
+ "task_priority": task_priority,
+ "task_list": task_list_name,
+ "task_space": task_space_name,
+ "assignees": len(task_assignees),
+ "document_type": "ClickUp Task",
+ "connector_type": "ClickUp",
+ }
+ (
+ summary_content,
+ summary_embedding,
+ ) = await generate_document_summary(
+ task_content, user_llm, document_metadata
+ )
+ else:
+ summary_content = task_content
+ summary_embedding = (
+ config.embedding_model_instance.embed(task_content)
+ )
+
+ # Process chunks
+ chunks = await create_document_chunks(task_content)
+
+ # Update existing document
+ existing_document.title = f"Task - {task_name}"
+ existing_document.content = summary_content
+ existing_document.content_hash = content_hash
+ existing_document.embedding = summary_embedding
+ existing_document.document_metadata = {
+ "task_id": task_id,
+ "task_name": task_name,
+ "task_status": task_status,
+ "task_priority": task_priority,
+ "task_assignees": task_assignees,
+ "task_due_date": task_due_date,
+ "task_created": task_created,
+ "task_updated": task_updated,
+ "indexed_at": datetime.now().strftime(
+ "%Y-%m-%d %H:%M:%S"
+ ),
+ }
+ existing_document.chunks = chunks
+
+ documents_indexed += 1
+ logger.info(
+ f"Successfully updated ClickUp task {task_name}"
+ )
+ continue
+
+ # Document doesn't exist - create new one
# Generate summary with metadata
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
@@ -270,6 +345,7 @@ async def index_clickup_tasks(
},
content=summary_content,
content_hash=content_hash,
+ unique_identifier_hash=unique_identifier_hash,
embedding=summary_embedding,
chunks=chunks,
)
diff --git a/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py b/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py
index 28cb3b1f4..afdbdd177 100644
--- a/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py
@@ -16,11 +16,12 @@ from app.utils.document_converters import (
create_document_chunks,
generate_content_hash,
generate_document_summary,
+ generate_unique_identifier_hash,
)
from .base import (
calculate_date_range,
- check_duplicate_document_by_hash,
+ check_document_by_unique_identifier,
get_connector_by_id,
logger,
update_connector_last_indexed,
@@ -217,26 +218,97 @@ async def index_confluence_pages(
documents_skipped += 1
continue
+ # Generate unique identifier hash for this Confluence page
+ unique_identifier_hash = generate_unique_identifier_hash(
+ DocumentType.CONFLUENCE_CONNECTOR, page_id, search_space_id
+ )
+
# Generate content hash
content_hash = generate_content_hash(full_content, search_space_id)
- # Check if document already exists
- existing_document_by_hash = await check_duplicate_document_by_hash(
- session, content_hash
+ # Check if document with this unique identifier already exists
+ existing_document = await check_document_by_unique_identifier(
+ session, unique_identifier_hash
)
- if existing_document_by_hash:
- logger.info(
- f"Document with content hash {content_hash} already exists for page {page_title}. Skipping processing."
- )
- documents_skipped += 1
- continue
+ comment_count = len(comments)
+ if existing_document:
+ # Document exists - check if content has changed
+ if existing_document.content_hash == content_hash:
+ logger.info(
+ f"Document for Confluence page {page_title} unchanged. Skipping."
+ )
+ documents_skipped += 1
+ continue
+ else:
+ # Content has changed - update the existing document
+ logger.info(
+ f"Content changed for Confluence page {page_title}. Updating document."
+ )
+
+ # Generate summary with metadata
+ user_llm = await get_user_long_context_llm(
+ session, user_id, search_space_id
+ )
+
+ if user_llm:
+ document_metadata = {
+ "page_title": page_title,
+ "page_id": page_id,
+ "space_id": space_id,
+ "comment_count": comment_count,
+ "document_type": "Confluence Page",
+ "connector_type": "Confluence",
+ }
+ (
+ summary_content,
+ summary_embedding,
+ ) = await generate_document_summary(
+ full_content, user_llm, document_metadata
+ )
+ else:
+ summary_content = f"Confluence Page: {page_title}\n\nSpace ID: {space_id}\n\n"
+ if page_content:
+ content_preview = page_content[:1000]
+ if len(page_content) > 1000:
+ content_preview += "..."
+ summary_content += (
+ f"Content Preview: {content_preview}\n\n"
+ )
+ summary_content += f"Comments: {comment_count}"
+ summary_embedding = config.embedding_model_instance.embed(
+ summary_content
+ )
+
+ # Process chunks
+ chunks = await create_document_chunks(full_content)
+
+ # Update existing document
+ existing_document.title = f"Confluence - {page_title}"
+ existing_document.content = summary_content
+ existing_document.content_hash = content_hash
+ existing_document.embedding = summary_embedding
+ existing_document.document_metadata = {
+ "page_id": page_id,
+ "page_title": page_title,
+ "space_id": space_id,
+ "comment_count": comment_count,
+ "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+ }
+ existing_document.chunks = chunks
+
+ documents_indexed += 1
+ logger.info(
+ f"Successfully updated Confluence page {page_title}"
+ )
+ continue
+
+ # Document doesn't exist - create new one
# Generate summary with metadata
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
- comment_count = len(comments)
if user_llm:
document_metadata = {
@@ -260,8 +332,8 @@ async def index_confluence_pages(
)
if page_content:
- # Take first 500 characters of content for summary
+ # Take first 1000 characters of content for summary
- content_preview = page_content[:500]
- if len(page_content) > 500:
+ content_preview = page_content[:1000]
+ if len(page_content) > 1000:
content_preview += "..."
summary_content += f"Content Preview: {content_preview}\n\n"
summary_content += f"Comments: {comment_count}"
@@ -287,6 +359,7 @@ async def index_confluence_pages(
},
content=summary_content,
content_hash=content_hash,
+ unique_identifier_hash=unique_identifier_hash,
embedding=summary_embedding,
chunks=chunks,
)
diff --git a/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py b/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py
index 08c995f64..b08a36132 100644
--- a/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py
@@ -16,11 +16,12 @@ from app.utils.document_converters import (
create_document_chunks,
generate_content_hash,
generate_document_summary,
+ generate_unique_identifier_hash,
)
from .base import (
build_document_metadata_string,
- check_duplicate_document_by_hash,
+ check_document_by_unique_identifier,
get_connector_by_id,
logger,
update_connector_last_indexed,
@@ -307,23 +308,98 @@ async def index_discord_messages(
combined_document_string = build_document_metadata_string(
metadata_sections
)
+
+ # Generate unique identifier hash for this Discord channel
+ unique_identifier_hash = generate_unique_identifier_hash(
+ DocumentType.DISCORD_CONNECTOR, channel_id, search_space_id
+ )
+
+ # Generate content hash
content_hash = generate_content_hash(
combined_document_string, search_space_id
)
- # Skip duplicates by hash
- existing_document_by_hash = (
- await check_duplicate_document_by_hash(
- session, content_hash
- )
+ # Check if document with this unique identifier already exists
+ existing_document = await check_document_by_unique_identifier(
+ session, unique_identifier_hash
)
- if existing_document_by_hash:
- logger.info(
- f"Document with content hash {content_hash} already exists for channel {guild_name}#{channel_name}. Skipping processing."
- )
- documents_skipped += 1
- continue
+ if existing_document:
+ # Document exists - check if content has changed
+ if existing_document.content_hash == content_hash:
+ logger.info(
+ f"Document for Discord channel {guild_name}#{channel_name} unchanged. Skipping."
+ )
+ documents_skipped += 1
+ continue
+ else:
+ # Content has changed - update the existing document
+ logger.info(
+ f"Content changed for Discord channel {guild_name}#{channel_name}. Updating document."
+ )
+
+ # Get user's long context LLM
+ user_llm = await get_user_long_context_llm(
+ session, user_id, search_space_id
+ )
+ if not user_llm:
+ logger.error(
+ f"No long context LLM configured for user {user_id}"
+ )
+ skipped_channels.append(
+ f"{guild_name}#{channel_name} (no LLM configured)"
+ )
+ documents_skipped += 1
+ continue
+
+ # Generate summary with metadata
+ document_metadata = {
+ "guild_name": guild_name,
+ "channel_name": channel_name,
+ "message_count": len(formatted_messages),
+ "document_type": "Discord Channel Messages",
+ "connector_type": "Discord",
+ }
+ (
+ summary_content,
+ summary_embedding,
+ ) = await generate_document_summary(
+ combined_document_string,
+ user_llm,
+ document_metadata,
+ )
+
+ # Chunks from channel content
+ chunks = await create_document_chunks(channel_content)
+
+ # Update existing document
+ existing_document.title = (
+ f"Discord - {guild_name}#{channel_name}"
+ )
+ existing_document.content = summary_content
+ existing_document.content_hash = content_hash
+ existing_document.embedding = summary_embedding
+ existing_document.document_metadata = {
+ "guild_name": guild_name,
+ "guild_id": guild_id,
+ "channel_name": channel_name,
+ "channel_id": channel_id,
+ "message_count": len(formatted_messages),
+ "start_date": start_date_iso,
+ "end_date": end_date_iso,
+ "indexed_at": datetime.now(UTC).strftime(
+ "%Y-%m-%d %H:%M:%S"
+ ),
+ }
+ existing_document.chunks = chunks
+
+ documents_indexed += 1
+ logger.info(
+ f"Successfully updated Discord channel {guild_name}#{channel_name}"
+ )
+ continue
+
+ # Document doesn't exist - create new one
# Get user's long context LLM
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
@@ -375,6 +451,7 @@ async def index_discord_messages(
},
content=summary_content,
content_hash=content_hash,
+ unique_identifier_hash=unique_identifier_hash,
embedding=summary_embedding,
chunks=chunks,
)
diff --git a/surfsense_backend/app/tasks/connector_indexers/github_indexer.py b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py
index 9cc0c0993..8cd8ca299 100644
--- a/surfsense_backend/app/tasks/connector_indexers/github_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py
@@ -16,10 +16,11 @@ from app.utils.document_converters import (
create_document_chunks,
generate_content_hash,
generate_document_summary,
+ generate_unique_identifier_hash,
)
from .base import (
- check_duplicate_document_by_hash,
+ check_document_by_unique_identifier,
get_connector_by_id,
logger,
)
@@ -199,19 +200,101 @@ async def index_github_repos(
)
continue # Skip if content fetch failed
- content_hash = generate_content_hash(file_content, search_space_id)
-
- # Check if document with this content hash already exists
- existing_document_by_hash = await check_duplicate_document_by_hash(
- session, content_hash
+ # Generate unique identifier hash for this GitHub file
+ unique_identifier_hash = generate_unique_identifier_hash(
+ DocumentType.GITHUB_CONNECTOR, full_path_key, search_space_id
)
- if existing_document_by_hash:
- logger.info(
- f"Document with content hash {content_hash} already exists for file {full_path_key}. Skipping processing."
- )
- continue
+ # Generate content hash
+ content_hash = generate_content_hash(file_content, search_space_id)
+ # Check if document with this unique identifier already exists
+ existing_document = await check_document_by_unique_identifier(
+ session, unique_identifier_hash
+ )
+
+ if existing_document:
+ # Document exists - check if content has changed
+ if existing_document.content_hash == content_hash:
+ logger.info(
+ f"Document for GitHub file {full_path_key} unchanged. Skipping."
+ )
+ continue
+ else:
+ # Content has changed - update the existing document
+ logger.info(
+ f"Content changed for GitHub file {full_path_key}. Updating document."
+ )
+
+ # Generate summary with metadata
+ user_llm = await get_user_long_context_llm(
+ session, user_id, search_space_id
+ )
+ if user_llm:
+ file_extension = (
+ file_path.split(".")[-1]
+ if "." in file_path
+ else None
+ )
+ document_metadata = {
+ "file_path": full_path_key,
+ "repository": repo_full_name,
+ "file_type": file_extension or "unknown",
+ "document_type": "GitHub Repository File",
+ "connector_type": "GitHub",
+ }
+ (
+ summary_content,
+ summary_embedding,
+ ) = await generate_document_summary(
+ file_content, user_llm, document_metadata
+ )
+ else:
+ summary_content = f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..."
+ summary_embedding = (
+ config.embedding_model_instance.embed(
+ summary_content
+ )
+ )
+
+ # Chunk the content
+ try:
+ chunks_data = await create_document_chunks(file_content)
+ except Exception as chunk_err:
+ logger.error(
+ f"Failed to chunk file {full_path_key}: {chunk_err}"
+ )
+ continue
+
+ # Update existing document
+ existing_document.title = f"GitHub - {full_path_key}"
+ existing_document.content = summary_content
+ existing_document.content_hash = content_hash
+ existing_document.embedding = summary_embedding
+ existing_document.document_metadata = {
+ "file_path": file_path,
+ "file_sha": file_sha,
+ "file_url": file_url,
+ "repository": repo_full_name,
+ "indexed_at": datetime.now(UTC).strftime(
+ "%Y-%m-%d %H:%M:%S"
+ ),
+ }
+ existing_document.chunks = chunks_data
+
+ logger.info(
+ f"Successfully updated GitHub file {full_path_key}"
+ )
+ continue
+
+ # Document doesn't exist - create new one
# Generate summary with metadata
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
@@ -290,6 +373,7 @@ async def index_github_repos(
document_metadata=doc_metadata,
content=summary_content, # Store summary
content_hash=content_hash,
+ unique_identifier_hash=unique_identifier_hash,
embedding=summary_embedding,
search_space_id=search_space_id,
chunks=chunks_data, # Associate chunks directly
diff --git a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
index be5169612..b7d8e0b59 100644
--- a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
@@ -17,9 +17,11 @@ from app.utils.document_converters import (
create_document_chunks,
generate_content_hash,
generate_document_summary,
+ generate_unique_identifier_hash,
)
from .base import (
+ check_document_by_unique_identifier,
get_connector_by_id,
logger,
update_connector_last_indexed,
@@ -248,23 +250,99 @@ async def index_google_calendar_events(
location = event.get("location", "")
description = event.get("description", "")
+ # Generate unique identifier hash for this Google Calendar event
+ unique_identifier_hash = generate_unique_identifier_hash(
+ DocumentType.GOOGLE_CALENDAR_CONNECTOR, event_id, search_space_id
+ )
+
+ # Generate content hash
content_hash = generate_content_hash(event_markdown, search_space_id)
- # Duplicate check via simple query using helper in base
- from .base import (
- check_duplicate_document_by_hash, # local import to avoid circular at module import
+ # Check if document with this unique identifier already exists
+ existing_document = await check_document_by_unique_identifier(
+ session, unique_identifier_hash
)
- existing_document_by_hash = await check_duplicate_document_by_hash(
- session, content_hash
- )
- if existing_document_by_hash:
- logger.info(
- f"Document with content hash {content_hash} already exists for event {event_summary}. Skipping processing."
- )
- documents_skipped += 1
- continue
+ if existing_document:
+ # Document exists - check if content has changed
+ if existing_document.content_hash == content_hash:
+ logger.info(
+ f"Document for Google Calendar event {event_summary} unchanged. Skipping."
+ )
+ documents_skipped += 1
+ continue
+ else:
+ # Content has changed - update the existing document
+ logger.info(
+ f"Content changed for Google Calendar event {event_summary}. Updating document."
+ )
+ # Generate summary with metadata
+ user_llm = await get_user_long_context_llm(
+ session, user_id, search_space_id
+ )
+
+ if user_llm:
+ document_metadata = {
+ "event_id": event_id,
+ "event_summary": event_summary,
+ "calendar_id": calendar_id,
+ "start_time": start_time,
+ "end_time": end_time,
+ "location": location or "No location",
+ "document_type": "Google Calendar Event",
+ "connector_type": "Google Calendar",
+ }
+ (
+ summary_content,
+ summary_embedding,
+ ) = await generate_document_summary(
+ event_markdown, user_llm, document_metadata
+ )
+ else:
+ summary_content = (
+ f"Google Calendar Event: {event_summary}\n\n"
+ )
+ summary_content += f"Calendar: {calendar_id}\n"
+ summary_content += f"Start: {start_time}\n"
+ summary_content += f"End: {end_time}\n"
+ if location:
+ summary_content += f"Location: {location}\n"
+ if description:
+ desc_preview = description[:1000]
+ if len(description) > 1000:
+ desc_preview += "..."
+ summary_content += f"Description: {desc_preview}\n"
+ summary_embedding = config.embedding_model_instance.embed(
+ summary_content
+ )
+
+ # Process chunks
+ chunks = await create_document_chunks(event_markdown)
+
+ # Update existing document
+ existing_document.title = f"Calendar Event - {event_summary}"
+ existing_document.content = summary_content
+ existing_document.content_hash = content_hash
+ existing_document.embedding = summary_embedding
+ existing_document.document_metadata = {
+ "event_id": event_id,
+ "event_summary": event_summary,
+ "calendar_id": calendar_id,
+ "start_time": start_time,
+ "end_time": end_time,
+ "location": location,
+ "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+ }
+ existing_document.chunks = chunks
+
+ documents_indexed += 1
+ logger.info(
+ f"Successfully updated Google Calendar event {event_summary}"
+ )
+ continue
+
+ # Document doesn't exist - create new one
# Generate summary with metadata
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
@@ -296,8 +374,8 @@ async def index_google_calendar_events(
if location:
summary_content += f"Location: {location}\n"
if description:
- desc_preview = description[:300]
- if len(description) > 300:
+ desc_preview = description[:1000]
+ if len(description) > 1000:
desc_preview += "..."
summary_content += f"Description: {desc_preview}\n"
summary_embedding = config.embedding_model_instance.embed(
@@ -320,6 +398,7 @@ async def index_google_calendar_events(
},
content=summary_content,
content_hash=content_hash,
+ unique_identifier_hash=unique_identifier_hash,
embedding=summary_embedding,
chunks=chunks,
)
diff --git a/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py
index 872e19d03..9d3823741 100644
--- a/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py
@@ -21,10 +21,11 @@ from app.utils.document_converters import (
create_document_chunks,
generate_content_hash,
generate_document_summary,
+ generate_unique_identifier_hash,
)
from .base import (
- check_duplicate_document_by_hash,
+ check_document_by_unique_identifier,
get_connector_by_id,
logger,
update_connector_last_indexed,
@@ -194,21 +195,85 @@ async def index_google_gmail_messages(
documents_skipped += 1
continue
+ # Generate unique identifier hash for this Gmail message
+ unique_identifier_hash = generate_unique_identifier_hash(
+ DocumentType.GOOGLE_GMAIL_CONNECTOR, message_id, search_space_id
+ )
+
# Generate content hash
content_hash = generate_content_hash(markdown_content, search_space_id)
- # Check if document already exists
- existing_document_by_hash = await check_duplicate_document_by_hash(
- session, content_hash
+ # Check if document with this unique identifier already exists
+ existing_document = await check_document_by_unique_identifier(
+ session, unique_identifier_hash
)
- if existing_document_by_hash:
- logger.info(
- f"Document with content hash {content_hash} already exists for message {message_id}. Skipping processing."
- )
- documents_skipped += 1
- continue
+ if existing_document:
+ # Document exists - check if content has changed
+ if existing_document.content_hash == content_hash:
+ logger.info(
+ f"Document for Gmail message {subject} unchanged. Skipping."
+ )
+ documents_skipped += 1
+ continue
+ else:
+ # Content has changed - update the existing document
+ logger.info(
+ f"Content changed for Gmail message {subject}. Updating document."
+ )
+ # Generate summary with metadata
+ user_llm = await get_user_long_context_llm(
+ session, user_id, search_space_id
+ )
+
+ if user_llm:
+ document_metadata = {
+ "message_id": message_id,
+ "thread_id": thread_id,
+ "subject": subject,
+ "sender": sender,
+ "date": date_str,
+ "document_type": "Gmail Message",
+ "connector_type": "Google Gmail",
+ }
+ (
+ summary_content,
+ summary_embedding,
+ ) = await generate_document_summary(
+ markdown_content, user_llm, document_metadata
+ )
+ else:
+ summary_content = f"Google Gmail Message: {subject}\n\n"
+ summary_content += f"Sender: {sender}\n"
+ summary_content += f"Date: {date_str}\n"
+ summary_embedding = config.embedding_model_instance.embed(
+ summary_content
+ )
+
+ # Process chunks
+ chunks = await create_document_chunks(markdown_content)
+
+ # Update existing document
+ existing_document.title = f"Gmail: {subject}"
+ existing_document.content = summary_content
+ existing_document.content_hash = content_hash
+ existing_document.embedding = summary_embedding
+ existing_document.document_metadata = {
+ "message_id": message_id,
+ "thread_id": thread_id,
+ "subject": subject,
+ "sender": sender,
+ "date": date_str,
+ "connector_id": connector_id,
+ }
+ existing_document.chunks = chunks
+
+ documents_indexed += 1
+ logger.info(f"Successfully updated Gmail message {subject}")
+ continue
+
+ # Document doesn't exist - create new one
# Generate summary with metadata
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
@@ -258,6 +323,7 @@ async def index_google_gmail_messages(
},
content=summary_content,
content_hash=content_hash,
+ unique_identifier_hash=unique_identifier_hash,
embedding=summary_embedding,
chunks=chunks,
)
diff --git a/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py b/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py
index e9d556954..36e09c81e 100644
--- a/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py
@@ -16,11 +16,12 @@ from app.utils.document_converters import (
create_document_chunks,
generate_content_hash,
generate_document_summary,
+ generate_unique_identifier_hash,
)
from .base import (
calculate_date_range,
- check_duplicate_document_by_hash,
+ check_document_by_unique_identifier,
get_connector_by_id,
logger,
update_connector_last_indexed,
@@ -200,26 +201,96 @@ async def index_jira_issues(
documents_skipped += 1
continue
+ # Generate unique identifier hash for this Jira issue
+ unique_identifier_hash = generate_unique_identifier_hash(
+ DocumentType.JIRA_CONNECTOR, issue_id, search_space_id
+ )
+
# Generate content hash
content_hash = generate_content_hash(issue_content, search_space_id)
- # Check if document already exists
- existing_document_by_hash = await check_duplicate_document_by_hash(
- session, content_hash
+ # Check if document with this unique identifier already exists
+ existing_document = await check_document_by_unique_identifier(
+ session, unique_identifier_hash
)
- if existing_document_by_hash:
- logger.info(
- f"Document with content hash {content_hash} already exists for issue {issue_identifier}. Skipping processing."
- )
- documents_skipped += 1
- continue
+ comment_count = len(formatted_issue.get("comments", []))
+ if existing_document:
+ # Document exists - check if content has changed
+ if existing_document.content_hash == content_hash:
+ logger.info(
+ f"Document for Jira issue {issue_identifier} unchanged. Skipping."
+ )
+ documents_skipped += 1
+ continue
+ else:
+ # Content has changed - update the existing document
+ logger.info(
+ f"Content changed for Jira issue {issue_identifier}. Updating document."
+ )
+
+ # Generate summary with metadata
+ user_llm = await get_user_long_context_llm(
+ session, user_id, search_space_id
+ )
+
+ if user_llm:
+ document_metadata = {
+ "issue_key": issue_identifier,
+ "issue_title": issue_title,
+ "status": formatted_issue.get("status", "Unknown"),
+ "priority": formatted_issue.get("priority", "Unknown"),
+ "comment_count": comment_count,
+ "document_type": "Jira Issue",
+ "connector_type": "Jira",
+ }
+ (
+ summary_content,
+ summary_embedding,
+ ) = await generate_document_summary(
+ issue_content, user_llm, document_metadata
+ )
+ else:
+ summary_content = f"Jira Issue {issue_identifier}: {issue_title}\n\nStatus: {formatted_issue.get('status', 'Unknown')}\n\n"
+ if formatted_issue.get("description"):
+ summary_content += f"Description: {formatted_issue.get('description')}\n\n"
+ summary_content += f"Comments: {comment_count}"
+ summary_embedding = config.embedding_model_instance.embed(
+ summary_content
+ )
+
+ # Process chunks
+ chunks = await create_document_chunks(issue_content)
+
+ # Update existing document
+ existing_document.title = (
+ f"Jira - {issue_identifier}: {issue_title}"
+ )
+ existing_document.content = summary_content
+ existing_document.content_hash = content_hash
+ existing_document.embedding = summary_embedding
+ existing_document.document_metadata = {
+ "issue_id": issue_id,
+ "issue_identifier": issue_identifier,
+ "issue_title": issue_title,
+ "state": formatted_issue.get("status", "Unknown"),
+ "comment_count": comment_count,
+ "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+ }
+ existing_document.chunks = chunks
+
+ documents_indexed += 1
+ logger.info(
+ f"Successfully updated Jira issue {issue_identifier}"
+ )
+ continue
+
+ # Document doesn't exist - create new one
# Generate summary with metadata
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
- comment_count = len(formatted_issue.get("comments", []))
if user_llm:
document_metadata = {
@@ -270,6 +341,7 @@ async def index_jira_issues(
},
content=summary_content,
content_hash=content_hash,
+ unique_identifier_hash=unique_identifier_hash,
embedding=summary_embedding,
chunks=chunks,
)
diff --git a/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py b/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py
index aca1e2040..33d5835ee 100644
--- a/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py
@@ -16,11 +16,12 @@ from app.utils.document_converters import (
create_document_chunks,
generate_content_hash,
generate_document_summary,
+ generate_unique_identifier_hash,
)
from .base import (
calculate_date_range,
- check_duplicate_document_by_hash,
+ check_document_by_unique_identifier,
get_connector_by_id,
logger,
update_connector_last_indexed,
@@ -213,27 +214,101 @@ async def index_linear_issues(
documents_skipped += 1
continue
- content_hash = generate_content_hash(issue_content, search_space_id)
-
- # Check if document with this content hash already exists
- existing_document_by_hash = await check_duplicate_document_by_hash(
- session, content_hash
+ # Generate unique identifier hash for this Linear issue
+ unique_identifier_hash = generate_unique_identifier_hash(
+ DocumentType.LINEAR_CONNECTOR, issue_id, search_space_id
)
- if existing_document_by_hash:
- logger.info(
- f"Document with content hash {content_hash} already exists for issue {issue_identifier}. Skipping processing."
- )
- documents_skipped += 1
- continue
+ # Generate content hash
+ content_hash = generate_content_hash(issue_content, search_space_id)
+ # Check if document with this unique identifier already exists
+ existing_document = await check_document_by_unique_identifier(
+ session, unique_identifier_hash
+ )
+
+ state = formatted_issue.get("state", "Unknown")
+ description = formatted_issue.get("description", "")
+ comment_count = len(formatted_issue.get("comments", []))
+
+ if existing_document:
+ # Document exists - check if content has changed
+ if existing_document.content_hash == content_hash:
+ logger.info(
+ f"Document for Linear issue {issue_identifier} unchanged. Skipping."
+ )
+ documents_skipped += 1
+ continue
+ else:
+ # Content has changed - update the existing document
+ logger.info(
+ f"Content changed for Linear issue {issue_identifier}. Updating document."
+ )
+
+ # Generate summary with metadata
+ user_llm = await get_user_long_context_llm(
+ session, user_id, search_space_id
+ )
+
+ if user_llm:
+ document_metadata = {
+ "issue_id": issue_identifier,
+ "issue_title": issue_title,
+ "state": state,
+ "priority": formatted_issue.get("priority", "Unknown"),
+ "comment_count": comment_count,
+ "document_type": "Linear Issue",
+ "connector_type": "Linear",
+ }
+ (
+ summary_content,
+ summary_embedding,
+ ) = await generate_document_summary(
+ issue_content, user_llm, document_metadata
+ )
+ else:
+ # Fallback to simple summary if no LLM configured
+ if description and len(description) > 1000:
+ description = description[:997] + "..."
+ summary_content = f"Linear Issue {issue_identifier}: {issue_title}\n\nStatus: {state}\n\n"
+ if description:
+ summary_content += f"Description: {description}\n\n"
+ summary_content += f"Comments: {comment_count}"
+ summary_embedding = config.embedding_model_instance.embed(
+ summary_content
+ )
+
+ # Process chunks
+ chunks = await create_document_chunks(issue_content)
+
+ # Update existing document
+ existing_document.title = (
+ f"Linear - {issue_identifier}: {issue_title}"
+ )
+ existing_document.content = summary_content
+ existing_document.content_hash = content_hash
+ existing_document.embedding = summary_embedding
+ existing_document.document_metadata = {
+ "issue_id": issue_id,
+ "issue_identifier": issue_identifier,
+ "issue_title": issue_title,
+ "state": state,
+ "comment_count": comment_count,
+ "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+ }
+ existing_document.chunks = chunks
+
+ documents_indexed += 1
+ logger.info(
+ f"Successfully updated Linear issue {issue_identifier}"
+ )
+ continue
+
+ # Document doesn't exist - create new one
# Generate summary with metadata
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
- state = formatted_issue.get("state", "Unknown")
- description = formatted_issue.get("description", "")
- comment_count = len(formatted_issue.get("comments", []))
if user_llm:
document_metadata = {
@@ -254,8 +329,8 @@ async def index_linear_issues(
else:
# Fallback to simple summary if no LLM configured
# Truncate description if it's too long for the summary
- if description and len(description) > 500:
- description = description[:497] + "..."
+ if description and len(description) > 1000:
+ description = description[:997] + "..."
summary_content = f"Linear Issue {issue_identifier}: {issue_title}\n\nStatus: {state}\n\n"
if description:
summary_content += f"Description: {description}\n\n"
@@ -285,6 +360,7 @@ async def index_linear_issues(
},
content=summary_content,
content_hash=content_hash,
+ unique_identifier_hash=unique_identifier_hash,
embedding=summary_embedding,
chunks=chunks,
)
diff --git a/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py b/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py
index 3d8970654..15588afaa 100644
--- a/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py
@@ -16,9 +16,11 @@ from app.utils.document_converters import (
create_document_chunks,
generate_content_hash,
generate_document_summary,
+ generate_unique_identifier_hash,
)
from .base import (
+ check_document_by_unique_identifier,
get_connector_by_id,
logger,
update_connector_last_indexed,
@@ -254,21 +256,108 @@ async def index_luma_events(
description = event_data.get("description", "")
cover_url = event_data.get("cover_url", "")
+ # Generate unique identifier hash for this Luma event
+ unique_identifier_hash = generate_unique_identifier_hash(
+ DocumentType.LUMA_CONNECTOR, event_id, search_space_id
+ )
+
+ # Generate content hash
content_hash = generate_content_hash(event_markdown, search_space_id)
- # Duplicate check via simple query using helper in base
- from .base import check_duplicate_document_by_hash
-
- existing_document_by_hash = await check_duplicate_document_by_hash(
- session, content_hash
+ # Check if document with this unique identifier already exists
+ existing_document = await check_document_by_unique_identifier(
+ session, unique_identifier_hash
)
- if existing_document_by_hash:
- logger.info(
- f"Document with content hash {content_hash} already exists for event {event_name}. Skipping processing."
- )
- documents_skipped += 1
- continue
+ if existing_document:
+ # Document exists - check if content has changed
+ if existing_document.content_hash == content_hash:
+ logger.info(
+ f"Document for Luma event {event_name} unchanged. Skipping."
+ )
+ documents_skipped += 1
+ continue
+ else:
+ # Content has changed - update the existing document
+ logger.info(
+ f"Content changed for Luma event {event_name}. Updating document."
+ )
+
+ # Generate summary with metadata
+ user_llm = await get_user_long_context_llm(
+ session, user_id, search_space_id
+ )
+
+ if user_llm:
+ document_metadata = {
+ "event_id": event_id,
+ "event_name": event_name,
+ "event_url": event_url,
+ "start_at": start_at,
+ "end_at": end_at,
+ "timezone": timezone,
+ "location": location or "No location",
+ "city": city,
+ "hosts": host_names,
+ "document_type": "Luma Event",
+ "connector_type": "Luma",
+ }
+ (
+ summary_content,
+ summary_embedding,
+ ) = await generate_document_summary(
+ event_markdown, user_llm, document_metadata
+ )
+ else:
+ summary_content = f"Luma Event: {event_name}\n\n"
+ if event_url:
+ summary_content += f"URL: {event_url}\n"
+ summary_content += f"Start: {start_at}\n"
+ summary_content += f"End: {end_at}\n"
+ if timezone:
+ summary_content += f"Timezone: {timezone}\n"
+ if location:
+ summary_content += f"Location: {location}\n"
+ if city:
+ summary_content += f"City: {city}\n"
+ if host_names:
+ summary_content += f"Hosts: {host_names}\n"
+ if description:
+ desc_preview = description[:1000]
+ if len(description) > 1000:
+ desc_preview += "..."
+ summary_content += f"Description: {desc_preview}\n"
+ summary_embedding = config.embedding_model_instance.embed(
+ summary_content
+ )
+
+ # Process chunks
+ chunks = await create_document_chunks(event_markdown)
+
+ # Update existing document
+ existing_document.title = f"Luma Event - {event_name}"
+ existing_document.content = summary_content
+ existing_document.content_hash = content_hash
+ existing_document.embedding = summary_embedding
+ existing_document.document_metadata = {
+ "event_id": event_id,
+ "event_name": event_name,
+ "event_url": event_url,
+ "start_at": start_at,
+ "end_at": end_at,
+ "timezone": timezone,
+ "location": location,
+ "city": city,
+ "hosts": host_names,
+ "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+ }
+ existing_document.chunks = chunks
+
+ documents_indexed += 1
+ logger.info(f"Successfully updated Luma event {event_name}")
+ continue
+
+ # Document doesn't exist - create new one
# Generate summary with metadata
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
@@ -310,8 +399,8 @@ async def index_luma_events(
if host_names:
summary_content += f"Hosts: {host_names}\n"
if description:
- desc_preview = description[:300]
- if len(description) > 300:
+ desc_preview = description[:1000]
+ if len(description) > 1000:
desc_preview += "..."
summary_content += f"Description: {desc_preview}\n"
@@ -340,6 +429,7 @@ async def index_luma_events(
},
content=summary_content,
content_hash=content_hash,
+ unique_identifier_hash=unique_identifier_hash,
embedding=summary_embedding,
chunks=chunks,
)
diff --git a/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py b/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py
index b290f86da..699d2fddf 100644
--- a/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py
@@ -15,11 +15,12 @@ from app.utils.document_converters import (
create_document_chunks,
generate_content_hash,
generate_document_summary,
+ generate_unique_identifier_hash,
)
from .base import (
build_document_metadata_string,
- check_duplicate_document_by_hash,
+ check_document_by_unique_identifier,
get_connector_by_id,
logger,
update_connector_last_indexed,
@@ -282,22 +283,82 @@ async def index_notion_pages(
combined_document_string = build_document_metadata_string(
metadata_sections
)
+
+ # Generate unique identifier hash for this Notion page
+ unique_identifier_hash = generate_unique_identifier_hash(
+ DocumentType.NOTION_CONNECTOR, page_id, search_space_id
+ )
+
+ # Generate content hash
content_hash = generate_content_hash(
combined_document_string, search_space_id
)
- # Check if document with this content hash already exists
- existing_document_by_hash = await check_duplicate_document_by_hash(
- session, content_hash
+ # Check if document with this unique identifier already exists
+ existing_document = await check_document_by_unique_identifier(
+ session, unique_identifier_hash
)
- if existing_document_by_hash:
- logger.info(
- f"Document with content hash {content_hash} already exists for page {page_title}. Skipping processing."
- )
- documents_skipped += 1
- continue
+ if existing_document:
+ # Document exists - check if content has changed
+ if existing_document.content_hash == content_hash:
+ logger.info(
+ f"Document for Notion page {page_title} unchanged. Skipping."
+ )
+ documents_skipped += 1
+ continue
+ else:
+ # Content has changed - update the existing document
+ logger.info(
+ f"Content changed for Notion page {page_title}. Updating document."
+ )
+ # Get user's long context LLM
+ user_llm = await get_user_long_context_llm(
+ session, user_id, search_space_id
+ )
+ if not user_llm:
+ logger.error(
+ f"No long context LLM configured for user {user_id}"
+ )
+ skipped_pages.append(f"{page_title} (no LLM configured)")
+ documents_skipped += 1
+ continue
+
+ # Generate summary with metadata
+ document_metadata = {
+ "page_title": page_title,
+ "page_id": page_id,
+ "document_type": "Notion Page",
+ "connector_type": "Notion",
+ }
+ (
+ summary_content,
+ summary_embedding,
+ ) = await generate_document_summary(
+ markdown_content, user_llm, document_metadata
+ )
+
+ # Process chunks
+ chunks = await create_document_chunks(markdown_content)
+
+ # Update existing document
+ existing_document.title = f"Notion - {page_title}"
+ existing_document.content = summary_content
+ existing_document.content_hash = content_hash
+ existing_document.embedding = summary_embedding
+ existing_document.document_metadata = {
+ "page_title": page_title,
+ "page_id": page_id,
+ "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+ }
+ existing_document.chunks = chunks
+
+ documents_indexed += 1
+ logger.info(f"Successfully updated Notion page: {page_title}")
+ continue
+
+ # Document doesn't exist - create new one
# Get user's long context LLM
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
@@ -336,6 +397,7 @@ async def index_notion_pages(
},
content=summary_content,
content_hash=content_hash,
+ unique_identifier_hash=unique_identifier_hash,
embedding=summary_embedding,
chunks=chunks,
)
diff --git a/surfsense_backend/app/tasks/connector_indexers/slack_indexer.py b/surfsense_backend/app/tasks/connector_indexers/slack_indexer.py
index fb6dac9c5..dd9edcc8d 100644
--- a/surfsense_backend/app/tasks/connector_indexers/slack_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/slack_indexer.py
@@ -15,12 +15,13 @@ from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
create_document_chunks,
generate_content_hash,
+ generate_unique_identifier_hash,
)
from .base import (
build_document_metadata_markdown,
calculate_date_range,
- check_duplicate_document_by_hash,
+ check_document_by_unique_identifier,
get_connector_by_id,
logger,
update_connector_last_indexed,
@@ -261,22 +263,68 @@ async def index_slack_messages(
combined_document_string = build_document_metadata_markdown(
metadata_sections
)
+
+ # Generate unique identifier hash for this Slack channel
+ unique_identifier_hash = generate_unique_identifier_hash(
+ DocumentType.SLACK_CONNECTOR, channel_id, search_space_id
+ )
+
+ # Generate content hash
content_hash = generate_content_hash(
combined_document_string, search_space_id
)
- # Check if document with this content hash already exists
- existing_document_by_hash = await check_duplicate_document_by_hash(
- session, content_hash
+ # Check if document with this unique identifier already exists
+ existing_document = await check_document_by_unique_identifier(
+ session, unique_identifier_hash
)
- if existing_document_by_hash:
- logger.info(
- f"Document with content hash {content_hash} already exists for channel {channel_name}. Skipping processing."
- )
- documents_skipped += 1
- continue
+ if existing_document:
+ # Document exists - check if content has changed
+ if existing_document.content_hash == content_hash:
+ logger.info(
+ f"Document for Slack message {msg_ts} in channel {channel_name} unchanged. Skipping."
+ )
+ documents_skipped += 1
+ continue
+ else:
+ # Content has changed - update the existing document
+ logger.info(
+ f"Content changed for Slack message {msg_ts} in channel {channel_name}. Updating document."
+ )
+ # Update chunks and embedding
+ chunks = await create_document_chunks(
+ combined_document_string
+ )
+ doc_embedding = config.embedding_model_instance.embed(
+ combined_document_string
+ )
+
+ # Update existing document
+ existing_document.content = combined_document_string
+ existing_document.content_hash = content_hash
+ existing_document.embedding = doc_embedding
+ existing_document.document_metadata = {
+ "channel_name": channel_name,
+ "channel_id": channel_id,
+ "start_date": start_date_str,
+ "end_date": end_date_str,
+ "message_count": len(formatted_messages),
+ "indexed_at": datetime.now().strftime(
+ "%Y-%m-%d %H:%M:%S"
+ ),
+ }
+
+ # Delete old chunks and add new ones
+ existing_document.chunks = chunks
+
+ documents_indexed += 1
+ logger.info(f"Successfully updated Slack message {msg_ts}")
+ continue
+
+ # Document doesn't exist - create new one
# Process chunks
chunks = await create_document_chunks(combined_document_string)
doc_embedding = config.embedding_model_instance.embed(
@@ -300,6 +348,7 @@ async def index_slack_messages(
embedding=doc_embedding,
chunks=chunks,
content_hash=content_hash,
+ unique_identifier_hash=unique_identifier_hash,
)
session.add(document)
diff --git a/surfsense_backend/app/tasks/document_processors/base.py b/surfsense_backend/app/tasks/document_processors/base.py
index d5b1722fb..b3c08fec3 100644
--- a/surfsense_backend/app/tasks/document_processors/base.py
+++ b/surfsense_backend/app/tasks/document_processors/base.py
@@ -29,3 +29,27 @@ async def check_duplicate_document(
select(Document).where(Document.content_hash == content_hash)
)
return existing_doc_result.scalars().first()
+
+
+async def check_document_by_unique_identifier(
+ session: AsyncSession, unique_identifier_hash: str
+) -> Document | None:
+ """
+ Check if a document with the given unique identifier hash already exists.
+ Eagerly loads chunks to avoid lazy loading issues during updates.
+
+ Args:
+ session: Database session
+ unique_identifier_hash: Hash of the unique identifier from the source
+
+ Returns:
+ Existing document if found, None otherwise
+ """
+ from sqlalchemy.orm import selectinload
+
+ existing_doc_result = await session.execute(
+ select(Document)
+ .options(selectinload(Document.chunks))
+ .where(Document.unique_identifier_hash == unique_identifier_hash)
+ )
+ return existing_doc_result.scalars().first()
diff --git a/surfsense_backend/app/tasks/document_processors/extension_processor.py b/surfsense_backend/app/tasks/document_processors/extension_processor.py
index ed25b8fbd..663093375 100644
--- a/surfsense_backend/app/tasks/document_processors/extension_processor.py
+++ b/surfsense_backend/app/tasks/document_processors/extension_processor.py
@@ -15,10 +15,11 @@ from app.utils.document_converters import (
create_document_chunks,
generate_content_hash,
generate_document_summary,
+ generate_unique_identifier_hash,
)
from .base import (
- check_duplicate_document,
+ check_document_by_unique_identifier,
)
@@ -85,25 +86,42 @@ async def add_extension_received_document(
document_parts.append("")
combined_document_string = "\n".join(document_parts)
+
+ # Generate unique identifier hash for this extension document (using URL)
+ unique_identifier_hash = generate_unique_identifier_hash(
+ DocumentType.EXTENSION, content.metadata.VisitedWebPageURL, search_space_id
+ )
+
+ # Generate content hash
content_hash = generate_content_hash(combined_document_string, search_space_id)
- # Check if document with this content hash already exists
- existing_document = await check_duplicate_document(session, content_hash)
- if existing_document:
- await task_logger.log_task_success(
- log_entry,
- f"Extension document already exists: {content.metadata.VisitedWebPageTitle}",
- {
- "duplicate_detected": True,
- "existing_document_id": existing_document.id,
- },
- )
- logging.info(
- f"Document with content hash {content_hash} already exists. Skipping processing."
- )
- return existing_document
+ # Check if document with this unique identifier already exists
+ existing_document = await check_document_by_unique_identifier(
+ session, unique_identifier_hash
+ )
- # Get user's long context LLM
+ if existing_document:
+ # Document exists - check if content has changed
+ if existing_document.content_hash == content_hash:
+ await task_logger.log_task_success(
+ log_entry,
+ f"Extension document unchanged: {content.metadata.VisitedWebPageTitle}",
+ {
+ "duplicate_detected": True,
+ "existing_document_id": existing_document.id,
+ },
+ )
+ logging.info(
+ f"Document for URL {content.metadata.VisitedWebPageURL} unchanged. Skipping."
+ )
+ return existing_document
+ else:
+ # Content has changed - update the existing document
+ logging.info(
+ f"Content changed for URL {content.metadata.VisitedWebPageURL}. Updating document."
+ )
+
+ # Get user's long context LLM (needed for both create and update)
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
if not user_llm:
raise RuntimeError(
@@ -127,21 +145,36 @@ async def add_extension_received_document(
# Process chunks
chunks = await create_document_chunks(content.pageContent)
- # Create and store document
- document = Document(
- search_space_id=search_space_id,
- title=content.metadata.VisitedWebPageTitle,
- document_type=DocumentType.EXTENSION,
- document_metadata=content.metadata.model_dump(),
- content=summary_content,
- embedding=summary_embedding,
- chunks=chunks,
- content_hash=content_hash,
- )
+ # Update or create document
+ if existing_document:
+ # Update existing document
+ existing_document.title = content.metadata.VisitedWebPageTitle
+ existing_document.content = summary_content
+ existing_document.content_hash = content_hash
+ existing_document.embedding = summary_embedding
+ existing_document.document_metadata = content.metadata.model_dump()
+ existing_document.chunks = chunks
- session.add(document)
- await session.commit()
- await session.refresh(document)
+ await session.commit()
+ await session.refresh(existing_document)
+ document = existing_document
+ else:
+ # Create new document
+ document = Document(
+ search_space_id=search_space_id,
+ title=content.metadata.VisitedWebPageTitle,
+ document_type=DocumentType.EXTENSION,
+ document_metadata=content.metadata.model_dump(),
+ content=summary_content,
+ embedding=summary_embedding,
+ chunks=chunks,
+ content_hash=content_hash,
+ unique_identifier_hash=unique_identifier_hash,
+ )
+
+ session.add(document)
+ await session.commit()
+ await session.refresh(document)
# Log success
await task_logger.log_task_success(
diff --git a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py
index 573b2c28c..f509e700b 100644
--- a/surfsense_backend/app/tasks/document_processors/file_processors.py
+++ b/surfsense_backend/app/tasks/document_processors/file_processors.py
@@ -15,10 +15,11 @@ from app.utils.document_converters import (
create_document_chunks,
generate_content_hash,
generate_document_summary,
+ generate_unique_identifier_hash,
)
from .base import (
- check_duplicate_document,
+ check_document_by_unique_identifier,
)
@@ -47,19 +48,31 @@ async def add_received_file_document_using_unstructured(
unstructured_processed_elements
)
+ # Generate unique identifier hash for this file
+ unique_identifier_hash = generate_unique_identifier_hash(
+ DocumentType.FILE, file_name, search_space_id
+ )
+
+ # Generate content hash
content_hash = generate_content_hash(file_in_markdown, search_space_id)
- # Check if document with this content hash already exists
- existing_document = await check_duplicate_document(session, content_hash)
+ # Check if document with this unique identifier already exists
+ existing_document = await check_document_by_unique_identifier(
+ session, unique_identifier_hash
+ )
+
if existing_document:
- logging.info(
- f"Document with content hash {content_hash} already exists. Skipping processing."
- )
- return existing_document
+ # Document exists - check if content has changed
+ if existing_document.content_hash == content_hash:
+ logging.info(f"Document for file {file_name} unchanged. Skipping.")
+ return existing_document
+ else:
+ # Content has changed - update the existing document
+ logging.info(
+ f"Content changed for file {file_name}. Updating document."
+ )
- # TODO: Check if file_markdown exceeds token limit of embedding model
-
- # Get user's long context LLM
+ # Get user's long context LLM (needed for both create and update)
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
if not user_llm:
raise RuntimeError(
@@ -79,24 +92,42 @@ async def add_received_file_document_using_unstructured(
# Process chunks
chunks = await create_document_chunks(file_in_markdown)
- # Create and store document
- document = Document(
- search_space_id=search_space_id,
- title=file_name,
- document_type=DocumentType.FILE,
- document_metadata={
+ # Update or create document
+ if existing_document:
+ # Update existing document
+ existing_document.title = file_name
+ existing_document.content = summary_content
+ existing_document.content_hash = content_hash
+ existing_document.embedding = summary_embedding
+ existing_document.document_metadata = {
"FILE_NAME": file_name,
"ETL_SERVICE": "UNSTRUCTURED",
- },
- content=summary_content,
- embedding=summary_embedding,
- chunks=chunks,
- content_hash=content_hash,
- )
+ }
+ existing_document.chunks = chunks
- session.add(document)
- await session.commit()
- await session.refresh(document)
+ await session.commit()
+ await session.refresh(existing_document)
+ document = existing_document
+ else:
+ # Create new document
+ document = Document(
+ search_space_id=search_space_id,
+ title=file_name,
+ document_type=DocumentType.FILE,
+ document_metadata={
+ "FILE_NAME": file_name,
+ "ETL_SERVICE": "UNSTRUCTURED",
+ },
+ content=summary_content,
+ embedding=summary_embedding,
+ chunks=chunks,
+ content_hash=content_hash,
+ unique_identifier_hash=unique_identifier_hash,
+ )
+
+ session.add(document)
+ await session.commit()
+ await session.refresh(document)
return document
except SQLAlchemyError as db_error:
@@ -131,17 +162,31 @@ async def add_received_file_document_using_llamacloud(
# Combine all markdown documents into one
file_in_markdown = llamacloud_markdown_document
+ # Generate unique identifier hash for this file
+ unique_identifier_hash = generate_unique_identifier_hash(
+ DocumentType.FILE, file_name, search_space_id
+ )
+
+ # Generate content hash
content_hash = generate_content_hash(file_in_markdown, search_space_id)
- # Check if document with this content hash already exists
- existing_document = await check_duplicate_document(session, content_hash)
- if existing_document:
- logging.info(
- f"Document with content hash {content_hash} already exists. Skipping processing."
- )
- return existing_document
+ # Check if document with this unique identifier already exists
+ existing_document = await check_document_by_unique_identifier(
+ session, unique_identifier_hash
+ )
- # Get user's long context LLM
+ if existing_document:
+ # Document exists - check if content has changed
+ if existing_document.content_hash == content_hash:
+ logging.info(f"Document for file {file_name} unchanged. Skipping.")
+ return existing_document
+ else:
+ # Content has changed - update the existing document
+ logging.info(
+ f"Content changed for file {file_name}. Updating document."
+ )
+
+ # Get user's long context LLM (needed for both create and update)
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
if not user_llm:
raise RuntimeError(
@@ -161,24 +206,42 @@ async def add_received_file_document_using_llamacloud(
# Process chunks
chunks = await create_document_chunks(file_in_markdown)
- # Create and store document
- document = Document(
- search_space_id=search_space_id,
- title=file_name,
- document_type=DocumentType.FILE,
- document_metadata={
+ # Update or create document
+ if existing_document:
+ # Update existing document
+ existing_document.title = file_name
+ existing_document.content = summary_content
+ existing_document.content_hash = content_hash
+ existing_document.embedding = summary_embedding
+ existing_document.document_metadata = {
"FILE_NAME": file_name,
"ETL_SERVICE": "LLAMACLOUD",
- },
- content=summary_content,
- embedding=summary_embedding,
- chunks=chunks,
- content_hash=content_hash,
- )
+ }
+ existing_document.chunks = chunks
- session.add(document)
- await session.commit()
- await session.refresh(document)
+ await session.commit()
+ await session.refresh(existing_document)
+ document = existing_document
+ else:
+ # Create new document
+ document = Document(
+ search_space_id=search_space_id,
+ title=file_name,
+ document_type=DocumentType.FILE,
+ document_metadata={
+ "FILE_NAME": file_name,
+ "ETL_SERVICE": "LLAMACLOUD",
+ },
+ content=summary_content,
+ embedding=summary_embedding,
+ chunks=chunks,
+ content_hash=content_hash,
+ unique_identifier_hash=unique_identifier_hash,
+ )
+
+ session.add(document)
+ await session.commit()
+ await session.refresh(document)
return document
except SQLAlchemyError as db_error:
@@ -214,17 +277,31 @@ async def add_received_file_document_using_docling(
try:
file_in_markdown = docling_markdown_document
+ # Generate unique identifier hash for this file
+ unique_identifier_hash = generate_unique_identifier_hash(
+ DocumentType.FILE, file_name, search_space_id
+ )
+
+ # Generate content hash
content_hash = generate_content_hash(file_in_markdown, search_space_id)
- # Check if document with this content hash already exists
- existing_document = await check_duplicate_document(session, content_hash)
- if existing_document:
- logging.info(
- f"Document with content hash {content_hash} already exists. Skipping processing."
- )
- return existing_document
+ # Check if document with this unique identifier already exists
+ existing_document = await check_document_by_unique_identifier(
+ session, unique_identifier_hash
+ )
- # Get user's long context LLM
+ if existing_document:
+ # Document exists - check if content has changed
+ if existing_document.content_hash == content_hash:
+ logging.info(f"Document for file {file_name} unchanged. Skipping.")
+ return existing_document
+ else:
+ # Content has changed - update the existing document
+ logging.info(
+ f"Content changed for file {file_name}. Updating document."
+ )
+
+ # Get user's long context LLM (needed for both create and update)
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
if not user_llm:
raise RuntimeError(
@@ -268,20 +345,38 @@ async def add_received_file_document_using_docling(
# Process chunks
chunks = await create_document_chunks(file_in_markdown)
- # Create and store document
- document = Document(
- search_space_id=search_space_id,
- title=file_name,
- document_type=DocumentType.FILE,
- document_metadata={
+ # Update or create document
+ if existing_document:
+ # Update existing document
+ existing_document.title = file_name
+ existing_document.content = enhanced_summary_content
+ existing_document.content_hash = content_hash
+ existing_document.embedding = summary_embedding
+ existing_document.document_metadata = {
"FILE_NAME": file_name,
"ETL_SERVICE": "DOCLING",
- },
- content=enhanced_summary_content,
- embedding=summary_embedding,
- chunks=chunks,
- content_hash=content_hash,
- )
+ }
+ existing_document.chunks = chunks
+
+ await session.commit()
+ await session.refresh(existing_document)
+ document = existing_document
+ else:
+ # Create new document
+ document = Document(
+ search_space_id=search_space_id,
+ title=file_name,
+ document_type=DocumentType.FILE,
+ document_metadata={
+ "FILE_NAME": file_name,
+ "ETL_SERVICE": "DOCLING",
+ },
+ content=enhanced_summary_content,
+ embedding=summary_embedding,
+ chunks=chunks,
+ content_hash=content_hash,
+ unique_identifier_hash=unique_identifier_hash,
+ )
session.add(document)
await session.commit()
diff --git a/surfsense_backend/app/tasks/document_processors/markdown_processor.py b/surfsense_backend/app/tasks/document_processors/markdown_processor.py
index fa3c79d81..76215ed51 100644
--- a/surfsense_backend/app/tasks/document_processors/markdown_processor.py
+++ b/surfsense_backend/app/tasks/document_processors/markdown_processor.py
@@ -14,10 +14,11 @@ from app.utils.document_converters import (
create_document_chunks,
generate_content_hash,
generate_document_summary,
+ generate_unique_identifier_hash,
)
from .base import (
- check_duplicate_document,
+ check_document_by_unique_identifier,
)
@@ -56,25 +57,41 @@ async def add_received_markdown_file_document(
)
try:
+ # Generate unique identifier hash for this markdown file
+ unique_identifier_hash = generate_unique_identifier_hash(
+ DocumentType.FILE, file_name, search_space_id
+ )
+
+ # Generate content hash
content_hash = generate_content_hash(file_in_markdown, search_space_id)
- # Check if document with this content hash already exists
- existing_document = await check_duplicate_document(session, content_hash)
- if existing_document:
- await task_logger.log_task_success(
- log_entry,
- f"Markdown file document already exists: {file_name}",
- {
- "duplicate_detected": True,
- "existing_document_id": existing_document.id,
- },
- )
- logging.info(
- f"Document with content hash {content_hash} already exists. Skipping processing."
- )
- return existing_document
+ # Check if document with this unique identifier already exists
+ existing_document = await check_document_by_unique_identifier(
+ session, unique_identifier_hash
+ )
- # Get user's long context LLM
+ if existing_document:
+ # Document exists - check if content has changed
+ if existing_document.content_hash == content_hash:
+ await task_logger.log_task_success(
+ log_entry,
+ f"Markdown file document unchanged: {file_name}",
+ {
+ "duplicate_detected": True,
+ "existing_document_id": existing_document.id,
+ },
+ )
+ logging.info(
+ f"Document for markdown file {file_name} unchanged. Skipping."
+ )
+ return existing_document
+ else:
+ # Content has changed - update the existing document
+ logging.info(
+ f"Content changed for markdown file {file_name}. Updating document."
+ )
+
+ # Get user's long context LLM (needed for both create and update)
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
if not user_llm:
raise RuntimeError(
@@ -93,23 +110,40 @@ async def add_received_markdown_file_document(
# Process chunks
chunks = await create_document_chunks(file_in_markdown)
- # Create and store document
- document = Document(
- search_space_id=search_space_id,
- title=file_name,
- document_type=DocumentType.FILE,
- document_metadata={
+ # Update or create document
+ if existing_document:
+ # Update existing document
+ existing_document.title = file_name
+ existing_document.content = summary_content
+ existing_document.content_hash = content_hash
+ existing_document.embedding = summary_embedding
+ existing_document.document_metadata = {
"FILE_NAME": file_name,
- },
- content=summary_content,
- embedding=summary_embedding,
- chunks=chunks,
- content_hash=content_hash,
- )
+ }
+ existing_document.chunks = chunks
- session.add(document)
- await session.commit()
- await session.refresh(document)
+ await session.commit()
+ await session.refresh(existing_document)
+ document = existing_document
+ else:
+ # Create new document
+ document = Document(
+ search_space_id=search_space_id,
+ title=file_name,
+ document_type=DocumentType.FILE,
+ document_metadata={
+ "FILE_NAME": file_name,
+ },
+ content=summary_content,
+ embedding=summary_embedding,
+ chunks=chunks,
+ content_hash=content_hash,
+ unique_identifier_hash=unique_identifier_hash,
+ )
+
+ session.add(document)
+ await session.commit()
+ await session.refresh(document)
# Log success
await task_logger.log_task_success(
diff --git a/surfsense_backend/app/tasks/document_processors/url_crawler.py b/surfsense_backend/app/tasks/document_processors/url_crawler.py
index 682086112..8e2863198 100644
--- a/surfsense_backend/app/tasks/document_processors/url_crawler.py
+++ b/surfsense_backend/app/tasks/document_processors/url_crawler.py
@@ -17,10 +17,11 @@ from app.utils.document_converters import (
create_document_chunks,
generate_content_hash,
generate_document_summary,
+ generate_unique_identifier_hash,
)
from .base import (
- check_duplicate_document,
+ check_document_by_unique_identifier,
md,
)
@@ -129,31 +130,49 @@ async def add_crawled_url_document(
document_parts.append("")
combined_document_string = "\n".join(document_parts)
- content_hash = generate_content_hash(combined_document_string, search_space_id)
- # Check for duplicates
- await task_logger.log_task_progress(
- log_entry,
- f"Checking for duplicate content: {url}",
- {"stage": "duplicate_check", "content_hash": content_hash},
+ # Generate unique identifier hash for this URL
+ unique_identifier_hash = generate_unique_identifier_hash(
+ DocumentType.CRAWLED_URL, url, search_space_id
)
- existing_document = await check_duplicate_document(session, content_hash)
- if existing_document:
- await task_logger.log_task_success(
- log_entry,
- f"Document already exists for URL: {url}",
- {
- "duplicate_detected": True,
- "existing_document_id": existing_document.id,
- },
- )
- logging.info(
- f"Document with content hash {content_hash} already exists. Skipping processing."
- )
- return existing_document
+ # Generate content hash
+ content_hash = generate_content_hash(combined_document_string, search_space_id)
- # Get LLM for summary generation
+ # Check if document with this unique identifier already exists
+ await task_logger.log_task_progress(
+ log_entry,
+ f"Checking for existing URL: {url}",
+ {"stage": "duplicate_check", "url": url},
+ )
+
+ existing_document = await check_document_by_unique_identifier(
+ session, unique_identifier_hash
+ )
+
+ if existing_document:
+ # Document exists - check if content has changed
+ if existing_document.content_hash == content_hash:
+ await task_logger.log_task_success(
+ log_entry,
+ f"URL document unchanged: {url}",
+ {
+ "duplicate_detected": True,
+ "existing_document_id": existing_document.id,
+ },
+ )
+ logging.info(f"Document for URL {url} unchanged. Skipping.")
+ return existing_document
+ else:
+ # Content has changed - update the existing document
+ logging.info(f"Content changed for URL {url}. Updating document.")
+ await task_logger.log_task_progress(
+ log_entry,
+ f"Updating URL document: {url}",
+ {"stage": "document_update", "url": url},
+ )
+
+ # Get LLM for summary generation (needed for both create and update)
await task_logger.log_task_progress(
log_entry,
f"Preparing for summary generation: {url}",
@@ -194,27 +213,50 @@ async def add_crawled_url_document(
chunks = await create_document_chunks(content_in_markdown)
- # Create and store document
- await task_logger.log_task_progress(
- log_entry,
- f"Creating document in database for URL: {url}",
- {"stage": "document_creation", "chunks_count": len(chunks)},
- )
+ # Update or create document
+ if existing_document:
+ # Update existing document
+ await task_logger.log_task_progress(
+ log_entry,
+ f"Updating document in database for URL: {url}",
+ {"stage": "document_update", "chunks_count": len(chunks)},
+ )
- document = Document(
- search_space_id=search_space_id,
- title=url_crawled[0].metadata["title"]
- if isinstance(crawl_loader, FireCrawlLoader)
- else url_crawled[0].metadata["source"],
- document_type=DocumentType.CRAWLED_URL,
- document_metadata=url_crawled[0].metadata,
- content=summary_content,
- embedding=summary_embedding,
- chunks=chunks,
- content_hash=content_hash,
- )
+ existing_document.title = (
+ url_crawled[0].metadata["title"]
+ if isinstance(crawl_loader, FireCrawlLoader)
+ else url_crawled[0].metadata["source"]
+ )
+ existing_document.content = summary_content
+ existing_document.content_hash = content_hash
+ existing_document.embedding = summary_embedding
+ existing_document.document_metadata = url_crawled[0].metadata
+ existing_document.chunks = chunks
- session.add(document)
+ document = existing_document
+ else:
+ # Create new document
+ await task_logger.log_task_progress(
+ log_entry,
+ f"Creating document in database for URL: {url}",
+ {"stage": "document_creation", "chunks_count": len(chunks)},
+ )
+
+ document = Document(
+ search_space_id=search_space_id,
+ title=url_crawled[0].metadata["title"]
+ if isinstance(crawl_loader, FireCrawlLoader)
+ else url_crawled[0].metadata["source"],
+ document_type=DocumentType.CRAWLED_URL,
+ document_metadata=url_crawled[0].metadata,
+ content=summary_content,
+ embedding=summary_embedding,
+ chunks=chunks,
+ content_hash=content_hash,
+ unique_identifier_hash=unique_identifier_hash,
+ )
+
+ session.add(document)
await session.commit()
await session.refresh(document)
diff --git a/surfsense_backend/app/tasks/document_processors/youtube_processor.py b/surfsense_backend/app/tasks/document_processors/youtube_processor.py
index a28a7f186..c7d396974 100644
--- a/surfsense_backend/app/tasks/document_processors/youtube_processor.py
+++ b/surfsense_backend/app/tasks/document_processors/youtube_processor.py
@@ -17,10 +17,11 @@ from app.utils.document_converters import (
create_document_chunks,
generate_content_hash,
generate_document_summary,
+ generate_unique_identifier_hash,
)
from .base import (
- check_duplicate_document,
+ check_document_by_unique_identifier,
)
@@ -201,32 +202,54 @@ async def add_youtube_video_document(
document_parts.append("")
combined_document_string = "\n".join(document_parts)
- content_hash = generate_content_hash(combined_document_string, search_space_id)
- # Check for duplicates
- await task_logger.log_task_progress(
- log_entry,
- f"Checking for duplicate video content: {video_id}",
- {"stage": "duplicate_check", "content_hash": content_hash},
+ # Generate unique identifier hash for this YouTube video
+ unique_identifier_hash = generate_unique_identifier_hash(
+ DocumentType.YOUTUBE_VIDEO, video_id, search_space_id
)
- existing_document = await check_duplicate_document(session, content_hash)
- if existing_document:
- await task_logger.log_task_success(
- log_entry,
- f"YouTube video document already exists: {video_data.get('title', 'YouTube Video')}",
- {
- "duplicate_detected": True,
- "existing_document_id": existing_document.id,
- "video_id": video_id,
- },
- )
- logging.info(
- f"Document with content hash {content_hash} already exists. Skipping processing."
- )
- return existing_document
+ # Generate content hash
+ content_hash = generate_content_hash(combined_document_string, search_space_id)
- # Get LLM for summary generation
+ # Check if document with this unique identifier already exists
+ await task_logger.log_task_progress(
+ log_entry,
+ f"Checking for existing video: {video_id}",
+ {"stage": "duplicate_check", "video_id": video_id},
+ )
+
+ existing_document = await check_document_by_unique_identifier(
+ session, unique_identifier_hash
+ )
+
+ if existing_document:
+ # Document exists - check if content has changed
+ if existing_document.content_hash == content_hash:
+ await task_logger.log_task_success(
+ log_entry,
+ f"YouTube video document unchanged: {video_data.get('title', 'YouTube Video')}",
+ {
+ "duplicate_detected": True,
+ "existing_document_id": existing_document.id,
+ "video_id": video_id,
+ },
+ )
+ logging.info(
+ f"Document for YouTube video {video_id} unchanged. Skipping."
+ )
+ return existing_document
+ else:
+ # Content has changed - update the existing document
+ logging.info(
+ f"Content changed for YouTube video {video_id}. Updating document."
+ )
+ await task_logger.log_task_progress(
+ log_entry,
+ f"Updating YouTube video document: {video_data.get('title', 'YouTube Video')}",
+ {"stage": "document_update", "video_id": video_id},
+ )
+
+ # Get LLM for summary generation (needed for both create and update)
await task_logger.log_task_progress(
log_entry,
f"Preparing for summary generation: {video_data.get('title', 'YouTube Video')}",
@@ -270,33 +293,60 @@ async def add_youtube_video_document(
chunks = await create_document_chunks(combined_document_string)
- # Create document
- await task_logger.log_task_progress(
- log_entry,
- f"Creating YouTube video document in database: {video_data.get('title', 'YouTube Video')}",
- {"stage": "document_creation", "chunks_count": len(chunks)},
- )
+ # Update or create document
+ if existing_document:
+ # Update existing document
+ await task_logger.log_task_progress(
+ log_entry,
+ f"Updating YouTube video document in database: {video_data.get('title', 'YouTube Video')}",
+ {"stage": "document_update", "chunks_count": len(chunks)},
+ )
- document = Document(
- title=video_data.get("title", "YouTube Video"),
- document_type=DocumentType.YOUTUBE_VIDEO,
- document_metadata={
+ existing_document.title = video_data.get("title", "YouTube Video")
+ existing_document.content = summary_content
+ existing_document.content_hash = content_hash
+ existing_document.embedding = summary_embedding
+ existing_document.document_metadata = {
"url": url,
"video_id": video_id,
"video_title": video_data.get("title", "YouTube Video"),
"author": video_data.get("author_name", "Unknown"),
"thumbnail": video_data.get("thumbnail_url", ""),
- },
- content=summary_content,
- embedding=summary_embedding,
- chunks=chunks,
- search_space_id=search_space_id,
- content_hash=content_hash,
- )
+ }
+ existing_document.chunks = chunks
- session.add(document)
- await session.commit()
- await session.refresh(document)
+ await session.commit()
+ await session.refresh(existing_document)
+ document = existing_document
+ else:
+ # Create new document
+ await task_logger.log_task_progress(
+ log_entry,
+ f"Creating YouTube video document in database: {video_data.get('title', 'YouTube Video')}",
+ {"stage": "document_creation", "chunks_count": len(chunks)},
+ )
+
+ document = Document(
+ title=video_data.get("title", "YouTube Video"),
+ document_type=DocumentType.YOUTUBE_VIDEO,
+ document_metadata={
+ "url": url,
+ "video_id": video_id,
+ "video_title": video_data.get("title", "YouTube Video"),
+ "author": video_data.get("author_name", "Unknown"),
+ "thumbnail": video_data.get("thumbnail_url", ""),
+ },
+ content=summary_content,
+ embedding=summary_embedding,
+ chunks=chunks,
+ search_space_id=search_space_id,
+ content_hash=content_hash,
+ unique_identifier_hash=unique_identifier_hash,
+ )
+
+ session.add(document)
+ await session.commit()
+ await session.refresh(document)
# Log success
await task_logger.log_task_success(
diff --git a/surfsense_backend/app/tasks/stream_connector_search_results.py b/surfsense_backend/app/tasks/stream_connector_search_results.py
index ead6a89e7..dd1ae4ce5 100644
--- a/surfsense_backend/app/tasks/stream_connector_search_results.py
+++ b/surfsense_backend/app/tasks/stream_connector_search_results.py
@@ -20,6 +20,7 @@ async def stream_connector_search_results(
langchain_chat_history: list[Any],
search_mode_str: str,
document_ids_to_add_in_context: list[int],
+ language: str | None = None,
) -> AsyncGenerator[str, None]:
"""
Stream connector search results to the client
@@ -66,8 +67,10 @@ async def stream_connector_search_results(
"search_mode": search_mode,
"research_mode": research_mode,
"document_ids_to_add_in_context": document_ids_to_add_in_context,
+ "language": language, # Add language to the configuration
}
}
+ # print(f"Researcher configuration: {config['configurable']}") # Debug print
# Initialize state with database session and streaming service
initial_state = State(
db_session=session,
diff --git a/surfsense_backend/app/utils/document_converters.py b/surfsense_backend/app/utils/document_converters.py
index 69ba27c7b..9883a74ed 100644
--- a/surfsense_backend/app/utils/document_converters.py
+++ b/surfsense_backend/app/utils/document_converters.py
@@ -3,7 +3,7 @@ import hashlib
from litellm import get_model_info, token_counter
from app.config import config
-from app.db import Chunk
+from app.db import Chunk, DocumentType
from app.prompts import SUMMARY_PROMPT_TEMPLATE
@@ -308,3 +308,40 @@ def generate_content_hash(content: str, search_space_id: int) -> str:
"""Generate SHA-256 hash for the given content combined with search space ID."""
combined_data = f"{search_space_id}:{content}"
return hashlib.sha256(combined_data.encode("utf-8")).hexdigest()
+
+
+def generate_unique_identifier_hash(
+ document_type: DocumentType,
+ unique_identifier: str | int | float,
+ search_space_id: int,
+) -> str:
+ """
+ Generate SHA-256 hash for a unique document identifier from connector sources.
+
+ This function creates a consistent hash based on the document type, its unique
+ identifier from the source system, and the search space ID. This helps prevent
+ duplicate documents when syncing from various connectors like Slack, Notion, Jira, etc.
+
+ Args:
+ document_type: The type of document (e.g., SLACK_CONNECTOR, NOTION_CONNECTOR)
+ unique_identifier: The unique ID from the source system (e.g., message ID, page ID)
+ search_space_id: The search space this document belongs to
+
+ Returns:
+ str: SHA-256 hash string representing the unique document identifier
+
+ Example:
+ >>> generate_unique_identifier_hash(
+ ... DocumentType.SLACK_CONNECTOR,
+ ... "1234567890.123456",
+ ... 42
+ ... )
+ 'a1b2c3d4e5f6...'
+ """
+ # Convert unique_identifier to string to handle different types
+ identifier_str = str(unique_identifier)
+
+ # Combine document type value, unique identifier, and search space ID
+ combined_data = f"{document_type.value}:{identifier_str}:{search_space_id}"
+
+ return hashlib.sha256(combined_data.encode("utf-8")).hexdigest()
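
A minimal sketch of why both hashes are kept (the values below are made up; `DocumentType.SLACK_CONNECTOR` is the enum member the docstring itself references): the identifier hash stays stable when the source item is edited, while the content hash changes, which is what routes the processors into the update path instead of creating a duplicate.

```python
# Illustrative values only.
from app.db import DocumentType
from app.utils.document_converters import (
    generate_content_hash,
    generate_unique_identifier_hash,
)

search_space_id = 42
msg_ts = "1234567890.123456"  # Slack message timestamp used as the source identifier

id_hash_before = generate_unique_identifier_hash(
    DocumentType.SLACK_CONNECTOR, msg_ts, search_space_id
)
content_hash_before = generate_content_hash("original message text", search_space_id)

# The message is edited later in Slack; its timestamp (the identifier) does not change.
id_hash_after = generate_unique_identifier_hash(
    DocumentType.SLACK_CONNECTOR, msg_ts, search_space_id
)
content_hash_after = generate_content_hash("edited message text", search_space_id)

assert id_hash_before == id_hash_after            # same Document row is found again
assert content_hash_before != content_hash_after  # content changed, so the row is updated
```
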
diff --git a/surfsense_backend/app/utils/validators.py b/surfsense_backend/app/utils/validators.py
index 437d23b55..c23ca8543 100644
--- a/surfsense_backend/app/utils/validators.py
+++ b/surfsense_backend/app/utils/validators.py
@@ -295,13 +295,8 @@ def validate_messages(messages: Any) -> list[dict]:
status_code=400, detail=f"messages[{i}].content cannot be empty"
)
- # Trim content and enforce max length (10,000 chars)
+ # Trim content
sanitized_content = content.strip()
- if len(sanitized_content) > 10000: # Reasonable limit
- raise HTTPException(
- status_code=400,
- detail=f"messages[{i}].content is too long (max 10000 characters)",
- )
validated_messages.append({"role": role, "content": sanitized_content})
@@ -412,7 +407,7 @@ def validate_connector_config(
raise ValueError(f"Invalid email format for {connector_name} connector")
def validate_url_field(key: str, connector_name: str) -> None:
- if not validators.url(config.get(key, "")):
+ if not validators.url(config.get(key, "").strip(), simple_host=True):
raise ValueError(f"Invalid base URL format for {connector_name} connector")
def validate_list_field(key: str, field_name: str) -> None:
@@ -424,6 +419,20 @@ def validate_connector_config(
connector_rules = {
"SERPER_API": {"required": ["SERPER_API_KEY"], "validators": {}},
"TAVILY_API": {"required": ["TAVILY_API_KEY"], "validators": {}},
+ "SEARXNG_API": {
+ "required": ["SEARXNG_HOST"],
+ "optional": [
+ "SEARXNG_API_KEY",
+ "SEARXNG_ENGINES",
+ "SEARXNG_CATEGORIES",
+ "SEARXNG_LANGUAGE",
+ "SEARXNG_SAFESEARCH",
+ "SEARXNG_VERIFY_SSL",
+ ],
+ "validators": {
+ "SEARXNG_HOST": lambda: validate_url_field("SEARXNG_HOST", "SearxNG")
+ },
+ },
"LINKUP_API": {"required": ["LINKUP_API_KEY"], "validators": {}},
"SLACK_CONNECTOR": {"required": ["SLACK_BOT_TOKEN"], "validators": {}},
"NOTION_CONNECTOR": {
@@ -484,10 +493,21 @@ def validate_connector_config(
if not rules:
return config # Unknown connector type, pass through
- # Validate required keys match exactly
- if set(config.keys()) != set(rules["required"]):
+ required_keys = set(rules["required"])
+ optional_keys = set(rules.get("optional", []))
+ config_keys = set(config.keys())
+
+ # Validate that no unexpected keys are present
+ if not config_keys.issubset(required_keys | optional_keys):
+ allowed_keys = list(required_keys | optional_keys)
raise ValueError(
- f"For {connector_type_str} connector type, config must only contain these keys: {rules['required']}"
+ f"For {connector_type_str} connector type, config may only contain these keys: {allowed_keys}"
+ )
+
+ # Validate that all required keys are present
+ if not required_keys.issubset(config_keys):
+ raise ValueError(
+ f"For {connector_type_str} connector type, config must include these keys: {sorted(required_keys)}"
)
# Apply custom validators first (these check format before emptiness)
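
For the SearxNG rules this yields the following acceptance behaviour (a standalone sketch of the subset/superset checks above; the config values are made up):

```python
# Standalone illustration of the key validation introduced above.
required_keys = {"SEARXNG_HOST"}
optional_keys = {
    "SEARXNG_API_KEY",
    "SEARXNG_ENGINES",
    "SEARXNG_CATEGORIES",
    "SEARXNG_LANGUAGE",
    "SEARXNG_SAFESEARCH",
    "SEARXNG_VERIFY_SSL",
}


def keys_are_valid(config: dict) -> bool:
    config_keys = set(config)
    # No unexpected keys, and every required key present.
    return config_keys.issubset(required_keys | optional_keys) and required_keys.issubset(
        config_keys
    )


ok_config = {"SEARXNG_HOST": "http://localhost:8080", "SEARXNG_VERIFY_SSL": False}
bad_config = {"SEARXNG_ENGINES": "duckduckgo"}  # missing the required SEARXNG_HOST

assert keys_are_valid(ok_config)
assert not keys_are_valid(bad_config)
```
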
diff --git a/surfsense_backend/pyproject.toml b/surfsense_backend/pyproject.toml
index aa109492d..ea23407ba 100644
--- a/surfsense_backend/pyproject.toml
+++ b/surfsense_backend/pyproject.toml
@@ -44,6 +44,7 @@ dependencies = [
"litellm>=1.77.5",
"langchain-litellm>=0.2.3",
"elasticsearch>=9.1.1",
+ "faster-whisper>=1.1.0",
]
[dependency-groups]
diff --git a/surfsense_backend/uv.lock b/surfsense_backend/uv.lock
index d1a2ee154..01a527b6d 100644
--- a/surfsense_backend/uv.lock
+++ b/surfsense_backend/uv.lock
@@ -257,38 +257,38 @@ version = "0.2.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/dd/3b/69ff8a885e4c1c42014c2765275c4bd91fe7bc9847e9d8543dbcbb09f820/audioop_lts-0.2.1.tar.gz", hash = "sha256:e81268da0baa880431b68b1308ab7257eb33f356e57a5f9b1f915dfb13dd1387", size = 30204, upload-time = "2024-08-04T21:14:43.957Z" }
wheels = [
- { url = "https://files.pythonhosted.org/packages/01/91/a219253cc6e92db2ebeaf5cf8197f71d995df6f6b16091d1f3ce62cb169d/audioop_lts-0.2.1-cp313-abi3-macosx_10_13_universal2.whl", hash = "sha256:fd1345ae99e17e6910f47ce7d52673c6a1a70820d78b67de1b7abb3af29c426a", size = 46252, upload-time = "2024-08-04T21:13:56.209Z" },
- { url = "https://files.pythonhosted.org/packages/ec/f6/3cb21e0accd9e112d27cee3b1477cd04dafe88675c54ad8b0d56226c1e0b/audioop_lts-0.2.1-cp313-abi3-macosx_10_13_x86_64.whl", hash = "sha256:e175350da05d2087e12cea8e72a70a1a8b14a17e92ed2022952a4419689ede5e", size = 27183, upload-time = "2024-08-04T21:13:59.966Z" },
- { url = "https://files.pythonhosted.org/packages/ea/7e/f94c8a6a8b2571694375b4cf94d3e5e0f529e8e6ba280fad4d8c70621f27/audioop_lts-0.2.1-cp313-abi3-macosx_11_0_arm64.whl", hash = "sha256:4a8dd6a81770f6ecf019c4b6d659e000dc26571b273953cef7cd1d5ce2ff3ae6", size = 26726, upload-time = "2024-08-04T21:14:00.846Z" },
- { url = "https://files.pythonhosted.org/packages/ef/f8/a0e8e7a033b03fae2b16bc5aa48100b461c4f3a8a38af56d5ad579924a3a/audioop_lts-0.2.1-cp313-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1cd3c0b6f2ca25c7d2b1c3adeecbe23e65689839ba73331ebc7d893fcda7ffe", size = 80718, upload-time = "2024-08-04T21:14:01.989Z" },
- { url = "https://files.pythonhosted.org/packages/8f/ea/a98ebd4ed631c93b8b8f2368862cd8084d75c77a697248c24437c36a6f7e/audioop_lts-0.2.1-cp313-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ff3f97b3372c97782e9c6d3d7fdbe83bce8f70de719605bd7ee1839cd1ab360a", size = 88326, upload-time = "2024-08-04T21:14:03.509Z" },
- { url = "https://files.pythonhosted.org/packages/33/79/e97a9f9daac0982aa92db1199339bd393594d9a4196ad95ae088635a105f/audioop_lts-0.2.1-cp313-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a351af79edefc2a1bd2234bfd8b339935f389209943043913a919df4b0f13300", size = 80539, upload-time = "2024-08-04T21:14:04.679Z" },
- { url = "https://files.pythonhosted.org/packages/b2/d3/1051d80e6f2d6f4773f90c07e73743a1e19fcd31af58ff4e8ef0375d3a80/audioop_lts-0.2.1-cp313-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2aeb6f96f7f6da80354330470b9134d81b4cf544cdd1c549f2f45fe964d28059", size = 78577, upload-time = "2024-08-04T21:14:09.038Z" },
- { url = "https://files.pythonhosted.org/packages/7a/1d/54f4c58bae8dc8c64a75071c7e98e105ddaca35449376fcb0180f6e3c9df/audioop_lts-0.2.1-cp313-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c589f06407e8340e81962575fcffbba1e92671879a221186c3d4662de9fe804e", size = 82074, upload-time = "2024-08-04T21:14:09.99Z" },
- { url = "https://files.pythonhosted.org/packages/36/89/2e78daa7cebbea57e72c0e1927413be4db675548a537cfba6a19040d52fa/audioop_lts-0.2.1-cp313-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:fbae5d6925d7c26e712f0beda5ed69ebb40e14212c185d129b8dfbfcc335eb48", size = 84210, upload-time = "2024-08-04T21:14:11.468Z" },
- { url = "https://files.pythonhosted.org/packages/a5/57/3ff8a74df2ec2fa6d2ae06ac86e4a27d6412dbb7d0e0d41024222744c7e0/audioop_lts-0.2.1-cp313-abi3-musllinux_1_2_i686.whl", hash = "sha256:d2d5434717f33117f29b5691fbdf142d36573d751716249a288fbb96ba26a281", size = 85664, upload-time = "2024-08-04T21:14:12.394Z" },
- { url = "https://files.pythonhosted.org/packages/16/01/21cc4e5878f6edbc8e54be4c108d7cb9cb6202313cfe98e4ece6064580dd/audioop_lts-0.2.1-cp313-abi3-musllinux_1_2_ppc64le.whl", hash = "sha256:f626a01c0a186b08f7ff61431c01c055961ee28769591efa8800beadd27a2959", size = 93255, upload-time = "2024-08-04T21:14:13.707Z" },
- { url = "https://files.pythonhosted.org/packages/3e/28/7f7418c362a899ac3b0bf13b1fde2d4ffccfdeb6a859abd26f2d142a1d58/audioop_lts-0.2.1-cp313-abi3-musllinux_1_2_s390x.whl", hash = "sha256:05da64e73837f88ee5c6217d732d2584cf638003ac72df124740460531e95e47", size = 87760, upload-time = "2024-08-04T21:14:14.74Z" },
- { url = "https://files.pythonhosted.org/packages/6d/d8/577a8be87dc7dd2ba568895045cee7d32e81d85a7e44a29000fe02c4d9d4/audioop_lts-0.2.1-cp313-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:56b7a0a4dba8e353436f31a932f3045d108a67b5943b30f85a5563f4d8488d77", size = 84992, upload-time = "2024-08-04T21:14:19.155Z" },
- { url = "https://files.pythonhosted.org/packages/ef/9a/4699b0c4fcf89936d2bfb5425f55f1a8b86dff4237cfcc104946c9cd9858/audioop_lts-0.2.1-cp313-abi3-win32.whl", hash = "sha256:6e899eb8874dc2413b11926b5fb3857ec0ab55222840e38016a6ba2ea9b7d5e3", size = 26059, upload-time = "2024-08-04T21:14:20.438Z" },
- { url = "https://files.pythonhosted.org/packages/3a/1c/1f88e9c5dd4785a547ce5fd1eb83fff832c00cc0e15c04c1119b02582d06/audioop_lts-0.2.1-cp313-abi3-win_amd64.whl", hash = "sha256:64562c5c771fb0a8b6262829b9b4f37a7b886c01b4d3ecdbae1d629717db08b4", size = 30412, upload-time = "2024-08-04T21:14:21.342Z" },
- { url = "https://files.pythonhosted.org/packages/c4/e9/c123fd29d89a6402ad261516f848437472ccc602abb59bba522af45e281b/audioop_lts-0.2.1-cp313-abi3-win_arm64.whl", hash = "sha256:c45317debeb64002e980077642afbd977773a25fa3dfd7ed0c84dccfc1fafcb0", size = 23578, upload-time = "2024-08-04T21:14:22.193Z" },
- { url = "https://files.pythonhosted.org/packages/7a/99/bb664a99561fd4266687e5cb8965e6ec31ba4ff7002c3fce3dc5ef2709db/audioop_lts-0.2.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:3827e3fce6fee4d69d96a3d00cd2ab07f3c0d844cb1e44e26f719b34a5b15455", size = 46827, upload-time = "2024-08-04T21:14:23.034Z" },
- { url = "https://files.pythonhosted.org/packages/c4/e3/f664171e867e0768ab982715e744430cf323f1282eb2e11ebfb6ee4c4551/audioop_lts-0.2.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:161249db9343b3c9780ca92c0be0d1ccbfecdbccac6844f3d0d44b9c4a00a17f", size = 27479, upload-time = "2024-08-04T21:14:23.922Z" },
- { url = "https://files.pythonhosted.org/packages/a6/0d/2a79231ff54eb20e83b47e7610462ad6a2bea4e113fae5aa91c6547e7764/audioop_lts-0.2.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5b7b4ff9de7a44e0ad2618afdc2ac920b91f4a6d3509520ee65339d4acde5abf", size = 27056, upload-time = "2024-08-04T21:14:28.061Z" },
- { url = "https://files.pythonhosted.org/packages/86/46/342471398283bb0634f5a6df947806a423ba74b2e29e250c7ec0e3720e4f/audioop_lts-0.2.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:72e37f416adb43b0ced93419de0122b42753ee74e87070777b53c5d2241e7fab", size = 87802, upload-time = "2024-08-04T21:14:29.586Z" },
- { url = "https://files.pythonhosted.org/packages/56/44/7a85b08d4ed55517634ff19ddfbd0af05bf8bfd39a204e4445cd0e6f0cc9/audioop_lts-0.2.1-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:534ce808e6bab6adb65548723c8cbe189a3379245db89b9d555c4210b4aaa9b6", size = 95016, upload-time = "2024-08-04T21:14:30.481Z" },
- { url = "https://files.pythonhosted.org/packages/a8/2a/45edbca97ea9ee9e6bbbdb8d25613a36e16a4d1e14ae01557392f15cc8d3/audioop_lts-0.2.1-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d2de9b6fb8b1cf9f03990b299a9112bfdf8b86b6987003ca9e8a6c4f56d39543", size = 87394, upload-time = "2024-08-04T21:14:31.883Z" },
- { url = "https://files.pythonhosted.org/packages/14/ae/832bcbbef2c510629593bf46739374174606e25ac7d106b08d396b74c964/audioop_lts-0.2.1-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f24865991b5ed4b038add5edbf424639d1358144f4e2a3e7a84bc6ba23e35074", size = 84874, upload-time = "2024-08-04T21:14:32.751Z" },
- { url = "https://files.pythonhosted.org/packages/26/1c/8023c3490798ed2f90dfe58ec3b26d7520a243ae9c0fc751ed3c9d8dbb69/audioop_lts-0.2.1-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bdb3b7912ccd57ea53197943f1bbc67262dcf29802c4a6df79ec1c715d45a78", size = 88698, upload-time = "2024-08-04T21:14:34.147Z" },
- { url = "https://files.pythonhosted.org/packages/2c/db/5379d953d4918278b1f04a5a64b2c112bd7aae8f81021009da0dcb77173c/audioop_lts-0.2.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:120678b208cca1158f0a12d667af592e067f7a50df9adc4dc8f6ad8d065a93fb", size = 90401, upload-time = "2024-08-04T21:14:35.276Z" },
- { url = "https://files.pythonhosted.org/packages/99/6e/3c45d316705ab1aec2e69543a5b5e458d0d112a93d08994347fafef03d50/audioop_lts-0.2.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:54cd4520fc830b23c7d223693ed3e1b4d464997dd3abc7c15dce9a1f9bd76ab2", size = 91864, upload-time = "2024-08-04T21:14:36.158Z" },
- { url = "https://files.pythonhosted.org/packages/08/58/6a371d8fed4f34debdb532c0b00942a84ebf3e7ad368e5edc26931d0e251/audioop_lts-0.2.1-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:d6bd20c7a10abcb0fb3d8aaa7508c0bf3d40dfad7515c572014da4b979d3310a", size = 98796, upload-time = "2024-08-04T21:14:37.185Z" },
- { url = "https://files.pythonhosted.org/packages/ee/77/d637aa35497e0034ff846fd3330d1db26bc6fd9dd79c406e1341188b06a2/audioop_lts-0.2.1-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:f0ed1ad9bd862539ea875fb339ecb18fcc4148f8d9908f4502df28f94d23491a", size = 94116, upload-time = "2024-08-04T21:14:38.145Z" },
- { url = "https://files.pythonhosted.org/packages/1a/60/7afc2abf46bbcf525a6ebc0305d85ab08dc2d1e2da72c48dbb35eee5b62c/audioop_lts-0.2.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e1af3ff32b8c38a7d900382646e91f2fc515fd19dea37e9392275a5cbfdbff63", size = 91520, upload-time = "2024-08-04T21:14:39.128Z" },
- { url = "https://files.pythonhosted.org/packages/65/6d/42d40da100be1afb661fd77c2b1c0dfab08af1540df57533621aea3db52a/audioop_lts-0.2.1-cp313-cp313t-win32.whl", hash = "sha256:f51bb55122a89f7a0817d7ac2319744b4640b5b446c4c3efcea5764ea99ae509", size = 26482, upload-time = "2024-08-04T21:14:40.269Z" },
- { url = "https://files.pythonhosted.org/packages/01/09/f08494dca79f65212f5b273aecc5a2f96691bf3307cac29acfcf84300c01/audioop_lts-0.2.1-cp313-cp313t-win_amd64.whl", hash = "sha256:f0f2f336aa2aee2bce0b0dcc32bbba9178995454c7b979cf6ce086a8801e14c7", size = 30780, upload-time = "2024-08-04T21:14:41.128Z" },
- { url = "https://files.pythonhosted.org/packages/5d/35/be73b6015511aa0173ec595fc579133b797ad532996f2998fd6b8d1bbe6b/audioop_lts-0.2.1-cp313-cp313t-win_arm64.whl", hash = "sha256:78bfb3703388c780edf900be66e07de5a3d4105ca8e8720c5c4d67927e0b15d0", size = 23918, upload-time = "2024-08-04T21:14:42.803Z" },
+ { url = "https://files.pythonhosted.org/packages/01/91/a219253cc6e92db2ebeaf5cf8197f71d995df6f6b16091d1f3ce62cb169d/audioop_lts-0.2.1-cp313-abi3-macosx_10_13_universal2.whl", hash = "sha256:fd1345ae99e17e6910f47ce7d52673c6a1a70820d78b67de1b7abb3af29c426a", size = 46252 },
+ { url = "https://files.pythonhosted.org/packages/ec/f6/3cb21e0accd9e112d27cee3b1477cd04dafe88675c54ad8b0d56226c1e0b/audioop_lts-0.2.1-cp313-abi3-macosx_10_13_x86_64.whl", hash = "sha256:e175350da05d2087e12cea8e72a70a1a8b14a17e92ed2022952a4419689ede5e", size = 27183 },
+ { url = "https://files.pythonhosted.org/packages/ea/7e/f94c8a6a8b2571694375b4cf94d3e5e0f529e8e6ba280fad4d8c70621f27/audioop_lts-0.2.1-cp313-abi3-macosx_11_0_arm64.whl", hash = "sha256:4a8dd6a81770f6ecf019c4b6d659e000dc26571b273953cef7cd1d5ce2ff3ae6", size = 26726 },
+ { url = "https://files.pythonhosted.org/packages/ef/f8/a0e8e7a033b03fae2b16bc5aa48100b461c4f3a8a38af56d5ad579924a3a/audioop_lts-0.2.1-cp313-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1cd3c0b6f2ca25c7d2b1c3adeecbe23e65689839ba73331ebc7d893fcda7ffe", size = 80718 },
+ { url = "https://files.pythonhosted.org/packages/8f/ea/a98ebd4ed631c93b8b8f2368862cd8084d75c77a697248c24437c36a6f7e/audioop_lts-0.2.1-cp313-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ff3f97b3372c97782e9c6d3d7fdbe83bce8f70de719605bd7ee1839cd1ab360a", size = 88326 },
+ { url = "https://files.pythonhosted.org/packages/33/79/e97a9f9daac0982aa92db1199339bd393594d9a4196ad95ae088635a105f/audioop_lts-0.2.1-cp313-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a351af79edefc2a1bd2234bfd8b339935f389209943043913a919df4b0f13300", size = 80539 },
+ { url = "https://files.pythonhosted.org/packages/b2/d3/1051d80e6f2d6f4773f90c07e73743a1e19fcd31af58ff4e8ef0375d3a80/audioop_lts-0.2.1-cp313-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2aeb6f96f7f6da80354330470b9134d81b4cf544cdd1c549f2f45fe964d28059", size = 78577 },
+ { url = "https://files.pythonhosted.org/packages/7a/1d/54f4c58bae8dc8c64a75071c7e98e105ddaca35449376fcb0180f6e3c9df/audioop_lts-0.2.1-cp313-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c589f06407e8340e81962575fcffbba1e92671879a221186c3d4662de9fe804e", size = 82074 },
+ { url = "https://files.pythonhosted.org/packages/36/89/2e78daa7cebbea57e72c0e1927413be4db675548a537cfba6a19040d52fa/audioop_lts-0.2.1-cp313-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:fbae5d6925d7c26e712f0beda5ed69ebb40e14212c185d129b8dfbfcc335eb48", size = 84210 },
+ { url = "https://files.pythonhosted.org/packages/a5/57/3ff8a74df2ec2fa6d2ae06ac86e4a27d6412dbb7d0e0d41024222744c7e0/audioop_lts-0.2.1-cp313-abi3-musllinux_1_2_i686.whl", hash = "sha256:d2d5434717f33117f29b5691fbdf142d36573d751716249a288fbb96ba26a281", size = 85664 },
+ { url = "https://files.pythonhosted.org/packages/16/01/21cc4e5878f6edbc8e54be4c108d7cb9cb6202313cfe98e4ece6064580dd/audioop_lts-0.2.1-cp313-abi3-musllinux_1_2_ppc64le.whl", hash = "sha256:f626a01c0a186b08f7ff61431c01c055961ee28769591efa8800beadd27a2959", size = 93255 },
+ { url = "https://files.pythonhosted.org/packages/3e/28/7f7418c362a899ac3b0bf13b1fde2d4ffccfdeb6a859abd26f2d142a1d58/audioop_lts-0.2.1-cp313-abi3-musllinux_1_2_s390x.whl", hash = "sha256:05da64e73837f88ee5c6217d732d2584cf638003ac72df124740460531e95e47", size = 87760 },
+ { url = "https://files.pythonhosted.org/packages/6d/d8/577a8be87dc7dd2ba568895045cee7d32e81d85a7e44a29000fe02c4d9d4/audioop_lts-0.2.1-cp313-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:56b7a0a4dba8e353436f31a932f3045d108a67b5943b30f85a5563f4d8488d77", size = 84992 },
+ { url = "https://files.pythonhosted.org/packages/ef/9a/4699b0c4fcf89936d2bfb5425f55f1a8b86dff4237cfcc104946c9cd9858/audioop_lts-0.2.1-cp313-abi3-win32.whl", hash = "sha256:6e899eb8874dc2413b11926b5fb3857ec0ab55222840e38016a6ba2ea9b7d5e3", size = 26059 },
+ { url = "https://files.pythonhosted.org/packages/3a/1c/1f88e9c5dd4785a547ce5fd1eb83fff832c00cc0e15c04c1119b02582d06/audioop_lts-0.2.1-cp313-abi3-win_amd64.whl", hash = "sha256:64562c5c771fb0a8b6262829b9b4f37a7b886c01b4d3ecdbae1d629717db08b4", size = 30412 },
+ { url = "https://files.pythonhosted.org/packages/c4/e9/c123fd29d89a6402ad261516f848437472ccc602abb59bba522af45e281b/audioop_lts-0.2.1-cp313-abi3-win_arm64.whl", hash = "sha256:c45317debeb64002e980077642afbd977773a25fa3dfd7ed0c84dccfc1fafcb0", size = 23578 },
+ { url = "https://files.pythonhosted.org/packages/7a/99/bb664a99561fd4266687e5cb8965e6ec31ba4ff7002c3fce3dc5ef2709db/audioop_lts-0.2.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:3827e3fce6fee4d69d96a3d00cd2ab07f3c0d844cb1e44e26f719b34a5b15455", size = 46827 },
+ { url = "https://files.pythonhosted.org/packages/c4/e3/f664171e867e0768ab982715e744430cf323f1282eb2e11ebfb6ee4c4551/audioop_lts-0.2.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:161249db9343b3c9780ca92c0be0d1ccbfecdbccac6844f3d0d44b9c4a00a17f", size = 27479 },
+ { url = "https://files.pythonhosted.org/packages/a6/0d/2a79231ff54eb20e83b47e7610462ad6a2bea4e113fae5aa91c6547e7764/audioop_lts-0.2.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5b7b4ff9de7a44e0ad2618afdc2ac920b91f4a6d3509520ee65339d4acde5abf", size = 27056 },
+ { url = "https://files.pythonhosted.org/packages/86/46/342471398283bb0634f5a6df947806a423ba74b2e29e250c7ec0e3720e4f/audioop_lts-0.2.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:72e37f416adb43b0ced93419de0122b42753ee74e87070777b53c5d2241e7fab", size = 87802 },
+ { url = "https://files.pythonhosted.org/packages/56/44/7a85b08d4ed55517634ff19ddfbd0af05bf8bfd39a204e4445cd0e6f0cc9/audioop_lts-0.2.1-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:534ce808e6bab6adb65548723c8cbe189a3379245db89b9d555c4210b4aaa9b6", size = 95016 },
+ { url = "https://files.pythonhosted.org/packages/a8/2a/45edbca97ea9ee9e6bbbdb8d25613a36e16a4d1e14ae01557392f15cc8d3/audioop_lts-0.2.1-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d2de9b6fb8b1cf9f03990b299a9112bfdf8b86b6987003ca9e8a6c4f56d39543", size = 87394 },
+ { url = "https://files.pythonhosted.org/packages/14/ae/832bcbbef2c510629593bf46739374174606e25ac7d106b08d396b74c964/audioop_lts-0.2.1-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f24865991b5ed4b038add5edbf424639d1358144f4e2a3e7a84bc6ba23e35074", size = 84874 },
+ { url = "https://files.pythonhosted.org/packages/26/1c/8023c3490798ed2f90dfe58ec3b26d7520a243ae9c0fc751ed3c9d8dbb69/audioop_lts-0.2.1-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bdb3b7912ccd57ea53197943f1bbc67262dcf29802c4a6df79ec1c715d45a78", size = 88698 },
+ { url = "https://files.pythonhosted.org/packages/2c/db/5379d953d4918278b1f04a5a64b2c112bd7aae8f81021009da0dcb77173c/audioop_lts-0.2.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:120678b208cca1158f0a12d667af592e067f7a50df9adc4dc8f6ad8d065a93fb", size = 90401 },
+ { url = "https://files.pythonhosted.org/packages/99/6e/3c45d316705ab1aec2e69543a5b5e458d0d112a93d08994347fafef03d50/audioop_lts-0.2.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:54cd4520fc830b23c7d223693ed3e1b4d464997dd3abc7c15dce9a1f9bd76ab2", size = 91864 },
+ { url = "https://files.pythonhosted.org/packages/08/58/6a371d8fed4f34debdb532c0b00942a84ebf3e7ad368e5edc26931d0e251/audioop_lts-0.2.1-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:d6bd20c7a10abcb0fb3d8aaa7508c0bf3d40dfad7515c572014da4b979d3310a", size = 98796 },
+ { url = "https://files.pythonhosted.org/packages/ee/77/d637aa35497e0034ff846fd3330d1db26bc6fd9dd79c406e1341188b06a2/audioop_lts-0.2.1-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:f0ed1ad9bd862539ea875fb339ecb18fcc4148f8d9908f4502df28f94d23491a", size = 94116 },
+ { url = "https://files.pythonhosted.org/packages/1a/60/7afc2abf46bbcf525a6ebc0305d85ab08dc2d1e2da72c48dbb35eee5b62c/audioop_lts-0.2.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e1af3ff32b8c38a7d900382646e91f2fc515fd19dea37e9392275a5cbfdbff63", size = 91520 },
+ { url = "https://files.pythonhosted.org/packages/65/6d/42d40da100be1afb661fd77c2b1c0dfab08af1540df57533621aea3db52a/audioop_lts-0.2.1-cp313-cp313t-win32.whl", hash = "sha256:f51bb55122a89f7a0817d7ac2319744b4640b5b446c4c3efcea5764ea99ae509", size = 26482 },
+ { url = "https://files.pythonhosted.org/packages/01/09/f08494dca79f65212f5b273aecc5a2f96691bf3307cac29acfcf84300c01/audioop_lts-0.2.1-cp313-cp313t-win_amd64.whl", hash = "sha256:f0f2f336aa2aee2bce0b0dcc32bbba9178995454c7b979cf6ce086a8801e14c7", size = 30780 },
+ { url = "https://files.pythonhosted.org/packages/5d/35/be73b6015511aa0173ec595fc579133b797ad532996f2998fd6b8d1bbe6b/audioop_lts-0.2.1-cp313-cp313t-win_arm64.whl", hash = "sha256:78bfb3703388c780edf900be66e07de5a3d4105ca8e8720c5c4d67927e0b15d0", size = 23918 },
]
[[package]]
@@ -773,7 +773,7 @@ dependencies = [
]
sdist = { url = "https://files.pythonhosted.org/packages/53/a6/a8436f8f7b5578461a4e5c0dbc8341fe2596b703704cf0f5acd35953cc85/csvw-1.11.0.tar.gz", hash = "sha256:c156466fab3331861e0cf3cbe0c4538705800bfac98819149cd70ecbe6f152eb", size = 34812, upload-time = "2021-05-06T08:15:15.351Z" }
wheels = [
- { url = "https://files.pythonhosted.org/packages/55/ae/afb43a6b88c4202d29e4ec7aca76633d8c530140f4f5a32ee762d07c4607/csvw-1.11.0-py2.py3-none-any.whl", hash = "sha256:243825391308f2568593415364868dda5e50f608fc2bb307fbd79d534af52fd5", size = 35198, upload-time = "2021-05-06T08:15:19.729Z" },
+ { url = "https://files.pythonhosted.org/packages/55/ae/afb43a6b88c4202d29e4ec7aca76633d8c530140f4f5a32ee762d07c4607/csvw-1.11.0-py2.py3-none-any.whl", hash = "sha256:243825391308f2568593415364868dda5e50f608fc2bb307fbd79d534af52fd5", size = 35198 },
]
[[package]]
@@ -1100,34 +1100,7 @@ dependencies = [
]
sdist = { url = "https://files.pythonhosted.org/packages/0e/c3/12d45167ec36f7f9a5ed80bc2128392b3f6207f760d437287d32a0e43f41/effdet-0.4.1.tar.gz", hash = "sha256:ac5589fd304a5650c201986b2ef5f8e10c111093a71b1c49fa6b8817710812b5", size = 110134, upload-time = "2023-05-21T22:18:01.039Z" }
wheels = [
- { url = "https://files.pythonhosted.org/packages/9c/13/563119fe0af82aca5a3b89399c435953072c39515c2e818eb82793955c3b/effdet-0.4.1-py3-none-any.whl", hash = "sha256:10889a226228d515c948e3fcf811e64c0d78d7aa94823a300045653b9c284cb7", size = 112513, upload-time = "2023-05-21T22:17:58.47Z" },
-]
-
-[[package]]
-name = "elastic-transport"
-version = "9.1.0"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
- { name = "certifi" },
- { name = "urllib3" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/ae/1f/2d1a1790df2b75e1e1eb90d8a3fe066a47ef95e34430657447e549cc274c/elastic_transport-9.1.0.tar.gz", hash = "sha256:1590e44a25b0fe208107d5e8d7dea15c070525f3ac9baafbe4cb659cd14f073d", size = 76483, upload-time = "2025-07-24T16:41:31.017Z" }
-wheels = [
- { url = "https://files.pythonhosted.org/packages/ef/5d/dd5a919dd887fe20a91f18faf5b4345ee3a058e483d2aa84cef0f2567e17/elastic_transport-9.1.0-py3-none-any.whl", hash = "sha256:369fa56874c74daae4ea10cbf40636d139f38f42bec0e006b9cd45a168ee7fce", size = 65142, upload-time = "2025-07-24T16:41:29.648Z" },
-]
-
-[[package]]
-name = "elasticsearch"
-version = "9.1.1"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
- { name = "elastic-transport" },
- { name = "python-dateutil" },
- { name = "typing-extensions" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/59/6a/5eecef6f1ac8005b04714405cb65971d46031bd897e47c29af86e0f87353/elasticsearch-9.1.1.tar.gz", hash = "sha256:be20acda2a97591a9a6cf4981fc398ee6fca3291cf9e7a9e52b6a9f41a46d393", size = 857802, upload-time = "2025-09-12T13:27:38.62Z" }
-wheels = [
- { url = "https://files.pythonhosted.org/packages/cf/4c/c0c95d3d881732a5d1b28e12c9be4dea5953ade71810f94565bd5bd2101a/elasticsearch-9.1.1-py3-none-any.whl", hash = "sha256:2a5c27c57ca3dd3365f665c82c9dcd8666ccfb550d5b07c688c21ec636c104e5", size = 937483, upload-time = "2025-09-12T13:27:34.948Z" },
+ { url = "https://files.pythonhosted.org/packages/9c/13/563119fe0af82aca5a3b89399c435953072c39515c2e818eb82793955c3b/effdet-0.4.1-py3-none-any.whl", hash = "sha256:10889a226228d515c948e3fcf811e64c0d78d7aa94823a300045653b9c284cb7", size = 112513 },
]
[[package]]
@@ -1240,23 +1213,23 @@ version = "1.11.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/48/8f/32664a3245247b13702d13d2657ea534daf64e58a3f72a3a2d10598d6916/fastavro-1.11.1.tar.gz", hash = "sha256:bf6acde5ee633a29fb8dfd6dfea13b164722bc3adc05a0e055df080549c1c2f8", size = 1016250, upload-time = "2025-05-18T04:54:31.413Z" }
wheels = [
- { url = "https://files.pythonhosted.org/packages/99/58/8e789b0a2f532b22e2d090c20d27c88f26a5faadcba4c445c6958ae566cf/fastavro-1.11.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:e8bc238f2637cd5d15238adbe8fb8c58d2e6f1870e0fb28d89508584670bae4b", size = 939583, upload-time = "2025-05-18T04:54:59.853Z" },
- { url = "https://files.pythonhosted.org/packages/34/3f/02ed44742b1224fe23c9fc9b9b037fc61769df716c083cf80b59a02b9785/fastavro-1.11.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b403933081c83fc4d8a012ee64b86e560a024b1280e3711ee74f2abc904886e8", size = 3257734, upload-time = "2025-05-18T04:55:02.366Z" },
- { url = "https://files.pythonhosted.org/packages/cc/bc/9cc8b19eeee9039dd49719f8b4020771e805def262435f823fa8f27ddeea/fastavro-1.11.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3f6ecb4b5f77aa756d973b7dd1c2fb4e4c95b4832a3c98b059aa96c61870c709", size = 3318218, upload-time = "2025-05-18T04:55:04.352Z" },
- { url = "https://files.pythonhosted.org/packages/39/77/3b73a986606494596b6d3032eadf813a05b59d1623f54384a23de4217d5f/fastavro-1.11.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:059893df63ef823b0231b485c9d43016c7e32850cae7bf69f4e9d46dd41c28f2", size = 3297296, upload-time = "2025-05-18T04:55:06.175Z" },
- { url = "https://files.pythonhosted.org/packages/8e/1c/b69ceef6494bd0df14752b5d8648b159ad52566127bfd575e9f5ecc0c092/fastavro-1.11.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5120ffc9a200699218e01777e695a2f08afb3547ba818184198c757dc39417bd", size = 3438056, upload-time = "2025-05-18T04:55:08.276Z" },
- { url = "https://files.pythonhosted.org/packages/ef/11/5c2d0db3bd0e6407546fabae9e267bb0824eacfeba79e7dd81ad88afa27d/fastavro-1.11.1-cp312-cp312-win_amd64.whl", hash = "sha256:7bb9d0d2233f33a52908b6ea9b376fe0baf1144bdfdfb3c6ad326e200a8b56b0", size = 442824, upload-time = "2025-05-18T04:55:10.385Z" },
- { url = "https://files.pythonhosted.org/packages/ec/08/8e25b9e87a98f8c96b25e64565fa1a1208c0095bb6a84a5c8a4b925688a5/fastavro-1.11.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:f963b8ddaf179660e814ab420850c1b4ea33e2ad2de8011549d958b21f77f20a", size = 931520, upload-time = "2025-05-18T04:55:11.614Z" },
- { url = "https://files.pythonhosted.org/packages/02/ee/7cf5561ef94781ed6942cee6b394a5e698080f4247f00f158ee396ec244d/fastavro-1.11.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0253e5b6a3c9b62fae9fc3abd8184c5b64a833322b6af7d666d3db266ad879b5", size = 3195989, upload-time = "2025-05-18T04:55:13.732Z" },
- { url = "https://files.pythonhosted.org/packages/b3/31/f02f097d79f090e5c5aca8a743010c4e833a257c0efdeb289c68294f7928/fastavro-1.11.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca637b150e1f4c0e8e564fad40a16bd922bcb7ffd1a6e4836e6084f2c4f4e8db", size = 3239755, upload-time = "2025-05-18T04:55:16.463Z" },
- { url = "https://files.pythonhosted.org/packages/09/4c/46626b4ee4eb8eb5aa7835973c6ba8890cf082ef2daface6071e788d2992/fastavro-1.11.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:76af1709031621828ca6ce7f027f7711fa33ac23e8269e7a5733996ff8d318da", size = 3243788, upload-time = "2025-05-18T04:55:18.544Z" },
- { url = "https://files.pythonhosted.org/packages/a7/6f/8ed42524e9e8dc0554f0f211dd1c6c7a9dde83b95388ddcf7c137e70796f/fastavro-1.11.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8224e6d8d9864d4e55dafbe88920d6a1b8c19cc3006acfac6aa4f494a6af3450", size = 3378330, upload-time = "2025-05-18T04:55:20.887Z" },
- { url = "https://files.pythonhosted.org/packages/b8/51/38cbe243d5facccab40fc43a4c17db264c261be955ce003803d25f0da2c3/fastavro-1.11.1-cp313-cp313-win_amd64.whl", hash = "sha256:cde7ed91b52ff21f0f9f157329760ba7251508ca3e9618af3ffdac986d9faaa2", size = 443115, upload-time = "2025-05-18T04:55:22.107Z" },
- { url = "https://files.pythonhosted.org/packages/d0/57/0d31ed1a49c65ad9f0f0128d9a928972878017781f9d4336f5f60982334c/fastavro-1.11.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:e5ed1325c1c414dd954e7a2c5074daefe1eceb672b8c727aa030ba327aa00693", size = 1021401, upload-time = "2025-05-18T04:55:23.431Z" },
- { url = "https://files.pythonhosted.org/packages/56/7a/a3f1a75fbfc16b3eff65dc0efcdb92364967923194312b3f8c8fc2cb95be/fastavro-1.11.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8cd3c95baeec37188899824faf44a5ee94dfc4d8667b05b2f867070c7eb174c4", size = 3384349, upload-time = "2025-05-18T04:55:25.575Z" },
- { url = "https://files.pythonhosted.org/packages/be/84/02bceb7518867df84027232a75225db758b9b45f12017c9743f45b73101e/fastavro-1.11.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e0babcd81acceb4c60110af9efa25d890dbb68f7de880f806dadeb1e70fe413", size = 3240658, upload-time = "2025-05-18T04:55:27.633Z" },
- { url = "https://files.pythonhosted.org/packages/f2/17/508c846c644d39bc432b027112068b8e96e7560468304d4c0757539dd73a/fastavro-1.11.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:b2c0cb8063c7208b53b6867983dc6ae7cc80b91116b51d435d2610a5db2fc52f", size = 3372809, upload-time = "2025-05-18T04:55:30.063Z" },
- { url = "https://files.pythonhosted.org/packages/fe/84/9c2917a70ed570ddbfd1d32ac23200c1d011e36c332e59950d2f6d204941/fastavro-1.11.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:1bc2824e9969c04ab6263d269a1e0e5d40b9bd16ade6b70c29d6ffbc4f3cc102", size = 3387171, upload-time = "2025-05-18T04:55:32.531Z" },
+ { url = "https://files.pythonhosted.org/packages/99/58/8e789b0a2f532b22e2d090c20d27c88f26a5faadcba4c445c6958ae566cf/fastavro-1.11.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:e8bc238f2637cd5d15238adbe8fb8c58d2e6f1870e0fb28d89508584670bae4b", size = 939583 },
+ { url = "https://files.pythonhosted.org/packages/34/3f/02ed44742b1224fe23c9fc9b9b037fc61769df716c083cf80b59a02b9785/fastavro-1.11.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b403933081c83fc4d8a012ee64b86e560a024b1280e3711ee74f2abc904886e8", size = 3257734 },
+ { url = "https://files.pythonhosted.org/packages/cc/bc/9cc8b19eeee9039dd49719f8b4020771e805def262435f823fa8f27ddeea/fastavro-1.11.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3f6ecb4b5f77aa756d973b7dd1c2fb4e4c95b4832a3c98b059aa96c61870c709", size = 3318218 },
+ { url = "https://files.pythonhosted.org/packages/39/77/3b73a986606494596b6d3032eadf813a05b59d1623f54384a23de4217d5f/fastavro-1.11.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:059893df63ef823b0231b485c9d43016c7e32850cae7bf69f4e9d46dd41c28f2", size = 3297296 },
+ { url = "https://files.pythonhosted.org/packages/8e/1c/b69ceef6494bd0df14752b5d8648b159ad52566127bfd575e9f5ecc0c092/fastavro-1.11.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5120ffc9a200699218e01777e695a2f08afb3547ba818184198c757dc39417bd", size = 3438056 },
+ { url = "https://files.pythonhosted.org/packages/ef/11/5c2d0db3bd0e6407546fabae9e267bb0824eacfeba79e7dd81ad88afa27d/fastavro-1.11.1-cp312-cp312-win_amd64.whl", hash = "sha256:7bb9d0d2233f33a52908b6ea9b376fe0baf1144bdfdfb3c6ad326e200a8b56b0", size = 442824 },
+ { url = "https://files.pythonhosted.org/packages/ec/08/8e25b9e87a98f8c96b25e64565fa1a1208c0095bb6a84a5c8a4b925688a5/fastavro-1.11.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:f963b8ddaf179660e814ab420850c1b4ea33e2ad2de8011549d958b21f77f20a", size = 931520 },
+ { url = "https://files.pythonhosted.org/packages/02/ee/7cf5561ef94781ed6942cee6b394a5e698080f4247f00f158ee396ec244d/fastavro-1.11.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0253e5b6a3c9b62fae9fc3abd8184c5b64a833322b6af7d666d3db266ad879b5", size = 3195989 },
+ { url = "https://files.pythonhosted.org/packages/b3/31/f02f097d79f090e5c5aca8a743010c4e833a257c0efdeb289c68294f7928/fastavro-1.11.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca637b150e1f4c0e8e564fad40a16bd922bcb7ffd1a6e4836e6084f2c4f4e8db", size = 3239755 },
+ { url = "https://files.pythonhosted.org/packages/09/4c/46626b4ee4eb8eb5aa7835973c6ba8890cf082ef2daface6071e788d2992/fastavro-1.11.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:76af1709031621828ca6ce7f027f7711fa33ac23e8269e7a5733996ff8d318da", size = 3243788 },
+ { url = "https://files.pythonhosted.org/packages/a7/6f/8ed42524e9e8dc0554f0f211dd1c6c7a9dde83b95388ddcf7c137e70796f/fastavro-1.11.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8224e6d8d9864d4e55dafbe88920d6a1b8c19cc3006acfac6aa4f494a6af3450", size = 3378330 },
+ { url = "https://files.pythonhosted.org/packages/b8/51/38cbe243d5facccab40fc43a4c17db264c261be955ce003803d25f0da2c3/fastavro-1.11.1-cp313-cp313-win_amd64.whl", hash = "sha256:cde7ed91b52ff21f0f9f157329760ba7251508ca3e9618af3ffdac986d9faaa2", size = 443115 },
+ { url = "https://files.pythonhosted.org/packages/d0/57/0d31ed1a49c65ad9f0f0128d9a928972878017781f9d4336f5f60982334c/fastavro-1.11.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:e5ed1325c1c414dd954e7a2c5074daefe1eceb672b8c727aa030ba327aa00693", size = 1021401 },
+ { url = "https://files.pythonhosted.org/packages/56/7a/a3f1a75fbfc16b3eff65dc0efcdb92364967923194312b3f8c8fc2cb95be/fastavro-1.11.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8cd3c95baeec37188899824faf44a5ee94dfc4d8667b05b2f867070c7eb174c4", size = 3384349 },
+ { url = "https://files.pythonhosted.org/packages/be/84/02bceb7518867df84027232a75225db758b9b45f12017c9743f45b73101e/fastavro-1.11.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e0babcd81acceb4c60110af9efa25d890dbb68f7de880f806dadeb1e70fe413", size = 3240658 },
+ { url = "https://files.pythonhosted.org/packages/f2/17/508c846c644d39bc432b027112068b8e96e7560468304d4c0757539dd73a/fastavro-1.11.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:b2c0cb8063c7208b53b6867983dc6ae7cc80b91116b51d435d2610a5db2fc52f", size = 3372809 },
+ { url = "https://files.pythonhosted.org/packages/fe/84/9c2917a70ed570ddbfd1d32ac23200c1d011e36c332e59950d2f6d204941/fastavro-1.11.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:1bc2824e9969c04ab6263d269a1e0e5d40b9bd16ade6b70c29d6ffbc4f3cc102", size = 3387171 },
]
[[package]]
@@ -5358,6 +5331,7 @@ dependencies = [
{ name = "en-core-web-sm" },
{ name = "fastapi" },
{ name = "fastapi-users", extra = ["oauth", "sqlalchemy"] },
+ { name = "faster-whisper" },
{ name = "firecrawl-py" },
{ name = "github3-py" },
{ name = "google-api-python-client" },
@@ -5406,6 +5380,7 @@ requires-dist = [
{ name = "en-core-web-sm", url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl" },
{ name = "fastapi", specifier = ">=0.115.8" },
{ name = "fastapi-users", extras = ["oauth", "sqlalchemy"], specifier = ">=14.0.1" },
+ { name = "faster-whisper", specifier = ">=1.1.0" },
{ name = "firecrawl-py", specifier = ">=1.12.0" },
{ name = "github3-py", specifier = "==4.0.1" },
{ name = "google-api-python-client", specifier = ">=2.156.0" },
diff --git a/surfsense_web/app/dashboard/[search_space_id]/connectors/add/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/connectors/add/page.tsx
index a8d148cef..22b803f9d 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/connectors/add/page.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/connectors/add/page.tsx
@@ -45,6 +45,13 @@ const connectorCategories: ConnectorCategory[] = [
icon: getConnectorIcon(EnumConnectorName.TAVILY_API, "h-6 w-6"),
status: "available",
},
+ {
+ id: "searxng",
+ title: "SearxNG",
+ description: "Use your own SearxNG meta-search instance for web results.",
+ icon: getConnectorIcon(EnumConnectorName.SEARXNG_API, "h-6 w-6"),
+ status: "available",
+ },
{
id: "linkup-api",
title: "Linkup API",
diff --git a/surfsense_web/app/dashboard/[search_space_id]/connectors/add/searxng/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/connectors/add/searxng/page.tsx
new file mode 100644
index 000000000..4fd406706
--- /dev/null
+++ b/surfsense_web/app/dashboard/[search_space_id]/connectors/add/searxng/page.tsx
@@ -0,0 +1,364 @@
+"use client";
+
+import { zodResolver } from "@hookform/resolvers/zod";
+import { ArrowLeft, Check, Info, Loader2 } from "lucide-react";
+import { motion } from "motion/react";
+import { useParams, useRouter } from "next/navigation";
+import { useState } from "react";
+import { useForm } from "react-hook-form";
+import { toast } from "sonner";
+import * as z from "zod";
+import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert";
+import { Button } from "@/components/ui/button";
+import {
+ Card,
+ CardContent,
+ CardDescription,
+ CardFooter,
+ CardHeader,
+ CardTitle,
+} from "@/components/ui/card";
+import {
+ Form,
+ FormControl,
+ FormDescription,
+ FormField,
+ FormItem,
+ FormLabel,
+ FormMessage,
+} from "@/components/ui/form";
+import { Input } from "@/components/ui/input";
+import { Switch } from "@/components/ui/switch";
+import { EnumConnectorName } from "@/contracts/enums/connector";
+import { getConnectorIcon } from "@/contracts/enums/connectorIcons";
+import { useSearchSourceConnectors } from "@/hooks/use-search-source-connectors";
+
+const searxngFormSchema = z.object({
+ name: z.string().min(3, {
+ message: "Connector name must be at least 3 characters.",
+ }),
+ host: z
+ .string({ required_error: "Host is required." })
+ .url({ message: "Enter a valid SearxNG host URL (e.g. https://searxng.example.org)." }),
+ api_key: z.string().optional(),
+ engines: z.string().optional(),
+ categories: z.string().optional(),
+ language: z.string().optional(),
+ safesearch: z
+ .string()
+ .regex(/^[0-2]?$/, { message: "SafeSearch must be 0, 1, or 2." })
+ .optional(),
+ verify_ssl: z.boolean().default(true),
+});
+
+type SearxngFormValues = z.infer<typeof searxngFormSchema>;
+
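+// Split a comma-separated string such as "duckduckgo, brave" into trimmed values; returns undefined when nothing usable remains.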
+const parseCommaSeparated = (value?: string | null) => {
+ if (!value) return undefined;
+ const items = value
+ .split(",")
+ .map((item) => item.trim())
+ .filter((item) => item.length > 0);
+ return items.length > 0 ? items : undefined;
+};
+
+export default function SearxngConnectorPage() {
+ const router = useRouter();
+ const params = useParams();
+ const searchSpaceId = params.search_space_id as string;
+ const [isSubmitting, setIsSubmitting] = useState(false);
+ const { createConnector } = useSearchSourceConnectors();
+
+ const form = useForm<SearxngFormValues>({
+ resolver: zodResolver(searxngFormSchema),
+ defaultValues: {
+ name: "SearxNG Connector",
+ host: "",
+ api_key: "",
+ engines: "",
+ categories: "",
+ language: "",
+ safesearch: "",
+ verify_ssl: true,
+ },
+ });
+
+ const onSubmit = async (values: SearxngFormValues) => {
+ setIsSubmitting(true);
+ try {
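+ // Build the connector config; optional SearxNG settings are included only when the user actually filled them in.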
+ const config: Record<string, string | string[] | number | boolean> = {
+ SEARXNG_HOST: values.host.trim(),
+ };
+
+ const apiKey = values.api_key?.trim();
+ if (apiKey) config.SEARXNG_API_KEY = apiKey;
+
+ const engines = parseCommaSeparated(values.engines);
+ if (engines) config.SEARXNG_ENGINES = engines;
+
+ const categories = parseCommaSeparated(values.categories);
+ if (categories) config.SEARXNG_CATEGORIES = categories;
+
+ const language = values.language?.trim();
+ if (language) config.SEARXNG_LANGUAGE = language;
+
+ const safesearch = values.safesearch?.trim();
+ if (safesearch) {
+ const parsed = Number(safesearch);
+ if (!Number.isNaN(parsed)) {
+ config.SEARXNG_SAFESEARCH = parsed;
+ }
+ }
+
+ // Include verify flag only when disabled to keep config minimal
+ if (values.verify_ssl === false) {
+ config.SEARXNG_VERIFY_SSL = false;
+ }
+
+ await createConnector(
+ {
+ name: values.name,
+ connector_type: EnumConnectorName.SEARXNG_API,
+ config,
+ is_indexable: false,
+ last_indexed_at: null,
+ },
+ parseInt(searchSpaceId)
+ );
+
+ toast.success("SearxNG connector created successfully!");
+ router.push(`/dashboard/${searchSpaceId}/connectors`);
+ } catch (error) {
+ console.error("Error creating SearxNG connector:", error);
+ toast.error(error instanceof Error ? error.message : "Failed to create connector");
+ } finally {
+ setIsSubmitting(false);
+ }
+ };
+
+ return (
+ <div className="container mx-auto py-8 max-w-3xl">
+ {/* Assumed page layout: intro copy, connector card, and setup notice built from the imported UI components */}
+ <p className="text-muted-foreground mb-6">
+ Bring your self-hosted SearxNG meta-search engine into SurfSense.
+ </p>
+ <motion.div initial={{ opacity: 0, y: 10 }} animate={{ opacity: 1, y: 0 }} transition={{ duration: 0.3 }}>
+ <Card>
+ <CardHeader>
+ <div className="flex items-center gap-2">
+ {getConnectorIcon(EnumConnectorName.SEARXNG_API, "h-6 w-6")}
+ <CardTitle>Connect SearxNG</CardTitle>
+ </div>
+ <CardDescription>
+ Integrate SurfSense with any SearxNG instance to broaden your search coverage while
+ preserving privacy and control.
+ </CardDescription>
+ </CardHeader>
+ <CardContent>
+ <Alert className="mb-6">
+ <Info className="h-4 w-4" />
+ <AlertTitle>SearxNG Instance Required</AlertTitle>
+ <AlertDescription>
+ You need access to a running SearxNG instance. Refer to the{" "}
+ {/* Link target is an assumption; point it at your preferred SearxNG documentation. */}
+ <a
+ href="https://docs.searxng.org/"
+ target="_blank"
+ rel="noopener noreferrer"
+ className="underline"
+ >
+ SearxNG installation guide
+ </a>
+ {" "}
+ for setup instructions. If your instance requires an API key, include it below.
+ </AlertDescription>
+ </Alert>
+ <Form {...form}>
+ <form onSubmit={form.handleSubmit(onSubmit)} className="space-y-6">