Merge remote-tracking branch 'upstream/main' into feature/blocknote-editor

This commit is contained in:
Anish Sarkar 2025-11-30 04:10:49 +05:30
commit b98c312fb1
81 changed files with 8976 additions and 2387 deletions

View file

@ -38,19 +38,24 @@ https://github.com/user-attachments/assets/a0a16566-6967-4374-ac51-9b3e07fbecd7
## Key Features
### 💡 **Idea**:
- Have your own highly customizable private NotebookLM and Perplexity, integrated with external sources.
### 📁 **Multiple File Format Uploading Support**
- Save content from your own personal files *(documents, images, and videos; supports **50+ file extensions**)* to your own personal knowledge base.
### 🔍 **Powerful Search**
- Quickly research or find anything in your saved content.
### 💬 **Chat with your Saved Content**
- Interact in natural language and get cited answers.
### 📄 **Cited Answers**
- Get cited answers, just like Perplexity.
### 🔔 **Privacy & Local LLM Support**
- Works flawlessly with Ollama local LLMs.
### 🏠 **Self Hostable**
- Open source and easy to deploy locally.
### 👥 **Team Collaboration with RBAC**
- Role-Based Access Control for Search Spaces
- Invite team members with customizable roles (Owner, Admin, Editor, Viewer)
- Granular permissions for documents, chats, connectors, and settings
- Share knowledge bases securely within your organization
### 🎙️ **Podcasts**
- Blazingly fast podcast generation agent (creates a 3-minute podcast in under 20 seconds)
- Convert your chat conversations into engaging audio content

View file

@ -39,25 +39,31 @@ https://github.com/user-attachments/assets/a0a16566-6967-4374-ac51-9b3e07fbecd7
## Key Features
### 💡 **Idea**:
- Have your own highly customizable private NotebookLM and Perplexity, integrated with external data sources.
### 📁 **Multiple File Format Upload Support**
- Save content from your personal files (documents, images, and videos; supports **50+ file extensions**) to your own personal knowledge base.
### 🔍 **Powerful Search**
- Quickly research or find anything in your saved content.
### 💬 **Chat with Your Saved Content**
- Interact in natural language and get cited answers.
### 📄 **Cited Answers**
- Get answers with citations, just like Perplexity.
### 🔔 **Privacy & Local LLM Support**
- Works flawlessly with local Ollama LLMs.
### 🏠 **Self-Hostable**
- Open source and easy to deploy locally.
### 👥 **Team Collaboration with RBAC**
- Role-based access control for search spaces
- Invite team members with customizable roles (Owner, Admin, Editor, Viewer)
- Fine-grained permissions for documents, chats, connectors, and settings
- Share knowledge bases securely within your organization
### 🎙️ **Podcasts**
- Blazingly fast podcast generation agent (creates a 3-minute podcast in under 20 seconds)

View file

@ -8,6 +8,8 @@ Create Date: 2025-11-13 23:20:12.912741
from collections.abc import Sequence
from sqlalchemy import text
from alembic import op
# revision identifiers, used by Alembic.
@ -17,6 +19,20 @@ branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def constraint_exists(connection, table_name: str, constraint_name: str) -> bool:
"""Check if a constraint exists on the given table."""
result = connection.execute(
text(
"""
SELECT 1 FROM information_schema.table_constraints
WHERE table_name = :table_name AND constraint_name = :constraint_name
"""
),
{"table_name": table_name, "constraint_name": constraint_name},
)
return result.fetchone() is not None
def upgrade() -> None:
"""
Remove foreign key constraints on LLM preference columns to allow global configs (negative IDs).
@ -24,50 +40,55 @@ def upgrade() -> None:
Global LLM configs use negative IDs and don't exist in the llm_configs table,
so we need to remove the foreign key constraints that were preventing their use.
"""
    connection = op.get_bind()

    # Drop the foreign key constraints if they exist
    constraints_to_drop = [
        "user_search_space_preferences_long_context_llm_id_fkey",
        "user_search_space_preferences_fast_llm_id_fkey",
        "user_search_space_preferences_strategic_llm_id_fkey",
    ]

    for constraint_name in constraints_to_drop:
        if constraint_exists(
            connection, "user_search_space_preferences", constraint_name
        ):
            op.drop_constraint(
                constraint_name,
                "user_search_space_preferences",
                type_="foreignkey",
            )
        else:
            print(f"Constraint '{constraint_name}' does not exist. Skipping.")
def downgrade() -> None:
"""
Re-add foreign key constraints (will fail if any negative IDs exist in the table).
"""
    connection = op.get_bind()

    # Re-add the foreign key constraints if they don't exist
    constraints_to_create = [
        (
            "user_search_space_preferences_long_context_llm_id_fkey",
            "long_context_llm_id",
        ),
        ("user_search_space_preferences_fast_llm_id_fkey", "fast_llm_id"),
        ("user_search_space_preferences_strategic_llm_id_fkey", "strategic_llm_id"),
    ]

    for constraint_name, column_name in constraints_to_create:
        if not constraint_exists(
            connection, "user_search_space_preferences", constraint_name
        ):
            op.create_foreign_key(
                constraint_name,
                "user_search_space_preferences",
                "llm_configs",
                [column_name],
                ["id"],
                ondelete="SET NULL",
            )
        else:
            print(f"Constraint '{constraint_name}' already exists. Skipping.")

View file

@ -9,6 +9,7 @@ Create Date: 2025-11-19 00:00:00.000000
from collections.abc import Sequence
import sqlalchemy as sa
from sqlalchemy import text
from alembic import op
@ -19,24 +20,55 @@ branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def column_exists(connection, table_name: str, column_name: str) -> bool:
"""Check if a column exists on the given table."""
result = connection.execute(
text(
"""
SELECT 1 FROM information_schema.columns
WHERE table_name = :table_name AND column_name = :column_name
"""
),
{"table_name": table_name, "column_name": column_name},
)
return result.fetchone() is not None
def upgrade() -> None:
"""Add QnA configuration columns to searchspaces table."""
connection = op.get_bind()
# Add citations_enabled boolean (default True)
    if not column_exists(connection, "searchspaces", "citations_enabled"):
        op.add_column(
            "searchspaces",
            sa.Column(
                "citations_enabled", sa.Boolean(), nullable=False, server_default="true"
            ),
        )
    else:
        print("Column 'citations_enabled' already exists. Skipping.")
# Add custom instructions text field (nullable, defaults to empty)
    if not column_exists(connection, "searchspaces", "qna_custom_instructions"):
        op.add_column(
            "searchspaces",
            sa.Column("qna_custom_instructions", sa.Text(), nullable=True),
        )
    else:
        print("Column 'qna_custom_instructions' already exists. Skipping.")
def downgrade() -> None:
"""Remove QnA configuration columns from searchspaces table."""
op.drop_column("searchspaces", "qna_custom_instructions")
op.drop_column("searchspaces", "citations_enabled")
connection = op.get_bind()
if column_exists(connection, "searchspaces", "qna_custom_instructions"):
op.drop_column("searchspaces", "qna_custom_instructions")
else:
print("Column 'qna_custom_instructions' does not exist. Skipping.")
if column_exists(connection, "searchspaces", "citations_enabled"):
op.drop_column("searchspaces", "citations_enabled")
else:
print("Column 'citations_enabled' does not exist. Skipping.")

View file

@ -0,0 +1,59 @@
"""Add Webcrawler connector enums
Revision ID: 38
Revises: 37
Create Date: 2025-11-17 17:00:00.000000
"""
from collections.abc import Sequence
from alembic import op
revision: str = "38"
down_revision: str | None = "37"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
"""Safely add 'WEBCRAWLER_CONNECTOR' to enum types if missing."""
# Add to searchsourceconnectortype enum
op.execute(
"""
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM pg_type t
JOIN pg_enum e ON t.oid = e.enumtypid
WHERE t.typname = 'searchsourceconnectortype' AND e.enumlabel = 'WEBCRAWLER_CONNECTOR'
) THEN
ALTER TYPE searchsourceconnectortype ADD VALUE 'WEBCRAWLER_CONNECTOR';
END IF;
END
$$;
"""
)
# Add to documenttype enum
op.execute(
"""
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM pg_type t
JOIN pg_enum e ON t.oid = e.enumtypid
WHERE t.typname = 'documenttype' AND e.enumlabel = 'CRAWLED_URL'
) THEN
ALTER TYPE documenttype ADD VALUE 'CRAWLED_URL';
END IF;
END
$$;
"""
)
def downgrade() -> None:
    """No-op: PostgreSQL does not support removing values from an enum type."""
    pass
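# A reusable variant of the existence check above (a sketch, not part of this
# migration): the DO-block logic expressed as a Python helper that future enum
# migrations could call before issuing ALTER TYPE ... ADD VALUE.
from sqlalchemy import text

def enum_has_label(connection, type_name: str, label: str) -> bool:
    """Return True if the given label already exists on the enum type."""
    result = connection.execute(
        text(
            """
            SELECT 1 FROM pg_type t
            JOIN pg_enum e ON t.oid = e.enumtypid
            WHERE t.typname = :type_name AND e.enumlabel = :label
            """
        ),
        {"type_name": type_name, "label": label},
    )
    return result.fetchone() is not None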

View file

@ -0,0 +1,179 @@
"""Add RBAC tables for search space access control
Revision ID: 39
Revises: 38
Create Date: 2025-11-27 00:00:00.000000
This migration adds:
- Permission enum for granular access control
- search_space_roles table for custom roles per search space
- search_space_memberships table for user-searchspace-role relationships
- search_space_invites table for invite links
"""
from collections.abc import Sequence
from sqlalchemy import inspect
from alembic import op
revision: str = "39"
down_revision: str | None = "38"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
"""Upgrade schema - add RBAC tables for search space access control."""
# Create search_space_roles table
op.execute(
"""
CREATE TABLE IF NOT EXISTS search_space_roles (
id SERIAL PRIMARY KEY,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
name VARCHAR(100) NOT NULL,
description VARCHAR(500),
permissions TEXT[] NOT NULL DEFAULT '{}',
is_default BOOLEAN NOT NULL DEFAULT FALSE,
is_system_role BOOLEAN NOT NULL DEFAULT FALSE,
search_space_id INTEGER NOT NULL REFERENCES searchspaces(id) ON DELETE CASCADE,
CONSTRAINT uq_searchspace_role_name UNIQUE (search_space_id, name)
);
"""
)
# Create search_space_invites table (needs to be created before memberships due to FK)
op.execute(
"""
CREATE TABLE IF NOT EXISTS search_space_invites (
id SERIAL PRIMARY KEY,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
invite_code VARCHAR(64) NOT NULL UNIQUE,
search_space_id INTEGER NOT NULL REFERENCES searchspaces(id) ON DELETE CASCADE,
role_id INTEGER REFERENCES search_space_roles(id) ON DELETE SET NULL,
created_by_id UUID REFERENCES "user"(id) ON DELETE SET NULL,
expires_at TIMESTAMPTZ,
max_uses INTEGER,
uses_count INTEGER NOT NULL DEFAULT 0,
is_active BOOLEAN NOT NULL DEFAULT TRUE,
name VARCHAR(100)
);
"""
)
# Create search_space_memberships table
op.execute(
"""
CREATE TABLE IF NOT EXISTS search_space_memberships (
id SERIAL PRIMARY KEY,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
user_id UUID NOT NULL REFERENCES "user"(id) ON DELETE CASCADE,
search_space_id INTEGER NOT NULL REFERENCES searchspaces(id) ON DELETE CASCADE,
role_id INTEGER REFERENCES search_space_roles(id) ON DELETE SET NULL,
is_owner BOOLEAN NOT NULL DEFAULT FALSE,
joined_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
invited_by_invite_id INTEGER REFERENCES search_space_invites(id) ON DELETE SET NULL,
CONSTRAINT uq_user_searchspace_membership UNIQUE (user_id, search_space_id)
);
"""
)
# Get connection and inspector for checking existing indexes
conn = op.get_bind()
inspector = inspect(conn)
# Create indexes for search_space_roles
existing_indexes = [
idx["name"] for idx in inspector.get_indexes("search_space_roles")
]
if "ix_search_space_roles_id" not in existing_indexes:
op.create_index("ix_search_space_roles_id", "search_space_roles", ["id"])
if "ix_search_space_roles_created_at" not in existing_indexes:
op.create_index(
"ix_search_space_roles_created_at", "search_space_roles", ["created_at"]
)
if "ix_search_space_roles_name" not in existing_indexes:
op.create_index("ix_search_space_roles_name", "search_space_roles", ["name"])
# Create indexes for search_space_memberships
existing_indexes = [
idx["name"] for idx in inspector.get_indexes("search_space_memberships")
]
if "ix_search_space_memberships_id" not in existing_indexes:
op.create_index(
"ix_search_space_memberships_id", "search_space_memberships", ["id"]
)
if "ix_search_space_memberships_created_at" not in existing_indexes:
op.create_index(
"ix_search_space_memberships_created_at",
"search_space_memberships",
["created_at"],
)
if "ix_search_space_memberships_user_id" not in existing_indexes:
op.create_index(
"ix_search_space_memberships_user_id",
"search_space_memberships",
["user_id"],
)
if "ix_search_space_memberships_search_space_id" not in existing_indexes:
op.create_index(
"ix_search_space_memberships_search_space_id",
"search_space_memberships",
["search_space_id"],
)
# Create indexes for search_space_invites
existing_indexes = [
idx["name"] for idx in inspector.get_indexes("search_space_invites")
]
if "ix_search_space_invites_id" not in existing_indexes:
op.create_index("ix_search_space_invites_id", "search_space_invites", ["id"])
if "ix_search_space_invites_created_at" not in existing_indexes:
op.create_index(
"ix_search_space_invites_created_at", "search_space_invites", ["created_at"]
)
if "ix_search_space_invites_invite_code" not in existing_indexes:
op.create_index(
"ix_search_space_invites_invite_code",
"search_space_invites",
["invite_code"],
)
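# The repeated check-then-create pattern above could be consolidated into a
# small helper (a sketch, not part of this migration):
def create_index_if_missing(
    inspector, table_name: str, index_name: str, columns: list[str]
) -> None:
    """Create an index only if the inspector does not already report it."""
    existing = [idx["name"] for idx in inspector.get_indexes(table_name)]
    if index_name not in existing:
        op.create_index(index_name, table_name, columns)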
def downgrade() -> None:
"""Downgrade schema - remove RBAC tables."""
# Drop indexes for search_space_memberships
op.drop_index(
"ix_search_space_memberships_search_space_id",
table_name="search_space_memberships",
)
op.drop_index(
"ix_search_space_memberships_user_id", table_name="search_space_memberships"
)
op.drop_index(
"ix_search_space_memberships_created_at", table_name="search_space_memberships"
)
op.drop_index(
"ix_search_space_memberships_id", table_name="search_space_memberships"
)
# Drop indexes for search_space_invites
op.drop_index(
"ix_search_space_invites_invite_code", table_name="search_space_invites"
)
op.drop_index(
"ix_search_space_invites_created_at", table_name="search_space_invites"
)
op.drop_index("ix_search_space_invites_id", table_name="search_space_invites")
# Drop indexes for search_space_roles
op.drop_index("ix_search_space_roles_name", table_name="search_space_roles")
op.drop_index("ix_search_space_roles_created_at", table_name="search_space_roles")
op.drop_index("ix_search_space_roles_id", table_name="search_space_roles")
# Drop tables in correct order (respecting foreign key constraints)
op.drop_table("search_space_memberships")
op.drop_table("search_space_invites")
op.drop_table("search_space_roles")

View file

@ -0,0 +1,63 @@
"""Move LLM preferences from user-level to search space level
Revision ID: 40
Revises: 39
Create Date: 2025-11-27
This migration moves LLM preferences (long_context_llm_id, fast_llm_id, strategic_llm_id)
from the user_search_space_preferences table to the searchspaces table itself.
This change supports the RBAC model where LLM preferences are shared by all members
of a search space, rather than being per-user.
"""
import sqlalchemy as sa
from alembic import op
# revision identifiers, used by Alembic.
revision = "40"
down_revision = "39"
branch_labels = None
depends_on = None
def upgrade():
# Add LLM preference columns to searchspaces table
op.add_column(
"searchspaces",
sa.Column("long_context_llm_id", sa.Integer(), nullable=True),
)
op.add_column(
"searchspaces",
sa.Column("fast_llm_id", sa.Integer(), nullable=True),
)
op.add_column(
"searchspaces",
sa.Column("strategic_llm_id", sa.Integer(), nullable=True),
)
# Migrate existing preferences from user_search_space_preferences to searchspaces
# We take the owner's preferences (the user who created the search space)
connection = op.get_bind()
# Get all search spaces and their owner's preferences
connection.execute(
sa.text("""
UPDATE searchspaces ss
SET
long_context_llm_id = usp.long_context_llm_id,
fast_llm_id = usp.fast_llm_id,
strategic_llm_id = usp.strategic_llm_id
FROM user_search_space_preferences usp
WHERE ss.id = usp.search_space_id
AND ss.user_id = usp.user_id
""")
)
def downgrade():
# Remove LLM preference columns from searchspaces table
op.drop_column("searchspaces", "strategic_llm_id")
op.drop_column("searchspaces", "fast_llm_id")
op.drop_column("searchspaces", "long_context_llm_id")

View file

@ -0,0 +1,212 @@
"""Backfill RBAC data for existing search spaces
Revision ID: 41
Revises: 40
Create Date: 2025-11-28
This migration creates default roles and owner memberships for all existing
search spaces that were created before the RBAC system was implemented.
"""
import sqlalchemy as sa
from alembic import op
# revision identifiers, used by Alembic.
revision = "41"
down_revision = "40"
branch_labels = None
depends_on = None
# Default role permissions (must match DEFAULT_ROLE_PERMISSIONS in db.py)
DEFAULT_ROLES = [
{
"name": "Owner",
"description": "Full access to all resources",
"permissions": ["*"],
"is_system_role": True,
"is_default": False,
},
{
"name": "Admin",
"description": "Can manage members, roles, and all content",
"permissions": [
"documents:create",
"documents:read",
"documents:update",
"documents:delete",
"chats:create",
"chats:read",
"chats:update",
"chats:delete",
"llm_configs:create",
"llm_configs:read",
"llm_configs:update",
"llm_configs:delete",
"logs:read",
"logs:delete",
"podcasts:create",
"podcasts:read",
"podcasts:update",
"podcasts:delete",
"connectors:create",
"connectors:read",
"connectors:update",
"connectors:delete",
"members:read",
"members:update",
"members:delete",
"roles:create",
"roles:read",
"roles:update",
"roles:delete",
"invites:create",
"invites:read",
"invites:delete",
"settings:read",
"settings:update",
],
"is_system_role": True,
"is_default": False,
},
{
"name": "Editor",
"description": "Can create and edit content",
"permissions": [
"documents:create",
"documents:read",
"documents:update",
"chats:create",
"chats:read",
"chats:update",
"llm_configs:read",
"logs:read",
"podcasts:create",
"podcasts:read",
"podcasts:update",
"connectors:create",
"connectors:read",
"connectors:update",
"members:read",
"roles:read",
],
"is_system_role": True,
"is_default": True,
},
{
"name": "Viewer",
"description": "Read-only access to content",
"permissions": [
"documents:read",
"chats:read",
"llm_configs:read",
"logs:read",
"podcasts:read",
"connectors:read",
"members:read",
"roles:read",
],
"is_system_role": True,
"is_default": False,
},
]
def upgrade():
connection = op.get_bind()
# Get all existing search spaces that don't have roles yet
search_spaces = connection.execute(
sa.text("""
SELECT ss.id, ss.user_id
FROM searchspaces ss
WHERE NOT EXISTS (
SELECT 1 FROM search_space_roles ssr
WHERE ssr.search_space_id = ss.id
)
""")
).fetchall()
for ss_id, owner_user_id in search_spaces:
owner_role_id = None
# Create default roles for each search space
for role in DEFAULT_ROLES:
# Convert permissions list to PostgreSQL array literal format for raw SQL
perms_literal = (
"ARRAY[" + ",".join(f"'{p}'" for p in role["permissions"]) + "]::TEXT[]"
)
result = connection.execute(
sa.text(f"""
INSERT INTO search_space_roles
(name, description, permissions, is_default, is_system_role, search_space_id)
VALUES (:name, :description, {perms_literal}, :is_default, :is_system_role, :search_space_id)
RETURNING id
"""),
{
"name": role["name"],
"description": role["description"],
"is_default": role["is_default"],
"is_system_role": role["is_system_role"],
"search_space_id": ss_id,
},
)
role_id = result.fetchone()[0]
# Keep track of Owner role ID
if role["name"] == "Owner":
owner_role_id = role_id
# Create owner membership for the search space creator
if owner_user_id and owner_role_id:
# Check if membership already exists
existing = connection.execute(
sa.text("""
SELECT 1 FROM search_space_memberships
WHERE user_id = :user_id AND search_space_id = :search_space_id
"""),
{"user_id": owner_user_id, "search_space_id": ss_id},
).fetchone()
if not existing:
connection.execute(
sa.text("""
INSERT INTO search_space_memberships
(user_id, search_space_id, role_id, is_owner)
VALUES (:user_id, :search_space_id, :role_id, TRUE)
"""),
{
"user_id": owner_user_id,
"search_space_id": ss_id,
"role_id": owner_role_id,
},
)
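# Aside: the ARRAY-literal interpolation in upgrade() is safe only because the
# permission strings are hardcoded constants in this file. A sketch of the same
# insert using a typed bind parameter instead (assumes the DBAPI in use accepts
# Python lists for ARRAY columns, as psycopg does):
def _insert_role_with_bound_array(connection, role: dict, ss_id: int):
    from sqlalchemy import bindparam
    from sqlalchemy.dialects.postgresql import ARRAY

    stmt = sa.text(
        """
        INSERT INTO search_space_roles
            (name, description, permissions, is_default, is_system_role, search_space_id)
        VALUES (:name, :description, :permissions, :is_default, :is_system_role, :search_space_id)
        RETURNING id
        """
    ).bindparams(bindparam("permissions", type_=ARRAY(sa.Text())))
    return connection.execute(
        stmt,
        {
            "name": role["name"],
            "description": role["description"],
            "permissions": role["permissions"],
            "is_default": role["is_default"],
            "is_system_role": role["is_system_role"],
            "search_space_id": ss_id,
        },
    )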
def downgrade():
# This migration only adds data, not schema changes
# Downgrade would remove all roles and memberships created by this migration
# However, this is destructive and may affect manually created data
# So we only remove system roles and owner memberships that were auto-created
connection = op.get_bind()
# Remove memberships where user is owner and role is system Owner role
connection.execute(
sa.text("""
DELETE FROM search_space_memberships ssm
USING search_space_roles ssr
WHERE ssm.role_id = ssr.id
AND ssm.is_owner = TRUE
AND ssr.is_system_role = TRUE
AND ssr.name = 'Owner'
""")
)
# Remove system roles
connection.execute(
sa.text("""
DELETE FROM search_space_roles
WHERE is_system_role = TRUE
""")
)

View file

@ -0,0 +1,52 @@
"""Drop user_search_space_preferences table
Revision ID: 42
Revises: 41
Create Date: 2025-11-28
This table is no longer needed after RBAC implementation:
- LLM preferences are now stored on SearchSpace directly
- User-SearchSpace relationships are handled by SearchSpaceMembership
"""
import sqlalchemy as sa
from alembic import op
# revision identifiers, used by Alembic.
revision = "42"
down_revision = "41"
branch_labels = None
depends_on = None
def upgrade():
# Drop the user_search_space_preferences table
op.drop_table("user_search_space_preferences")
def downgrade():
# Recreate the table if rolling back
op.create_table(
"user_search_space_preferences",
sa.Column("id", sa.Integer(), primary_key=True),
sa.Column(
"created_at", sa.DateTime(timezone=True), server_default=sa.func.now()
),
sa.Column(
"user_id",
sa.UUID(),
sa.ForeignKey("user.id", ondelete="CASCADE"),
nullable=False,
),
sa.Column(
"search_space_id",
sa.Integer(),
sa.ForeignKey("searchspaces.id", ondelete="CASCADE"),
nullable=False,
),
sa.Column("long_context_llm_id", sa.Integer(), nullable=True),
sa.Column("fast_llm_id", sa.Integer(), nullable=True),
sa.Column("strategic_llm_id", sa.Integer(), nullable=True),
sa.UniqueConstraint("user_id", "search_space_id", name="uq_user_searchspace"),
)

View file

@ -11,7 +11,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
# Additional imports for document fetching
from sqlalchemy.future import select
from app.db import Document
from app.services.connector_service import ConnectorService
from app.services.query_service import QueryService
@ -92,19 +92,18 @@ def extract_sources_from_documents(
async def fetch_documents_by_ids(
    document_ids: list[int], search_space_id: int, db_session: AsyncSession
) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
"""
    Fetch documents by their IDs within a search space.

    This function ensures that only documents belonging to the search space are fetched.
Similar to SearchMode.DOCUMENTS, it fetches full documents and concatenates their chunks.
Also creates source objects for UI display, grouped by document type.
Args:
document_ids: List of document IDs to fetch
        search_space_id: The search space ID to filter by
db_session: The database session
Returns:
@ -114,11 +113,12 @@ async def fetch_documents_by_ids(
return [], []
try:
        # Query documents filtered by search space
        result = await db_session.execute(
            select(Document).filter(
                Document.id.in_(document_ids),
                Document.search_space_id == search_space_id,
            )
        )
documents = result.scalars().all()
@ -515,7 +515,6 @@ async def fetch_documents_by_ids(
async def fetch_relevant_documents(
research_questions: list[str],
user_id: str,
search_space_id: int,
db_session: AsyncSession,
connectors_to_search: list[str],
@ -536,7 +535,6 @@ async def fetch_relevant_documents(
Args:
research_questions: List of research questions to find documents for
user_id: The user ID
search_space_id: The search space ID
db_session: The database session
connectors_to_search: List of connectors to search
@ -619,7 +617,6 @@ async def fetch_relevant_documents(
youtube_chunks,
) = await connector_service.search_youtube(
user_query=reformulated_query,
user_id=user_id,
search_space_id=search_space_id,
top_k=top_k,
search_mode=search_mode,
@ -646,7 +643,6 @@ async def fetch_relevant_documents(
extension_chunks,
) = await connector_service.search_extension(
user_query=reformulated_query,
user_id=user_id,
search_space_id=search_space_id,
top_k=top_k,
search_mode=search_mode,
@ -673,7 +669,6 @@ async def fetch_relevant_documents(
crawled_urls_chunks,
) = await connector_service.search_crawled_urls(
user_query=reformulated_query,
user_id=user_id,
search_space_id=search_space_id,
top_k=top_k,
search_mode=search_mode,
@ -689,7 +684,7 @@ async def fetch_relevant_documents(
writer(
{
"yield_value": streaming_service.format_terminal_info_delta(
f"🌐 Found {len(crawled_urls_chunks)} Web Pages chunks related to your query"
f"🌐 Found {len(crawled_urls_chunks)} Web Page chunks related to your query"
)
}
)
@ -697,7 +692,6 @@ async def fetch_relevant_documents(
elif connector == "FILE":
source_object, files_chunks = await connector_service.search_files(
user_query=reformulated_query,
user_id=user_id,
search_space_id=search_space_id,
top_k=top_k,
search_mode=search_mode,
@ -721,7 +715,6 @@ async def fetch_relevant_documents(
elif connector == "SLACK_CONNECTOR":
source_object, slack_chunks = await connector_service.search_slack(
user_query=reformulated_query,
user_id=user_id,
search_space_id=search_space_id,
top_k=top_k,
search_mode=search_mode,
@ -748,7 +741,6 @@ async def fetch_relevant_documents(
notion_chunks,
) = await connector_service.search_notion(
user_query=reformulated_query,
user_id=user_id,
search_space_id=search_space_id,
top_k=top_k,
search_mode=search_mode,
@ -775,7 +767,6 @@ async def fetch_relevant_documents(
github_chunks,
) = await connector_service.search_github(
user_query=reformulated_query,
user_id=user_id,
search_space_id=search_space_id,
top_k=top_k,
search_mode=search_mode,
@ -802,7 +793,6 @@ async def fetch_relevant_documents(
linear_chunks,
) = await connector_service.search_linear(
user_query=reformulated_query,
user_id=user_id,
search_space_id=search_space_id,
top_k=top_k,
search_mode=search_mode,
@ -829,7 +819,6 @@ async def fetch_relevant_documents(
tavily_chunks,
) = await connector_service.search_tavily(
user_query=reformulated_query,
user_id=user_id,
search_space_id=search_space_id,
top_k=top_k,
)
@ -855,7 +844,6 @@ async def fetch_relevant_documents(
searx_chunks,
) = await connector_service.search_searxng(
user_query=reformulated_query,
user_id=user_id,
search_space_id=search_space_id,
top_k=top_k,
)
@ -881,7 +869,6 @@ async def fetch_relevant_documents(
linkup_chunks,
) = await connector_service.search_linkup(
user_query=reformulated_query,
user_id=user_id,
search_space_id=search_space_id,
mode=linkup_mode,
)
@ -907,7 +894,6 @@ async def fetch_relevant_documents(
baidu_chunks,
) = await connector_service.search_baidu(
user_query=reformulated_query,
user_id=user_id,
search_space_id=search_space_id,
top_k=top_k,
)
@ -933,7 +919,6 @@ async def fetch_relevant_documents(
discord_chunks,
) = await connector_service.search_discord(
user_query=reformulated_query,
user_id=user_id,
search_space_id=search_space_id,
top_k=top_k,
search_mode=search_mode,
@ -955,7 +940,6 @@ async def fetch_relevant_documents(
elif connector == "JIRA_CONNECTOR":
source_object, jira_chunks = await connector_service.search_jira(
user_query=reformulated_query,
user_id=user_id,
search_space_id=search_space_id,
top_k=top_k,
search_mode=search_mode,
@ -981,7 +965,6 @@ async def fetch_relevant_documents(
calendar_chunks,
) = await connector_service.search_google_calendar(
user_query=reformulated_query,
user_id=user_id,
search_space_id=search_space_id,
top_k=top_k,
search_mode=search_mode,
@ -1007,7 +990,6 @@ async def fetch_relevant_documents(
airtable_chunks,
) = await connector_service.search_airtable(
user_query=reformulated_query,
user_id=user_id,
search_space_id=search_space_id,
top_k=top_k,
search_mode=search_mode,
@ -1033,7 +1015,6 @@ async def fetch_relevant_documents(
gmail_chunks,
) = await connector_service.search_google_gmail(
user_query=reformulated_query,
user_id=user_id,
search_space_id=search_space_id,
top_k=top_k,
search_mode=search_mode,
@ -1059,7 +1040,6 @@ async def fetch_relevant_documents(
confluence_chunks,
) = await connector_service.search_confluence(
user_query=reformulated_query,
user_id=user_id,
search_space_id=search_space_id,
top_k=top_k,
search_mode=search_mode,
@ -1085,7 +1065,6 @@ async def fetch_relevant_documents(
clickup_chunks,
) = await connector_service.search_clickup(
user_query=reformulated_query,
user_id=user_id,
search_space_id=search_space_id,
top_k=top_k,
search_mode=search_mode,
@ -1112,7 +1091,6 @@ async def fetch_relevant_documents(
luma_chunks,
) = await connector_service.search_luma(
user_query=reformulated_query,
user_id=user_id,
search_space_id=search_space_id,
top_k=top_k,
search_mode=search_mode,
@ -1139,7 +1117,6 @@ async def fetch_relevant_documents(
elasticsearch_chunks,
) = await connector_service.search_elasticsearch(
user_query=reformulated_query,
user_id=user_id,
search_space_id=search_space_id,
top_k=top_k,
search_mode=search_mode,
@ -1315,7 +1292,6 @@ async def reformulate_user_query(
reformulated_query = await QueryService.reformulate_query_with_chat_history(
user_query=user_query,
session=state.db_session,
user_id=configuration.user_id,
search_space_id=configuration.search_space_id,
chat_history_str=chat_history_str,
)
@ -1389,7 +1365,7 @@ async def handle_qna_workflow(
user_selected_documents,
) = await fetch_documents_by_ids(
document_ids=configuration.document_ids_to_add_in_context,
            search_space_id=configuration.search_space_id,
db_session=state.db_session,
)
@ -1404,7 +1380,7 @@ async def handle_qna_workflow(
# Create connector service using state db_session
connector_service = ConnectorService(
                state.db_session, search_space_id=configuration.search_space_id
)
await connector_service.initialize_counter()
@ -1413,7 +1389,6 @@ async def handle_qna_workflow(
relevant_documents = await fetch_relevant_documents(
research_questions=research_questions,
user_id=configuration.user_id,
search_space_id=configuration.search_space_id,
db_session=state.db_session,
connectors_to_search=configuration.connectors_to_search,
@ -1459,7 +1434,6 @@ async def handle_qna_workflow(
"user_query": user_query, # Use the reformulated query
"reformulated_query": reformulated_query,
"relevant_documents": all_documents, # Use combined documents
"user_id": configuration.user_id,
"search_space_id": configuration.search_space_id,
"language": configuration.language,
}
@ -1551,12 +1525,11 @@ async def generate_further_questions(
Returns:
Dict containing the further questions in the "further_questions" key for state update.
"""
    from app.services.llm_service import get_fast_llm
# Get configuration and state data
configuration = Configuration.from_runnable_config(config)
chat_history = state.chat_history
user_id = configuration.user_id
search_space_id = configuration.search_space_id
streaming_service = state.streaming_service
@ -1571,10 +1544,10 @@ async def generate_further_questions(
}
)
    # Get search space's fast LLM
    llm = await get_fast_llm(state.db_session, search_space_id)
if not llm:
error_message = f"No fast LLM configured for user {user_id} in search space {search_space_id}"
error_message = f"No fast LLM configured for search space {search_space_id}"
print(error_message)
writer({"yield_value": streaming_service.format_error(error_message)})

View file

@ -18,7 +18,6 @@ class Configuration:
relevant_documents: list[
Any
] # Documents provided directly to the agent for answering
user_id: str # User identifier
search_space_id: int # Search space identifier
language: str | None = None # Language for responses

View file

@ -17,7 +17,6 @@ You are SurfSense, an advanced AI research assistant that provides detailed, wel
{chat_history_section}
<knowledge_sources>
- EXTENSION: "Web content saved via SurfSense browser extension" (personal browsing history)
- CRAWLED_URL: "Webpages indexed by SurfSense web crawler" (personally selected websites)
- FILE: "User-uploaded documents (PDFs, Word, etc.)" (personal files)
- SLACK_CONNECTOR: "Slack conversations and shared content" (personal workspace communications)
- NOTION_CONNECTOR: "Notion workspace pages and databases" (personal knowledge management)
@ -35,6 +34,7 @@ You are SurfSense, an advanced AI research assistant that provides detailed, wel
- TAVILY_API: "Tavily search API results" (personalized search results)
- LINKUP_API: "Linkup search API results" (personalized search results)
- LUMA_CONNECTOR: "Luma events"
- WEBCRAWLER_CONNECTOR: "Webpages indexed by SurfSense" (personally selected websites)
</knowledge_sources>
<instructions>

View file

@ -142,13 +142,12 @@ async def answer_question(state: State, config: RunnableConfig) -> dict[str, Any
Returns:
Dict containing the final answer in the "final_answer" key.
"""
    from app.services.llm_service import get_fast_llm
# Get configuration and relevant documents from configuration
configuration = Configuration.from_runnable_config(config)
documents = state.reranked_documents
user_query = configuration.user_query
user_id = configuration.user_id
search_space_id = configuration.search_space_id
language = configuration.language
@ -178,10 +177,10 @@ async def answer_question(state: State, config: RunnableConfig) -> dict[str, Any
else ""
)
    # Get search space's fast LLM
    llm = await get_fast_llm(state.db_session, search_space_id)
if not llm:
error_message = f"No fast LLM configured for user {user_id} in search space {search_space_id}"
error_message = f"No fast LLM configured for search space {search_space_id}"
print(error_message)
raise RuntimeError(error_message)

View file

@ -19,7 +19,6 @@ def get_connector_emoji(connector_name: str) -> str:
connector_emojis = {
"YOUTUBE_VIDEO": "📹",
"EXTENSION": "🧩",
"CRAWLED_URL": "🌐",
"FILE": "📄",
"SLACK_CONNECTOR": "💬",
"NOTION_CONNECTOR": "📘",
@ -34,6 +33,7 @@ def get_connector_emoji(connector_name: str) -> str:
"AIRTABLE_CONNECTOR": "🗃️",
"LUMA_CONNECTOR": "",
"ELASTICSEARCH_CONNECTOR": "",
"WEBCRAWLER_CONNECTOR": "🌐",
}
return connector_emojis.get(connector_name, "🔎")
@ -43,7 +43,6 @@ def get_connector_friendly_name(connector_name: str) -> str:
connector_friendly_names = {
"YOUTUBE_VIDEO": "YouTube",
"EXTENSION": "Browser Extension",
"CRAWLED_URL": "Web Pages",
"FILE": "Files",
"SLACK_CONNECTOR": "Slack",
"NOTION_CONNECTOR": "Notion",
@ -59,6 +58,7 @@ def get_connector_friendly_name(connector_name: str) -> str:
"AIRTABLE_CONNECTOR": "Airtable",
"LUMA_CONNECTOR": "Luma",
"ELASTICSEARCH_CONNECTOR": "Elasticsearch",
"WEBCRAWLER_CONNECTOR": "Web Pages",
}
return connector_friendly_names.get(connector_name, connector_name)

View file

@ -208,9 +208,6 @@ class Config:
# LlamaCloud API Key
LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
# Firecrawl API Key
FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY", None)
# Litellm TTS Configuration
TTS_SERVICE = os.getenv("TTS_SERVICE")
TTS_SERVICE_API_BASE = os.getenv("TTS_SERVICE_API_BASE")

View file

@ -0,0 +1,188 @@
"""
WebCrawler Connector Module
A module for crawling web pages and extracting content using Firecrawl or AsyncChromiumLoader.
Provides a unified interface for web scraping.
"""
from typing import Any
import validators
from firecrawl import AsyncFirecrawlApp
from langchain_community.document_loaders import AsyncChromiumLoader
class WebCrawlerConnector:
"""Class for crawling web pages and extracting content."""
def __init__(self, firecrawl_api_key: str | None = None):
"""
Initialize the WebCrawlerConnector class.
Args:
firecrawl_api_key: Firecrawl API key (optional, will use AsyncChromiumLoader if not provided)
"""
self.firecrawl_api_key = firecrawl_api_key
self.use_firecrawl = bool(firecrawl_api_key)
def set_api_key(self, api_key: str) -> None:
"""
Set the Firecrawl API key and enable Firecrawl usage.
Args:
api_key: Firecrawl API key
"""
self.firecrawl_api_key = api_key
self.use_firecrawl = True
async def crawl_url(
self, url: str, formats: list[str] | None = None
) -> tuple[dict[str, Any] | None, str | None]:
"""
Crawl a single URL and extract its content.
Args:
url: URL to crawl
formats: List of formats to extract (e.g., ["markdown", "html"]) - only for Firecrawl
Returns:
Tuple containing (crawl result dict, error message or None)
Result dict contains:
- content: Extracted content (markdown or HTML)
- metadata: Page metadata (title, description, etc.)
- source: Original URL
- crawler_type: Type of crawler used
"""
try:
# Validate URL
if not validators.url(url):
return None, f"Invalid URL: {url}"
if self.use_firecrawl:
result = await self._crawl_with_firecrawl(url, formats)
else:
result = await self._crawl_with_chromium(url)
return result, None
except Exception as e:
return None, f"Error crawling URL {url}: {e!s}"
async def _crawl_with_firecrawl(
self, url: str, formats: list[str] | None = None
) -> dict[str, Any]:
"""
Crawl URL using Firecrawl.
Args:
url: URL to crawl
formats: List of formats to extract
Returns:
Dict containing crawled content and metadata
Raises:
ValueError: If Firecrawl scraping fails
"""
if not self.firecrawl_api_key:
raise ValueError("Firecrawl API key not set. Call set_api_key() first.")
firecrawl_app = AsyncFirecrawlApp(api_key=self.firecrawl_api_key)
# Default to markdown format
if formats is None:
formats = ["markdown"]
# v2 API returns Document directly and raises an exception on failure
scrape_result = await firecrawl_app.scrape(url, formats=formats)
if not scrape_result:
raise ValueError("Firecrawl returned no result")
# Extract content based on format
content = scrape_result.markdown or scrape_result.html or ""
# Extract metadata - v2 returns DocumentMetadata object
metadata_obj = scrape_result.metadata
metadata = metadata_obj.model_dump() if metadata_obj else {}
return {
"content": content,
"metadata": {
"source": url,
"title": metadata.get("title", url),
"description": metadata.get("description", ""),
"language": metadata.get("language", ""),
"sourceURL": metadata.get("source_url", url),
**metadata,
},
"crawler_type": "firecrawl",
}
async def _crawl_with_chromium(self, url: str) -> dict[str, Any]:
"""
Crawl URL using AsyncChromiumLoader.
Args:
url: URL to crawl
Returns:
Dict containing crawled content and metadata
Raises:
Exception: If crawling fails
"""
crawl_loader = AsyncChromiumLoader(urls=[url], headless=True)
documents = await crawl_loader.aload()
if not documents:
raise ValueError(f"Failed to load content from {url}")
doc = documents[0]
# Extract basic metadata from the document
metadata = doc.metadata if doc.metadata else {}
return {
"content": doc.page_content,
"metadata": {
"source": url,
"title": metadata.get("title", url),
**metadata,
},
"crawler_type": "chromium",
}
def format_to_structured_document(self, crawl_result: dict[str, Any]) -> str:
"""
Format crawl result as a structured document.
Args:
crawl_result: Result from crawl_url method
Returns:
Structured document string
"""
metadata = crawl_result["metadata"]
content = crawl_result["content"]
document_parts = ["<DOCUMENT>", "<METADATA>"]
# Add all metadata fields
for key, value in metadata.items():
document_parts.append(f"{key.upper()}: {value}")
document_parts.extend(
[
"</METADATA>",
"<CONTENT>",
"FORMAT: markdown",
"TEXT_START",
content,
"TEXT_END",
"</CONTENT>",
"</DOCUMENT>",
]
)
return "\n".join(document_parts)

View file

@ -73,6 +73,7 @@ class SearchSourceConnectorType(str, Enum):
AIRTABLE_CONNECTOR = "AIRTABLE_CONNECTOR"
LUMA_CONNECTOR = "LUMA_CONNECTOR"
ELASTICSEARCH_CONNECTOR = "ELASTICSEARCH_CONNECTOR"
WEBCRAWLER_CONNECTOR = "WEBCRAWLER_CONNECTOR"
class ChatType(str, Enum):
@ -130,6 +131,169 @@ class LogStatus(str, Enum):
FAILED = "FAILED"
class Permission(str, Enum):
"""
Granular permissions for search space resources.
Use '*' (FULL_ACCESS) to grant all permissions.
"""
# Documents
DOCUMENTS_CREATE = "documents:create"
DOCUMENTS_READ = "documents:read"
DOCUMENTS_UPDATE = "documents:update"
DOCUMENTS_DELETE = "documents:delete"
# Chats
CHATS_CREATE = "chats:create"
CHATS_READ = "chats:read"
CHATS_UPDATE = "chats:update"
CHATS_DELETE = "chats:delete"
# LLM Configs
LLM_CONFIGS_CREATE = "llm_configs:create"
LLM_CONFIGS_READ = "llm_configs:read"
LLM_CONFIGS_UPDATE = "llm_configs:update"
LLM_CONFIGS_DELETE = "llm_configs:delete"
# Podcasts
PODCASTS_CREATE = "podcasts:create"
PODCASTS_READ = "podcasts:read"
PODCASTS_UPDATE = "podcasts:update"
PODCASTS_DELETE = "podcasts:delete"
# Connectors
CONNECTORS_CREATE = "connectors:create"
CONNECTORS_READ = "connectors:read"
CONNECTORS_UPDATE = "connectors:update"
CONNECTORS_DELETE = "connectors:delete"
# Logs
LOGS_READ = "logs:read"
LOGS_DELETE = "logs:delete"
# Members
MEMBERS_INVITE = "members:invite"
MEMBERS_VIEW = "members:view"
MEMBERS_REMOVE = "members:remove"
MEMBERS_MANAGE_ROLES = "members:manage_roles"
# Roles
ROLES_CREATE = "roles:create"
ROLES_READ = "roles:read"
ROLES_UPDATE = "roles:update"
ROLES_DELETE = "roles:delete"
# Search Space Settings
SETTINGS_VIEW = "settings:view"
SETTINGS_UPDATE = "settings:update"
SETTINGS_DELETE = "settings:delete" # Delete the entire search space
# Full access wildcard
FULL_ACCESS = "*"
# Predefined role permission sets for convenience
DEFAULT_ROLE_PERMISSIONS = {
"Owner": [Permission.FULL_ACCESS.value],
"Admin": [
# Documents
Permission.DOCUMENTS_CREATE.value,
Permission.DOCUMENTS_READ.value,
Permission.DOCUMENTS_UPDATE.value,
Permission.DOCUMENTS_DELETE.value,
# Chats
Permission.CHATS_CREATE.value,
Permission.CHATS_READ.value,
Permission.CHATS_UPDATE.value,
Permission.CHATS_DELETE.value,
# LLM Configs
Permission.LLM_CONFIGS_CREATE.value,
Permission.LLM_CONFIGS_READ.value,
Permission.LLM_CONFIGS_UPDATE.value,
Permission.LLM_CONFIGS_DELETE.value,
# Podcasts
Permission.PODCASTS_CREATE.value,
Permission.PODCASTS_READ.value,
Permission.PODCASTS_UPDATE.value,
Permission.PODCASTS_DELETE.value,
# Connectors
Permission.CONNECTORS_CREATE.value,
Permission.CONNECTORS_READ.value,
Permission.CONNECTORS_UPDATE.value,
Permission.CONNECTORS_DELETE.value,
# Logs
Permission.LOGS_READ.value,
Permission.LOGS_DELETE.value,
# Members
Permission.MEMBERS_INVITE.value,
Permission.MEMBERS_VIEW.value,
Permission.MEMBERS_REMOVE.value,
Permission.MEMBERS_MANAGE_ROLES.value,
# Roles
Permission.ROLES_CREATE.value,
Permission.ROLES_READ.value,
Permission.ROLES_UPDATE.value,
Permission.ROLES_DELETE.value,
# Settings (no delete)
Permission.SETTINGS_VIEW.value,
Permission.SETTINGS_UPDATE.value,
],
"Editor": [
# Documents
Permission.DOCUMENTS_CREATE.value,
Permission.DOCUMENTS_READ.value,
Permission.DOCUMENTS_UPDATE.value,
Permission.DOCUMENTS_DELETE.value,
# Chats
Permission.CHATS_CREATE.value,
Permission.CHATS_READ.value,
Permission.CHATS_UPDATE.value,
Permission.CHATS_DELETE.value,
        # LLM Configs (create, read, update; no delete)
Permission.LLM_CONFIGS_READ.value,
Permission.LLM_CONFIGS_CREATE.value,
Permission.LLM_CONFIGS_UPDATE.value,
# Podcasts
Permission.PODCASTS_CREATE.value,
Permission.PODCASTS_READ.value,
Permission.PODCASTS_UPDATE.value,
Permission.PODCASTS_DELETE.value,
        # Connectors (create, read, update; no delete)
Permission.CONNECTORS_CREATE.value,
Permission.CONNECTORS_READ.value,
Permission.CONNECTORS_UPDATE.value,
# Logs
Permission.LOGS_READ.value,
# Members (view only)
Permission.MEMBERS_VIEW.value,
# Roles (read only)
Permission.ROLES_READ.value,
# Settings (view only)
Permission.SETTINGS_VIEW.value,
],
"Viewer": [
# Documents (read only)
Permission.DOCUMENTS_READ.value,
# Chats (read only)
Permission.CHATS_READ.value,
# LLM Configs (read only)
Permission.LLM_CONFIGS_READ.value,
# Podcasts (read only)
Permission.PODCASTS_READ.value,
# Connectors (read only)
Permission.CONNECTORS_READ.value,
# Logs (read only)
Permission.LOGS_READ.value,
# Members (view only)
Permission.MEMBERS_VIEW.value,
# Roles (read only)
Permission.ROLES_READ.value,
# Settings (view only)
Permission.SETTINGS_VIEW.value,
],
}
class Base(DeclarativeBase):
pass
@ -240,6 +404,13 @@ class SearchSpace(BaseModel, TimestampMixin):
qna_custom_instructions = Column(
Text, nullable=True, default=""
) # User's custom instructions
# Search space-level LLM preferences (shared by all members)
# Note: These can be negative IDs for global configs (from YAML) or positive IDs for custom configs (from DB)
long_context_llm_id = Column(Integer, nullable=True)
fast_llm_id = Column(Integer, nullable=True)
strategic_llm_id = Column(Integer, nullable=True)
user_id = Column(
UUID(as_uuid=True), ForeignKey("user.id", ondelete="CASCADE"), nullable=False
)
@ -281,9 +452,24 @@ class SearchSpace(BaseModel, TimestampMixin):
order_by="LLMConfig.id",
cascade="all, delete-orphan",
)
    # RBAC relationships
    roles = relationship(
        "SearchSpaceRole",
back_populates="search_space",
order_by="SearchSpaceRole.id",
cascade="all, delete-orphan",
)
memberships = relationship(
"SearchSpaceMembership",
back_populates="search_space",
order_by="SearchSpaceMembership.id",
cascade="all, delete-orphan",
)
invites = relationship(
"SearchSpaceInvite",
back_populates="search_space",
order_by="SearchSpaceInvite.id",
cascade="all, delete-orphan",
)
@ -347,45 +533,6 @@ class LLMConfig(BaseModel, TimestampMixin):
search_space = relationship("SearchSpace", back_populates="llm_configs")
class UserSearchSpacePreference(BaseModel, TimestampMixin):
__tablename__ = "user_search_space_preferences"
__table_args__ = (
UniqueConstraint(
"user_id",
"search_space_id",
name="uq_user_searchspace",
),
)
user_id = Column(
UUID(as_uuid=True), ForeignKey("user.id", ondelete="CASCADE"), nullable=False
)
search_space_id = Column(
Integer, ForeignKey("searchspaces.id", ondelete="CASCADE"), nullable=False
)
# User-specific LLM preferences for this search space
# Note: These can be negative IDs for global configs (from YAML) or positive IDs for custom configs (from DB)
# Foreign keys removed to support global configs with negative IDs
long_context_llm_id = Column(Integer, nullable=True)
fast_llm_id = Column(Integer, nullable=True)
strategic_llm_id = Column(Integer, nullable=True)
# Future RBAC fields can be added here
# role = Column(String(50), nullable=True) # e.g., 'owner', 'editor', 'viewer'
# permissions = Column(JSON, nullable=True)
user = relationship("User", back_populates="search_space_preferences")
search_space = relationship("SearchSpace", back_populates="user_preferences")
# Note: Relationships removed because foreign keys no longer exist
# Global configs (negative IDs) don't exist in llm_configs table
# Application code manually fetches configs when needed
# long_context_llm = relationship("LLMConfig", foreign_keys=[long_context_llm_id], post_update=True)
# fast_llm = relationship("LLMConfig", foreign_keys=[fast_llm_id], post_update=True)
# strategic_llm = relationship("LLMConfig", foreign_keys=[strategic_llm_id], post_update=True)
class Log(BaseModel, TimestampMixin):
__tablename__ = "logs"
@ -403,6 +550,140 @@ class Log(BaseModel, TimestampMixin):
search_space = relationship("SearchSpace", back_populates="logs")
class SearchSpaceRole(BaseModel, TimestampMixin):
"""
Custom roles that can be defined per search space.
Each search space can have multiple roles with different permission sets.
"""
__tablename__ = "search_space_roles"
__table_args__ = (
UniqueConstraint(
"search_space_id",
"name",
name="uq_searchspace_role_name",
),
)
name = Column(String(100), nullable=False, index=True)
description = Column(String(500), nullable=True)
# List of Permission enum values (e.g., ["documents:read", "chats:create"])
permissions = Column(ARRAY(String), nullable=False, default=[])
# Whether this role is assigned to new members by default when they join via invite
is_default = Column(Boolean, nullable=False, default=False)
# System roles (Owner, Admin, Editor, Viewer) cannot be deleted
is_system_role = Column(Boolean, nullable=False, default=False)
search_space_id = Column(
Integer, ForeignKey("searchspaces.id", ondelete="CASCADE"), nullable=False
)
search_space = relationship("SearchSpace", back_populates="roles")
memberships = relationship(
"SearchSpaceMembership", back_populates="role", passive_deletes=True
)
invites = relationship(
"SearchSpaceInvite", back_populates="role", passive_deletes=True
)
class SearchSpaceMembership(BaseModel, TimestampMixin):
"""
Tracks user membership in search spaces with their assigned role.
Each user can be a member of multiple search spaces with different roles.
"""
__tablename__ = "search_space_memberships"
__table_args__ = (
UniqueConstraint(
"user_id",
"search_space_id",
name="uq_user_searchspace_membership",
),
)
user_id = Column(
UUID(as_uuid=True), ForeignKey("user.id", ondelete="CASCADE"), nullable=False
)
search_space_id = Column(
Integer, ForeignKey("searchspaces.id", ondelete="CASCADE"), nullable=False
)
role_id = Column(
Integer,
ForeignKey("search_space_roles.id", ondelete="SET NULL"),
nullable=True,
)
# Indicates if this user is the original creator/owner of the search space
is_owner = Column(Boolean, nullable=False, default=False)
# Timestamp when the user joined (via invite or as creator)
joined_at = Column(
TIMESTAMP(timezone=True),
nullable=False,
default=lambda: datetime.now(UTC),
)
# Reference to the invite used to join (null if owner/creator)
invited_by_invite_id = Column(
Integer,
ForeignKey("search_space_invites.id", ondelete="SET NULL"),
nullable=True,
)
user = relationship("User", back_populates="search_space_memberships")
search_space = relationship("SearchSpace", back_populates="memberships")
role = relationship("SearchSpaceRole", back_populates="memberships")
invited_by_invite = relationship(
"SearchSpaceInvite", back_populates="used_by_memberships"
)
class SearchSpaceInvite(BaseModel, TimestampMixin):
"""
Invite links for search spaces.
Users can create invite links with specific roles that others can use to join.
"""
__tablename__ = "search_space_invites"
# Unique invite code (used in invite URLs)
invite_code = Column(String(64), nullable=False, unique=True, index=True)
search_space_id = Column(
Integer, ForeignKey("searchspaces.id", ondelete="CASCADE"), nullable=False
)
# Role to assign when invite is used (null means use default role)
role_id = Column(
Integer,
ForeignKey("search_space_roles.id", ondelete="SET NULL"),
nullable=True,
)
# User who created this invite
created_by_id = Column(
UUID(as_uuid=True),
ForeignKey("user.id", ondelete="SET NULL"),
nullable=True,
)
# Expiration timestamp (null means never expires)
expires_at = Column(TIMESTAMP(timezone=True), nullable=True)
# Maximum number of times this invite can be used (null means unlimited)
max_uses = Column(Integer, nullable=True)
# Number of times this invite has been used
uses_count = Column(Integer, nullable=False, default=0)
# Whether this invite is currently active
is_active = Column(Boolean, nullable=False, default=True)
# Optional custom name/label for the invite
name = Column(String(100), nullable=True)
search_space = relationship("SearchSpace", back_populates="invites")
role = relationship("SearchSpaceRole", back_populates="invites")
created_by = relationship("User", back_populates="created_invites")
used_by_memberships = relationship(
"SearchSpaceMembership",
back_populates="invited_by_invite",
passive_deletes=True,
)
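# The invite fields above imply a validity check along these lines (a sketch,
# not part of the model; it just encodes the rules documented in the comments):
def _invite_is_usable(invite: "SearchSpaceInvite") -> bool:
    # Deactivated invites are never usable.
    if not invite.is_active:
        return False
    # expires_at is null for invites that never expire.
    if invite.expires_at is not None and invite.expires_at <= datetime.now(UTC):
        return False
    # max_uses is null for unlimited invites.
    if invite.max_uses is not None and invite.uses_count >= invite.max_uses:
        return False
    return True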
if config.AUTH_TYPE == "GOOGLE":
class OAuthAccount(SQLAlchemyBaseOAuthAccountTableUUID, Base):
@ -413,11 +694,18 @@ if config.AUTH_TYPE == "GOOGLE":
"OAuthAccount", lazy="joined"
)
search_spaces = relationship("SearchSpace", back_populates="user")
        # RBAC relationships
        search_space_memberships = relationship(
            "SearchSpaceMembership",
back_populates="user",
cascade="all, delete-orphan",
)
created_invites = relationship(
"SearchSpaceInvite",
back_populates="created_by",
passive_deletes=True,
)
# Page usage tracking for ETL services
pages_limit = Column(Integer, nullable=False, default=500, server_default="500")
@ -427,11 +715,18 @@ else:
class User(SQLAlchemyBaseUserTableUUID, Base):
search_spaces = relationship("SearchSpace", back_populates="user")
        # RBAC relationships
        search_space_memberships = relationship(
            "SearchSpaceMembership",
back_populates="user",
cascade="all, delete-orphan",
)
created_invites = relationship(
"SearchSpaceInvite",
back_populates="created_by",
passive_deletes=True,
)
# Page usage tracking for ETL services
pages_limit = Column(Integer, nullable=False, default=500, server_default="500")
@ -502,3 +797,109 @@ async def get_documents_hybrid_search_retriever(
session: AsyncSession = Depends(get_async_session),
):
return DocumentHybridSearchRetriever(session)
def has_permission(user_permissions: list[str], required_permission: str) -> bool:
"""
Check if the user has the required permission.
Supports wildcard (*) for full access.
Args:
user_permissions: List of permission strings the user has
required_permission: The permission string to check for
Returns:
True if user has the permission, False otherwise
"""
if not user_permissions:
return False
# Full access wildcard grants all permissions
if Permission.FULL_ACCESS.value in user_permissions:
return True
return required_permission in user_permissions
def has_any_permission(
user_permissions: list[str], required_permissions: list[str]
) -> bool:
"""
Check if the user has any of the required permissions.
Args:
user_permissions: List of permission strings the user has
required_permissions: List of permission strings to check for (any match)
Returns:
True if user has at least one of the permissions, False otherwise
"""
if not user_permissions:
return False
if Permission.FULL_ACCESS.value in user_permissions:
return True
return any(perm in user_permissions for perm in required_permissions)
def has_all_permissions(
user_permissions: list[str], required_permissions: list[str]
) -> bool:
"""
Check if the user has all of the required permissions.
Args:
user_permissions: List of permission strings the user has
required_permissions: List of permission strings to check for (all must match)
Returns:
True if user has all of the permissions, False otherwise
"""
if not user_permissions:
return False
if Permission.FULL_ACCESS.value in user_permissions:
return True
return all(perm in user_permissions for perm in required_permissions)
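# A quick illustration of the check semantics (illustrative only, not called
# anywhere): the wildcard grants everything; otherwise matching is by exact
# permission string.
def _permission_check_examples() -> None:
    owner_perms = [Permission.FULL_ACCESS.value]
    viewer_perms = DEFAULT_ROLE_PERMISSIONS["Viewer"]

    assert has_permission(owner_perms, Permission.DOCUMENTS_DELETE.value)
    assert has_permission(viewer_perms, Permission.DOCUMENTS_READ.value)
    assert not has_permission(viewer_perms, Permission.DOCUMENTS_DELETE.value)
    assert has_any_permission(
        viewer_perms,
        [Permission.DOCUMENTS_READ.value, Permission.DOCUMENTS_DELETE.value],
    )
    assert not has_all_permissions(
        viewer_perms,
        [Permission.DOCUMENTS_READ.value, Permission.DOCUMENTS_DELETE.value],
    )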
def get_default_roles_config() -> list[dict]:
"""
Get the configuration for default system roles.
These roles are created automatically when a search space is created.
Returns:
List of role configurations with name, description, permissions, and flags
"""
return [
{
"name": "Owner",
"description": "Full access to all search space resources and settings",
"permissions": DEFAULT_ROLE_PERMISSIONS["Owner"],
"is_default": False,
"is_system_role": True,
},
{
"name": "Admin",
"description": "Can manage most resources except deleting the search space",
"permissions": DEFAULT_ROLE_PERMISSIONS["Admin"],
"is_default": False,
"is_system_role": True,
},
{
"name": "Editor",
"description": "Can create and edit documents, chats, and podcasts",
"permissions": DEFAULT_ROLE_PERMISSIONS["Editor"],
"is_default": True, # Default role for new members via invite
"is_system_role": True,
},
{
"name": "Viewer",
"description": "Read-only access to search space resources",
"permissions": DEFAULT_ROLE_PERMISSIONS["Viewer"],
"is_default": False,
"is_system_role": True,
},
]
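
`DEFAULT_ROLE_PERMISSIONS` is referenced above but defined elsewhere in the codebase. A purely illustrative shape, assuming string permissions of the form `resource:action`:

```python
# Illustrative only; the real mapping is not part of this diff.
DEFAULT_ROLE_PERMISSIONS: dict[str, list[str]] = {
    "Owner": ["*"],  # full-access wildcard
    "Admin": [
        "documents:create", "documents:read", "documents:update",
        "documents:delete", "chats:create", "chats:read",
        "connectors:create", "settings:update",
    ],
    "Editor": [
        "documents:create", "documents:read", "documents:update",
        "chats:create", "chats:read", "podcasts:create",
    ],
    "Viewer": ["documents:read", "chats:read", "podcasts:read", "logs:read"],
}
```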

View file

@ -12,8 +12,7 @@ class ChucksHybridSearchRetriever:
self,
query_text: str,
top_k: int,
user_id: str,
search_space_id: int | None = None,
search_space_id: int,
) -> list:
"""
Perform vector similarity search on chunks.
@ -21,8 +20,7 @@ class ChucksHybridSearchRetriever:
Args:
query_text: The search query text
top_k: Number of results to return
user_id: The ID of the user performing the search
search_space_id: Optional search space ID to filter results
search_space_id: The search space ID to search within
Returns:
List of chunks sorted by vector similarity
@ -31,25 +29,20 @@ class ChucksHybridSearchRetriever:
from sqlalchemy.orm import joinedload
from app.config import config
from app.db import Chunk, Document, SearchSpace
from app.db import Chunk, Document
# Get embedding for the query
embedding_model = config.embedding_model_instance
query_embedding = embedding_model.embed(query_text)
# Build the base query with user ownership check
# Build the query filtered by search space
query = (
select(Chunk)
.options(joinedload(Chunk.document).joinedload(Document.search_space))
.join(Document, Chunk.document_id == Document.id)
.join(SearchSpace, Document.search_space_id == SearchSpace.id)
.where(SearchSpace.user_id == user_id)
.where(Document.search_space_id == search_space_id)
)
# Add search space filter if provided
if search_space_id is not None:
query = query.where(Document.search_space_id == search_space_id)
# Add vector similarity ordering
query = query.order_by(Chunk.embedding.op("<=>")(query_embedding)).limit(top_k)
@ -63,8 +56,7 @@ class ChucksHybridSearchRetriever:
self,
query_text: str,
top_k: int,
user_id: str,
search_space_id: int | None = None,
search_space_id: int,
) -> list:
"""
Perform full-text keyword search on chunks.
@ -72,8 +64,7 @@ class ChucksHybridSearchRetriever:
Args:
query_text: The search query text
top_k: Number of results to return
user_id: The ID of the user performing the search
search_space_id: Optional search space ID to filter results
search_space_id: The search space ID to search within
Returns:
List of chunks sorted by text relevance
@ -81,28 +72,23 @@ class ChucksHybridSearchRetriever:
from sqlalchemy import func, select
from sqlalchemy.orm import joinedload
from app.db import Chunk, Document, SearchSpace
from app.db import Chunk, Document
# Create tsvector and tsquery for PostgreSQL full-text search
tsvector = func.to_tsvector("english", Chunk.content)
tsquery = func.plainto_tsquery("english", query_text)
# Build the base query with user ownership check
# Build the query filtered by search space
query = (
select(Chunk)
.options(joinedload(Chunk.document).joinedload(Document.search_space))
.join(Document, Chunk.document_id == Document.id)
.join(SearchSpace, Document.search_space_id == SearchSpace.id)
.where(SearchSpace.user_id == user_id)
.where(Document.search_space_id == search_space_id)
.where(
tsvector.op("@@")(tsquery)
) # Only include results that match the query
)
# Add search space filter if provided
if search_space_id is not None:
query = query.where(Document.search_space_id == search_space_id)
# Add text search ranking
query = query.order_by(func.ts_rank_cd(tsvector, tsquery).desc()).limit(top_k)
@ -116,8 +102,7 @@ class ChucksHybridSearchRetriever:
self,
query_text: str,
top_k: int,
user_id: str,
search_space_id: int | None = None,
search_space_id: int,
document_type: str | None = None,
) -> list:
"""
@ -126,8 +111,7 @@ class ChucksHybridSearchRetriever:
Args:
query_text: The search query text
top_k: Number of results to return
user_id: The ID of the user performing the search
search_space_id: Optional search space ID to filter results
search_space_id: The search space ID to search within
document_type: Optional document type to filter results (e.g., "FILE", "CRAWLED_URL")
Returns:
@ -137,7 +121,7 @@ class ChucksHybridSearchRetriever:
from sqlalchemy.orm import joinedload
from app.config import config
from app.db import Chunk, Document, DocumentType, SearchSpace
from app.db import Chunk, Document, DocumentType
# Get embedding for the query
embedding_model = config.embedding_model_instance
@ -151,12 +135,8 @@ class ChucksHybridSearchRetriever:
tsvector = func.to_tsvector("english", Chunk.content)
tsquery = func.plainto_tsquery("english", query_text)
# Base conditions for document filtering
base_conditions = [SearchSpace.user_id == user_id]
# Add search space filter if provided
if search_space_id is not None:
base_conditions.append(Document.search_space_id == search_space_id)
# Base conditions for chunk filtering - search space is required
base_conditions = [Document.search_space_id == search_space_id]
# Add document type filter if provided
if document_type is not None:
@ -171,7 +151,7 @@ class ChucksHybridSearchRetriever:
else:
base_conditions.append(Document.document_type == document_type)
# CTE for semantic search with user ownership check
# CTE for semantic search filtered by search space
semantic_search_cte = (
select(
Chunk.id,
@ -180,7 +160,6 @@ class ChucksHybridSearchRetriever:
.label("rank"),
)
.join(Document, Chunk.document_id == Document.id)
.join(SearchSpace, Document.search_space_id == SearchSpace.id)
.where(*base_conditions)
)
@ -190,7 +169,7 @@ class ChucksHybridSearchRetriever:
.cte("semantic_search")
)
# CTE for keyword search with user ownership check
# CTE for keyword search filtered by search space
keyword_search_cte = (
select(
Chunk.id,
@ -199,7 +178,6 @@ class ChucksHybridSearchRetriever:
.label("rank"),
)
.join(Document, Chunk.document_id == Document.id)
.join(SearchSpace, Document.search_space_id == SearchSpace.id)
.where(*base_conditions)
.where(tsvector.op("@@")(tsquery))
)
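
Both CTE hunks above end before the semantic and keyword rankings are fused; whether the code uses reciprocal rank fusion or another combiner is not visible here. As a standalone sketch of RRF, the usual way such ranked lists are merged (the `k=60` constant is a common default, not a value from this codebase):

```python
def rrf_fuse(semantic_ids: list[int], keyword_ids: list[int], k: int = 60) -> list[int]:
    """Merge two ranked id lists with reciprocal rank fusion."""
    scores: dict[int, float] = {}
    for ranked in (semantic_ids, keyword_ids):
        for rank, chunk_id in enumerate(ranked, start=1):
            # Each list contributes 1 / (k + rank) for every id it ranks.
            scores[chunk_id] = scores.get(chunk_id, 0.0) + 1.0 / (k + rank)
    return sorted(scores, key=scores.get, reverse=True)

rrf_fuse([3, 1, 7], [1, 9, 3])  # -> [1, 3, 9, 7]
```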

View file

@ -12,8 +12,7 @@ class DocumentHybridSearchRetriever:
self,
query_text: str,
top_k: int,
user_id: str,
search_space_id: int | None = None,
search_space_id: int,
) -> list:
"""
Perform vector similarity search on documents.
@ -21,8 +20,7 @@ class DocumentHybridSearchRetriever:
Args:
query_text: The search query text
top_k: Number of results to return
user_id: The ID of the user performing the search
search_space_id: Optional search space ID to filter results
search_space_id: The search space ID to search within
Returns:
List of documents sorted by vector similarity
@ -31,24 +29,19 @@ class DocumentHybridSearchRetriever:
from sqlalchemy.orm import joinedload
from app.config import config
from app.db import Document, SearchSpace
from app.db import Document
# Get embedding for the query
embedding_model = config.embedding_model_instance
query_embedding = embedding_model.embed(query_text)
# Build the base query with user ownership check
# Build the query filtered by search space
query = (
select(Document)
.options(joinedload(Document.search_space))
.join(SearchSpace, Document.search_space_id == SearchSpace.id)
.where(SearchSpace.user_id == user_id)
.where(Document.search_space_id == search_space_id)
)
# Add search space filter if provided
if search_space_id is not None:
query = query.where(Document.search_space_id == search_space_id)
# Add vector similarity ordering
query = query.order_by(Document.embedding.op("<=>")(query_embedding)).limit(
top_k
@ -64,8 +57,7 @@ class DocumentHybridSearchRetriever:
self,
query_text: str,
top_k: int,
user_id: str,
search_space_id: int | None = None,
search_space_id: int,
) -> list:
"""
Perform full-text keyword search on documents.
@ -73,8 +65,7 @@ class DocumentHybridSearchRetriever:
Args:
query_text: The search query text
top_k: Number of results to return
user_id: The ID of the user performing the search
search_space_id: Optional search space ID to filter results
search_space_id: The search space ID to search within
Returns:
List of documents sorted by text relevance
@ -82,27 +73,22 @@ class DocumentHybridSearchRetriever:
from sqlalchemy import func, select
from sqlalchemy.orm import joinedload
from app.db import Document, SearchSpace
from app.db import Document
# Create tsvector and tsquery for PostgreSQL full-text search
tsvector = func.to_tsvector("english", Document.content)
tsquery = func.plainto_tsquery("english", query_text)
# Build the base query with user ownership check
# Build the query filtered by search space
query = (
select(Document)
.options(joinedload(Document.search_space))
.join(SearchSpace, Document.search_space_id == SearchSpace.id)
.where(SearchSpace.user_id == user_id)
.where(Document.search_space_id == search_space_id)
.where(
tsvector.op("@@")(tsquery)
) # Only include results that match the query
)
# Add search space filter if provided
if search_space_id is not None:
query = query.where(Document.search_space_id == search_space_id)
# Add text search ranking
query = query.order_by(func.ts_rank_cd(tsvector, tsquery).desc()).limit(top_k)
@ -116,8 +102,7 @@ class DocumentHybridSearchRetriever:
self,
query_text: str,
top_k: int,
user_id: str,
search_space_id: int | None = None,
search_space_id: int,
document_type: str | None = None,
) -> list:
"""
@ -126,8 +111,7 @@ class DocumentHybridSearchRetriever:
Args:
query_text: The search query text
top_k: Number of results to return
user_id: The ID of the user performing the search
search_space_id: Optional search space ID to filter results
search_space_id: The search space ID to search within
document_type: Optional document type to filter results (e.g., "FILE", "CRAWLED_URL")
"""
@ -135,7 +119,7 @@ class DocumentHybridSearchRetriever:
from sqlalchemy.orm import joinedload
from app.config import config
from app.db import Document, DocumentType, SearchSpace
from app.db import Document, DocumentType
# Get embedding for the query
embedding_model = config.embedding_model_instance
@ -149,12 +133,8 @@ class DocumentHybridSearchRetriever:
tsvector = func.to_tsvector("english", Document.content)
tsquery = func.plainto_tsquery("english", query_text)
# Base conditions for document filtering
base_conditions = [SearchSpace.user_id == user_id]
# Add search space filter if provided
if search_space_id is not None:
base_conditions.append(Document.search_space_id == search_space_id)
# Base conditions for document filtering - search space is required
base_conditions = [Document.search_space_id == search_space_id]
# Add document type filter if provided
if document_type is not None:
@ -169,17 +149,13 @@ class DocumentHybridSearchRetriever:
else:
base_conditions.append(Document.document_type == document_type)
# CTE for semantic search with user ownership check
semantic_search_cte = (
select(
Document.id,
func.rank()
.over(order_by=Document.embedding.op("<=>")(query_embedding))
.label("rank"),
)
.join(SearchSpace, Document.search_space_id == SearchSpace.id)
.where(*base_conditions)
)
# CTE for semantic search filtered by search space
semantic_search_cte = select(
Document.id,
func.rank()
.over(order_by=Document.embedding.op("<=>")(query_embedding))
.label("rank"),
).where(*base_conditions)
semantic_search_cte = (
semantic_search_cte.order_by(Document.embedding.op("<=>")(query_embedding))
@ -187,7 +163,7 @@ class DocumentHybridSearchRetriever:
.cte("semantic_search")
)
# CTE for keyword search with user ownership check
# CTE for keyword search filtered by search space
keyword_search_cte = (
select(
Document.id,
@ -195,7 +171,6 @@ class DocumentHybridSearchRetriever:
.over(order_by=func.ts_rank_cd(tsvector, tsquery).desc())
.label("rank"),
)
.join(SearchSpace, Document.search_space_id == SearchSpace.id)
.where(*base_conditions)
.where(tsvector.op("@@")(tsquery))
)
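
The `<=>` used in the `order_by` calls above is pgvector's cosine-distance operator (1 minus cosine similarity), so smaller values rank first. A small NumPy sketch of the quantity being ordered on:

```python
import numpy as np

def cosine_distance(a: np.ndarray, b: np.ndarray) -> float:
    # Mirrors pgvector's "<=>": 1 - cosine similarity.
    return 1.0 - float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

query_vec = np.array([0.1, 0.9])
doc_vec = np.array([0.2, 0.8])
cosine_distance(query_vec, doc_vec)  # ≈ 0.009, so this document ranks near the top
```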

View file

@ -16,13 +16,14 @@ from .llm_config_routes import router as llm_config_router
from .logs_routes import router as logs_router
from .luma_add_connector_route import router as luma_add_connector_router
from .podcasts_routes import router as podcasts_router
from .rbac_routes import router as rbac_router
from .search_source_connectors_routes import router as search_source_connectors_router
from .search_spaces_routes import router as search_spaces_router
router = APIRouter()
router.include_router(search_spaces_router)
router.include_router(editor_router)
router.include_router(rbac_router) # RBAC routes for roles, members, invites
router.include_router(documents_router)
router.include_router(podcasts_router)
router.include_router(chats_router)

View file

@ -6,7 +6,14 @@ from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from sqlalchemy.orm import selectinload
from app.db import Chat, SearchSpace, User, UserSearchSpacePreference, get_async_session
from app.db import (
Chat,
Permission,
SearchSpace,
SearchSpaceMembership,
User,
get_async_session,
)
from app.schemas import (
AISDKChatRequest,
ChatCreate,
@ -16,7 +23,7 @@ from app.schemas import (
)
from app.tasks.stream_connector_search_results import stream_connector_search_results
from app.users import current_active_user
from app.utils.check_ownership import check_ownership
from app.utils.rbac import check_permission
from app.utils.validators import (
validate_connectors,
validate_document_ids,
@ -59,45 +66,38 @@ async def handle_chat_data(
# print("RESQUEST DATA:", request_data)
# print("SELECTED CONNECTORS:", selected_connectors)
# Check if the search space belongs to the current user
# Check if the user has chat access to the search space
try:
await check_ownership(session, SearchSpace, search_space_id, user)
language_result = await session.execute(
select(UserSearchSpacePreference)
.options(
selectinload(UserSearchSpacePreference.search_space).selectinload(
SearchSpace.llm_configs
),
# Note: Removed selectinload for LLM relationships as they no longer exist
# Global configs (negative IDs) don't have foreign keys
# LLM configs are now fetched manually when needed
)
.filter(
UserSearchSpacePreference.search_space_id == search_space_id,
UserSearchSpacePreference.user_id == user.id,
)
await check_permission(
session,
user,
search_space_id,
Permission.CHATS_CREATE.value,
"You don't have permission to use chat in this search space",
)
user_preference = language_result.scalars().first()
# print("UserSearchSpacePreference:", user_preference)
# Get search space with LLM configs (preferences are now stored at search space level)
search_space_result = await session.execute(
select(SearchSpace)
.options(selectinload(SearchSpace.llm_configs))
.filter(SearchSpace.id == search_space_id)
)
search_space = search_space_result.scalars().first()
language = None
llm_configs = [] # Initialize to empty list
if (
user_preference
and user_preference.search_space
and user_preference.search_space.llm_configs
):
llm_configs = user_preference.search_space.llm_configs
if search_space and search_space.llm_configs:
llm_configs = search_space.llm_configs
# Manually fetch LLM configs since relationships no longer exist
# Check fast_llm, long_context_llm, and strategic_llm IDs
# Get language from configured LLM preferences
# LLM preferences are now stored on the SearchSpace model
from app.config import config as app_config
for llm_id in [
user_preference.fast_llm_id,
user_preference.long_context_llm_id,
user_preference.strategic_llm_id,
search_space.fast_llm_id,
search_space.long_context_llm_id,
search_space.strategic_llm_id,
]:
if llm_id is not None:
# Check if it's a global config (negative ID)
@ -161,8 +161,18 @@ async def create_chat(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""
Create a new chat.
Requires CHATS_CREATE permission.
"""
try:
await check_ownership(session, SearchSpace, chat.search_space_id, user)
await check_permission(
session,
user,
chat.search_space_id,
Permission.CHATS_CREATE.value,
"You don't have permission to create chats in this search space",
)
db_chat = Chat(**chat.model_dump())
session.add(db_chat)
await session.commit()
@ -197,6 +207,10 @@ async def read_chats(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""
List chats the user has access to.
Requires CHATS_READ permission for the search space(s).
"""
# Validate pagination parameters
if skip < 0:
raise HTTPException(
@ -212,9 +226,17 @@ async def read_chats(
status_code=400, detail="search_space_id must be a positive integer"
)
try:
# Select specific fields excluding messages
query = (
select(
if search_space_id is not None:
# Check permission for specific search space
await check_permission(
session,
user,
search_space_id,
Permission.CHATS_READ.value,
"You don't have permission to read chats in this search space",
)
# Select specific fields excluding messages
query = select(
Chat.id,
Chat.type,
Chat.title,
@ -222,17 +244,28 @@ async def read_chats(
Chat.search_space_id,
Chat.created_at,
Chat.state_version,
).filter(Chat.search_space_id == search_space_id)
else:
# Get chats from all search spaces user has membership in
query = (
select(
Chat.id,
Chat.type,
Chat.title,
Chat.initial_connectors,
Chat.search_space_id,
Chat.created_at,
Chat.state_version,
)
.join(SearchSpace)
.join(SearchSpaceMembership)
.filter(SearchSpaceMembership.user_id == user.id)
)
.join(SearchSpace)
.filter(SearchSpace.user_id == user.id)
)
# Filter by search_space_id if provided
if search_space_id is not None:
query = query.filter(Chat.search_space_id == search_space_id)
result = await session.execute(query.offset(skip).limit(limit))
return result.all()
except HTTPException:
raise
except OperationalError:
raise HTTPException(
status_code=503, detail="Database operation failed. Please try again later."
@ -249,19 +282,32 @@ async def read_chat(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""
Get a specific chat by ID.
Requires CHATS_READ permission for the search space.
"""
try:
result = await session.execute(
select(Chat)
.join(SearchSpace)
.filter(Chat.id == chat_id, SearchSpace.user_id == user.id)
)
result = await session.execute(select(Chat).filter(Chat.id == chat_id))
chat = result.scalars().first()
if not chat:
raise HTTPException(
status_code=404,
detail="Chat not found or you don't have permission to access it",
detail="Chat not found",
)
# Check permission for the search space
await check_permission(
session,
user,
chat.search_space_id,
Permission.CHATS_READ.value,
"You don't have permission to read chats in this search space",
)
return chat
except HTTPException:
raise
except OperationalError:
raise HTTPException(
status_code=503, detail="Database operation failed. Please try again later."
@ -280,8 +326,26 @@ async def update_chat(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""
Update a chat.
Requires CHATS_UPDATE permission for the search space.
"""
try:
db_chat = await read_chat(chat_id, session, user)
result = await session.execute(select(Chat).filter(Chat.id == chat_id))
db_chat = result.scalars().first()
if not db_chat:
raise HTTPException(status_code=404, detail="Chat not found")
# Check permission for the search space
await check_permission(
session,
user,
db_chat.search_space_id,
Permission.CHATS_UPDATE.value,
"You don't have permission to update chats in this search space",
)
update_data = chat_update.model_dump(exclude_unset=True)
for key, value in update_data.items():
if key == "messages":
@ -318,8 +382,26 @@ async def delete_chat(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""
Delete a chat.
Requires CHATS_DELETE permission for the search space.
"""
try:
db_chat = await read_chat(chat_id, session, user)
result = await session.execute(select(Chat).filter(Chat.id == chat_id))
db_chat = result.scalars().first()
if not db_chat:
raise HTTPException(status_code=404, detail="Chat not found")
# Check permission for the search space
await check_permission(
session,
user,
db_chat.search_space_id,
Permission.CHATS_DELETE.value,
"You don't have permission to delete chats in this search space",
)
await session.delete(db_chat)
await session.commit()
return {"message": "Chat deleted successfully"}

View file

@ -10,7 +10,9 @@ from app.db import (
Chunk,
Document,
DocumentType,
Permission,
SearchSpace,
SearchSpaceMembership,
User,
get_async_session,
)
@ -22,7 +24,7 @@ from app.schemas import (
PaginatedResponse,
)
from app.users import current_active_user
from app.utils.check_ownership import check_ownership
from app.utils.rbac import check_permission
try:
asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
@ -44,9 +46,19 @@ async def create_documents(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""
Create new documents.
Requires DOCUMENTS_CREATE permission.
"""
try:
# Check if the user owns the search space
await check_ownership(session, SearchSpace, request.search_space_id, user)
# Check permission
await check_permission(
session,
user,
request.search_space_id,
Permission.DOCUMENTS_CREATE.value,
"You don't have permission to create documents in this search space",
)
if request.document_type == DocumentType.EXTENSION:
from app.tasks.celery_tasks.document_tasks import (
@ -65,13 +77,6 @@ async def create_documents(
process_extension_document_task.delay(
document_dict, request.search_space_id, str(user.id)
)
elif request.document_type == DocumentType.CRAWLED_URL:
from app.tasks.celery_tasks.document_tasks import process_crawled_url_task
for url in request.content:
process_crawled_url_task.delay(
url, request.search_space_id, str(user.id)
)
elif request.document_type == DocumentType.YOUTUBE_VIDEO:
from app.tasks.celery_tasks.document_tasks import process_youtube_video_task
@ -100,8 +105,19 @@ async def create_documents_file_upload(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""
Upload files as documents.
Requires DOCUMENTS_CREATE permission.
"""
try:
await check_ownership(session, SearchSpace, search_space_id, user)
# Check permission
await check_permission(
session,
user,
search_space_id,
Permission.DOCUMENTS_CREATE.value,
"You don't have permission to create documents in this search space",
)
if not files:
raise HTTPException(status_code=400, detail="No files provided")
@ -158,7 +174,8 @@ async def read_documents(
user: User = Depends(current_active_user),
):
"""
List documents owned by the current user, with optional filtering and pagination.
List documents the user has access to, with optional filtering and pagination.
Requires DOCUMENTS_READ permission for the search space(s).
Args:
skip: Absolute number of items to skip from the beginning. If provided, it takes precedence over 'page'.
@ -174,40 +191,49 @@ async def read_documents(
Notes:
- If both 'skip' and 'page' are provided, 'skip' is used.
- Results are scoped to documents owned by the current user.
- Results are scoped to documents in search spaces the user has membership in.
"""
try:
from sqlalchemy import func
query = (
select(Document).join(SearchSpace).filter(SearchSpace.user_id == user.id)
)
# Filter by search_space_id if provided
# If specific search_space_id, check permission
if search_space_id is not None:
query = query.filter(Document.search_space_id == search_space_id)
await check_permission(
session,
user,
search_space_id,
Permission.DOCUMENTS_READ.value,
"You don't have permission to read documents in this search space",
)
query = select(Document).filter(Document.search_space_id == search_space_id)
count_query = (
select(func.count())
.select_from(Document)
.filter(Document.search_space_id == search_space_id)
)
else:
# Get documents from all search spaces user has membership in
query = (
select(Document)
.join(SearchSpace)
.join(SearchSpaceMembership)
.filter(SearchSpaceMembership.user_id == user.id)
)
count_query = (
select(func.count())
.select_from(Document)
.join(SearchSpace)
.join(SearchSpaceMembership)
.filter(SearchSpaceMembership.user_id == user.id)
)
# Filter by document_types if provided
if document_types is not None and document_types.strip():
type_list = [t.strip() for t in document_types.split(",") if t.strip()]
if type_list:
query = query.filter(Document.document_type.in_(type_list))
# Get total count
count_query = (
select(func.count())
.select_from(Document)
.join(SearchSpace)
.filter(SearchSpace.user_id == user.id)
)
if search_space_id is not None:
count_query = count_query.filter(
Document.search_space_id == search_space_id
)
if document_types is not None and document_types.strip():
type_list = [t.strip() for t in document_types.split(",") if t.strip()]
if type_list:
count_query = count_query.filter(Document.document_type.in_(type_list))
total_result = await session.execute(count_query)
total = total_result.scalar() or 0
@ -242,6 +268,8 @@ async def read_documents(
)
return PaginatedResponse(items=api_documents, total=total)
except HTTPException:
raise
except Exception as e:
raise HTTPException(
status_code=500, detail=f"Failed to fetch documents: {e!s}"
@ -261,6 +289,7 @@ async def search_documents(
):
"""
Search documents by title substring, optionally filtered by search_space_id and document_types.
Requires DOCUMENTS_READ permission for the search space(s).
Args:
title: Case-insensitive substring to match against document titles. Required.
@ -282,37 +311,48 @@ async def search_documents(
try:
from sqlalchemy import func
query = (
select(Document).join(SearchSpace).filter(SearchSpace.user_id == user.id)
)
# If specific search_space_id, check permission
if search_space_id is not None:
query = query.filter(Document.search_space_id == search_space_id)
await check_permission(
session,
user,
search_space_id,
Permission.DOCUMENTS_READ.value,
"You don't have permission to read documents in this search space",
)
query = select(Document).filter(Document.search_space_id == search_space_id)
count_query = (
select(func.count())
.select_from(Document)
.filter(Document.search_space_id == search_space_id)
)
else:
# Get documents from all search spaces user has membership in
query = (
select(Document)
.join(SearchSpace)
.join(SearchSpaceMembership)
.filter(SearchSpaceMembership.user_id == user.id)
)
count_query = (
select(func.count())
.select_from(Document)
.join(SearchSpace)
.join(SearchSpaceMembership)
.filter(SearchSpaceMembership.user_id == user.id)
)
# Only search by title (case-insensitive)
query = query.filter(Document.title.ilike(f"%{title}%"))
count_query = count_query.filter(Document.title.ilike(f"%{title}%"))
# Filter by document_types if provided
if document_types is not None and document_types.strip():
type_list = [t.strip() for t in document_types.split(",") if t.strip()]
if type_list:
query = query.filter(Document.document_type.in_(type_list))
# Get total count
count_query = (
select(func.count())
.select_from(Document)
.join(SearchSpace)
.filter(SearchSpace.user_id == user.id)
)
if search_space_id is not None:
count_query = count_query.filter(
Document.search_space_id == search_space_id
)
count_query = count_query.filter(Document.title.ilike(f"%{title}%"))
if document_types is not None and document_types.strip():
type_list = [t.strip() for t in document_types.split(",") if t.strip()]
if type_list:
count_query = count_query.filter(Document.document_type.in_(type_list))
total_result = await session.execute(count_query)
total = total_result.scalar() or 0
@ -347,6 +387,8 @@ async def search_documents(
)
return PaginatedResponse(items=api_documents, total=total)
except HTTPException:
raise
except Exception as e:
raise HTTPException(
status_code=500, detail=f"Failed to search documents: {e!s}"
@ -360,7 +402,8 @@ async def get_document_type_counts(
user: User = Depends(current_active_user),
):
"""
Get counts of documents by type for the current user.
Get counts of documents by type for search spaces the user has access to.
Requires DOCUMENTS_READ permission for the search space(s).
Args:
search_space_id: If provided, restrict counts to a specific search space.
@ -373,20 +416,36 @@ async def get_document_type_counts(
try:
from sqlalchemy import func
query = (
select(Document.document_type, func.count(Document.id))
.join(SearchSpace)
.filter(SearchSpace.user_id == user.id)
.group_by(Document.document_type)
)
if search_space_id is not None:
query = query.filter(Document.search_space_id == search_space_id)
# Check permission for specific search space
await check_permission(
session,
user,
search_space_id,
Permission.DOCUMENTS_READ.value,
"You don't have permission to read documents in this search space",
)
query = (
select(Document.document_type, func.count(Document.id))
.filter(Document.search_space_id == search_space_id)
.group_by(Document.document_type)
)
else:
# Get counts from all search spaces user has membership in
query = (
select(Document.document_type, func.count(Document.id))
.join(SearchSpace)
.join(SearchSpaceMembership)
.filter(SearchSpaceMembership.user_id == user.id)
.group_by(Document.document_type)
)
result = await session.execute(query)
type_counts = dict(result.all())
return type_counts
except HTTPException:
raise
except Exception as e:
raise HTTPException(
status_code=500, detail=f"Failed to fetch document type counts: {e!s}"
@ -401,6 +460,7 @@ async def get_document_by_chunk_id(
):
"""
Retrieves a document based on a chunk ID, including all its chunks ordered by creation time.
Requires DOCUMENTS_READ permission for the search space.
The document's embedding and chunk embeddings are excluded from the response.
"""
try:
@ -413,21 +473,29 @@ async def get_document_by_chunk_id(
status_code=404, detail=f"Chunk with id {chunk_id} not found"
)
# Get the associated document and verify ownership
# Get the associated document
document_result = await session.execute(
select(Document)
.options(selectinload(Document.chunks))
.join(SearchSpace)
.filter(Document.id == chunk.document_id, SearchSpace.user_id == user.id)
.filter(Document.id == chunk.document_id)
)
document = document_result.scalars().first()
if not document:
raise HTTPException(
status_code=404,
detail="Document not found or you don't have access to it",
detail="Document not found",
)
# Check permission for the search space
await check_permission(
session,
user,
document.search_space_id,
Permission.DOCUMENTS_READ.value,
"You don't have permission to read documents in this search space",
)
# Sort chunks by creation time
sorted_chunks = sorted(document.chunks, key=lambda x: x.created_at)
@ -456,11 +524,13 @@ async def read_document(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""
Get a specific document by ID.
Requires DOCUMENTS_READ permission for the search space.
"""
try:
result = await session.execute(
select(Document)
.join(SearchSpace)
.filter(Document.id == document_id, SearchSpace.user_id == user.id)
select(Document).filter(Document.id == document_id)
)
document = result.scalars().first()
@ -469,6 +539,15 @@ async def read_document(
status_code=404, detail=f"Document with id {document_id} not found"
)
# Check permission for the search space
await check_permission(
session,
user,
document.search_space_id,
Permission.DOCUMENTS_READ.value,
"You don't have permission to read documents in this search space",
)
# Convert database object to API-friendly format
return DocumentRead(
id=document.id,
@ -479,6 +558,8 @@ async def read_document(
created_at=document.created_at,
search_space_id=document.search_space_id,
)
except HTTPException:
raise
except Exception as e:
raise HTTPException(
status_code=500, detail=f"Failed to fetch document: {e!s}"
@ -492,12 +573,13 @@ async def update_document(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""
Update a document.
Requires DOCUMENTS_UPDATE permission for the search space.
"""
try:
# Query the document directly instead of using read_document function
result = await session.execute(
select(Document)
.join(SearchSpace)
.filter(Document.id == document_id, SearchSpace.user_id == user.id)
select(Document).filter(Document.id == document_id)
)
db_document = result.scalars().first()
@ -506,6 +588,15 @@ async def update_document(
status_code=404, detail=f"Document with id {document_id} not found"
)
# Check permission for the search space
await check_permission(
session,
user,
db_document.search_space_id,
Permission.DOCUMENTS_UPDATE.value,
"You don't have permission to update documents in this search space",
)
update_data = document_update.model_dump(exclude_unset=True)
for key, value in update_data.items():
setattr(db_document, key, value)
@ -537,12 +628,13 @@ async def delete_document(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""
Delete a document.
Requires DOCUMENTS_DELETE permission for the search space.
"""
try:
# Query the document directly instead of using read_document function
result = await session.execute(
select(Document)
.join(SearchSpace)
.filter(Document.id == document_id, SearchSpace.user_id == user.id)
select(Document).filter(Document.id == document_id)
)
document = result.scalars().first()
@ -551,6 +643,15 @@ async def delete_document(
status_code=404, detail=f"Document with id {document_id} not found"
)
# Check permission for the search space
await check_permission(
session,
user,
document.search_space_id,
Permission.DOCUMENTS_DELETE.value,
"You don't have permission to delete documents in this search space",
)
await session.delete(document)
await session.commit()
return {"message": "Document deleted successfully"}

View file

@ -8,67 +8,22 @@ from sqlalchemy.future import select
from app.config import config
from app.db import (
LLMConfig,
Permission,
SearchSpace,
User,
UserSearchSpacePreference,
get_async_session,
)
from app.schemas import LLMConfigCreate, LLMConfigRead, LLMConfigUpdate
from app.services.llm_service import validate_llm_config
from app.users import current_active_user
from app.utils.rbac import check_permission
router = APIRouter()
logger = logging.getLogger(__name__)
# Helper function to check search space access
async def check_search_space_access(
session: AsyncSession, search_space_id: int, user: User
) -> SearchSpace:
"""Verify that the user has access to the search space"""
result = await session.execute(
select(SearchSpace).filter(
SearchSpace.id == search_space_id, SearchSpace.user_id == user.id
)
)
search_space = result.scalars().first()
if not search_space:
raise HTTPException(
status_code=404,
detail="Search space not found or you don't have permission to access it",
)
return search_space
# Helper function to get or create user search space preference
async def get_or_create_user_preference(
session: AsyncSession, user_id, search_space_id: int
) -> UserSearchSpacePreference:
"""Get or create user preference for a search space"""
result = await session.execute(
select(UserSearchSpacePreference).filter(
UserSearchSpacePreference.user_id == user_id,
UserSearchSpacePreference.search_space_id == search_space_id,
)
# Removed selectinload options since relationships no longer exist
)
preference = result.scalars().first()
if not preference:
# Create new preference entry
preference = UserSearchSpacePreference(
user_id=user_id,
search_space_id=search_space_id,
)
session.add(preference)
await session.commit()
await session.refresh(preference)
return preference
class LLMPreferencesUpdate(BaseModel):
"""Schema for updating user LLM preferences"""
"""Schema for updating search space LLM preferences"""
long_context_llm_id: int | None = None
fast_llm_id: int | None = None
@ -76,7 +31,7 @@ class LLMPreferencesUpdate(BaseModel):
class LLMPreferencesRead(BaseModel):
"""Schema for reading user LLM preferences"""
"""Schema for reading search space LLM preferences"""
long_context_llm_id: int | None = None
fast_llm_id: int | None = None
@ -144,10 +99,19 @@ async def create_llm_config(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""Create a new LLM configuration for a search space"""
"""
Create a new LLM configuration for a search space.
Requires LLM_CONFIGS_CREATE permission.
"""
try:
# Verify user has access to the search space
await check_search_space_access(session, llm_config.search_space_id, user)
# Verify user has permission to create LLM configs
await check_permission(
session,
user,
llm_config.search_space_id,
Permission.LLM_CONFIGS_CREATE.value,
"You don't have permission to create LLM configurations in this search space",
)
# Validate the LLM configuration by making a test API call
is_valid, error_message = await validate_llm_config(
@ -187,10 +151,19 @@ async def read_llm_configs(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""Get all LLM configurations for a search space"""
"""
Get all LLM configurations for a search space.
Requires LLM_CONFIGS_READ permission.
"""
try:
# Verify user has access to the search space
await check_search_space_access(session, search_space_id, user)
# Verify user has permission to read LLM configs
await check_permission(
session,
user,
search_space_id,
Permission.LLM_CONFIGS_READ.value,
"You don't have permission to view LLM configurations in this search space",
)
result = await session.execute(
select(LLMConfig)
@ -213,7 +186,10 @@ async def read_llm_config(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""Get a specific LLM configuration by ID"""
"""
Get a specific LLM configuration by ID.
Requires LLM_CONFIGS_READ permission.
"""
try:
# Get the LLM config
result = await session.execute(
@ -224,8 +200,14 @@ async def read_llm_config(
if not llm_config:
raise HTTPException(status_code=404, detail="LLM configuration not found")
# Verify user has access to the search space
await check_search_space_access(session, llm_config.search_space_id, user)
# Verify user has permission to read LLM configs
await check_permission(
session,
user,
llm_config.search_space_id,
Permission.LLM_CONFIGS_READ.value,
"You don't have permission to view LLM configurations in this search space",
)
return llm_config
except HTTPException:
@ -243,7 +225,10 @@ async def update_llm_config(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""Update an existing LLM configuration"""
"""
Update an existing LLM configuration.
Requires LLM_CONFIGS_UPDATE permission.
"""
try:
# Get the LLM config
result = await session.execute(
@ -254,8 +239,14 @@ async def update_llm_config(
if not db_llm_config:
raise HTTPException(status_code=404, detail="LLM configuration not found")
# Verify user has access to the search space
await check_search_space_access(session, db_llm_config.search_space_id, user)
# Verify user has permission to update LLM configs
await check_permission(
session,
user,
db_llm_config.search_space_id,
Permission.LLM_CONFIGS_UPDATE.value,
"You don't have permission to update LLM configurations in this search space",
)
update_data = llm_config_update.model_dump(exclude_unset=True)
@ -311,7 +302,10 @@ async def delete_llm_config(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""Delete an LLM configuration"""
"""
Delete an LLM configuration.
Requires LLM_CONFIGS_DELETE permission.
"""
try:
# Get the LLM config
result = await session.execute(
@ -322,8 +316,14 @@ async def delete_llm_config(
if not db_llm_config:
raise HTTPException(status_code=404, detail="LLM configuration not found")
# Verify user has access to the search space
await check_search_space_access(session, db_llm_config.search_space_id, user)
# Verify user has permission to delete LLM configs
await check_permission(
session,
user,
db_llm_config.search_space_id,
Permission.LLM_CONFIGS_DELETE.value,
"You don't have permission to delete LLM configurations in this search space",
)
await session.delete(db_llm_config)
await session.commit()
@ -337,28 +337,42 @@ async def delete_llm_config(
) from e
# User LLM Preferences endpoints
# Search Space LLM Preferences endpoints
@router.get(
"/search-spaces/{search_space_id}/llm-preferences",
response_model=LLMPreferencesRead,
)
async def get_user_llm_preferences(
async def get_llm_preferences(
search_space_id: int,
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""Get the current user's LLM preferences for a specific search space"""
"""
Get the LLM preferences for a specific search space.
LLM preferences are shared by all members of the search space.
Requires LLM_CONFIGS_READ permission.
"""
try:
# Verify user has access to the search space
await check_search_space_access(session, search_space_id, user)
# Get or create user preference for this search space
preference = await get_or_create_user_preference(
session, user.id, search_space_id
# Verify user has permission to read LLM configs
await check_permission(
session,
user,
search_space_id,
Permission.LLM_CONFIGS_READ.value,
"You don't have permission to view LLM preferences in this search space",
)
# Get the search space
result = await session.execute(
select(SearchSpace).filter(SearchSpace.id == search_space_id)
)
search_space = result.scalars().first()
if not search_space:
raise HTTPException(status_code=404, detail="Search space not found")
# Helper function to get config (global or custom)
async def get_config_for_id(config_id):
if config_id is None:
@ -391,14 +405,14 @@ async def get_user_llm_preferences(
return result.scalars().first()
# Get the configs (from DB for custom, or constructed for global)
long_context_llm = await get_config_for_id(preference.long_context_llm_id)
fast_llm = await get_config_for_id(preference.fast_llm_id)
strategic_llm = await get_config_for_id(preference.strategic_llm_id)
long_context_llm = await get_config_for_id(search_space.long_context_llm_id)
fast_llm = await get_config_for_id(search_space.fast_llm_id)
strategic_llm = await get_config_for_id(search_space.strategic_llm_id)
return {
"long_context_llm_id": preference.long_context_llm_id,
"fast_llm_id": preference.fast_llm_id,
"strategic_llm_id": preference.strategic_llm_id,
"long_context_llm_id": search_space.long_context_llm_id,
"fast_llm_id": search_space.fast_llm_id,
"strategic_llm_id": search_space.strategic_llm_id,
"long_context_llm": long_context_llm,
"fast_llm": fast_llm,
"strategic_llm": strategic_llm,
@ -415,22 +429,37 @@ async def get_user_llm_preferences(
"/search-spaces/{search_space_id}/llm-preferences",
response_model=LLMPreferencesRead,
)
async def update_user_llm_preferences(
async def update_llm_preferences(
search_space_id: int,
preferences: LLMPreferencesUpdate,
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""Update the current user's LLM preferences for a specific search space"""
"""
Update the LLM preferences for a specific search space.
LLM preferences are shared by all members of the search space.
Requires SETTINGS_UPDATE permission (only users with settings access may change the shared preferences).
"""
try:
# Verify user has access to the search space
await check_search_space_access(session, search_space_id, user)
# Get or create user preference for this search space
preference = await get_or_create_user_preference(
session, user.id, search_space_id
# Verify user has permission to update settings (not just LLM configs)
# This ensures only users with settings access can change shared LLM preferences
await check_permission(
session,
user,
search_space_id,
Permission.SETTINGS_UPDATE.value,
"You don't have permission to update LLM preferences in this search space",
)
# Get the search space
result = await session.execute(
select(SearchSpace).filter(SearchSpace.id == search_space_id)
)
search_space = result.scalars().first()
if not search_space:
raise HTTPException(status_code=404, detail="Search space not found")
# Validate that all provided LLM config IDs belong to the search space
update_data = preferences.model_dump(exclude_unset=True)
@ -485,18 +514,13 @@ async def update_user_llm_preferences(
f"Multiple languages detected in LLM selection for search_space {search_space_id}: {languages}. "
"This may affect response quality."
)
# Don't raise an exception - allow users to proceed
# raise HTTPException(
# status_code=400,
# detail="All selected LLM configurations must have the same language setting",
# )
# Update user preferences
# Update search space LLM preferences
for key, value in update_data.items():
setattr(preference, key, value)
setattr(search_space, key, value)
await session.commit()
await session.refresh(preference)
await session.refresh(search_space)
# Helper function to get config (global or custom)
async def get_config_for_id(config_id):
@ -530,15 +554,15 @@ async def update_user_llm_preferences(
return result.scalars().first()
# Get the configs (from DB for custom, or constructed for global)
long_context_llm = await get_config_for_id(preference.long_context_llm_id)
fast_llm = await get_config_for_id(preference.fast_llm_id)
strategic_llm = await get_config_for_id(preference.strategic_llm_id)
long_context_llm = await get_config_for_id(search_space.long_context_llm_id)
fast_llm = await get_config_for_id(search_space.fast_llm_id)
strategic_llm = await get_config_for_id(search_space.strategic_llm_id)
# Return updated preferences
return {
"long_context_llm_id": preference.long_context_llm_id,
"fast_llm_id": preference.fast_llm_id,
"strategic_llm_id": preference.strategic_llm_id,
"long_context_llm_id": search_space.long_context_llm_id,
"fast_llm_id": search_space.fast_llm_id,
"strategic_llm_id": search_space.strategic_llm_id,
"long_context_llm": long_context_llm,
"fast_llm": fast_llm,
"strategic_llm": strategic_llm,

View file

@ -5,10 +5,19 @@ from sqlalchemy import and_, desc
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from app.db import Log, LogLevel, LogStatus, SearchSpace, User, get_async_session
from app.db import (
Log,
LogLevel,
LogStatus,
Permission,
SearchSpace,
SearchSpaceMembership,
User,
get_async_session,
)
from app.schemas import LogCreate, LogRead, LogUpdate
from app.users import current_active_user
from app.utils.check_ownership import check_ownership
from app.utils.rbac import check_permission
router = APIRouter()
@ -19,10 +28,19 @@ async def create_log(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""Create a new log entry."""
"""
Create a new log entry.
Note: logs are usually system-generated, so this endpoint is typically called internally. Requires LOGS_READ permission.
"""
try:
# Check if the user owns the search space
await check_ownership(session, SearchSpace, log.search_space_id, user)
# Check if the user has access to the search space
await check_permission(
session,
user,
log.search_space_id,
Permission.LOGS_READ.value,
"You don't have permission to access logs in this search space",
)
db_log = Log(**log.model_dump())
session.add(db_log)
@ -51,22 +69,38 @@ async def read_logs(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""Get logs with optional filtering."""
"""
Get logs with optional filtering.
Requires LOGS_READ permission for the search space(s).
"""
try:
# Build base query - only logs from user's search spaces
query = (
select(Log)
.join(SearchSpace)
.filter(SearchSpace.user_id == user.id)
.order_by(desc(Log.created_at)) # Most recent first
)
# Apply filters
filters = []
if search_space_id is not None:
await check_ownership(session, SearchSpace, search_space_id, user)
filters.append(Log.search_space_id == search_space_id)
# Check permission for specific search space
await check_permission(
session,
user,
search_space_id,
Permission.LOGS_READ.value,
"You don't have permission to read logs in this search space",
)
# Build query for specific search space
query = (
select(Log)
.filter(Log.search_space_id == search_space_id)
.order_by(desc(Log.created_at))
)
else:
# Build base query - logs from search spaces user has membership in
query = (
select(Log)
.join(SearchSpace)
.join(SearchSpaceMembership)
.filter(SearchSpaceMembership.user_id == user.id)
.order_by(desc(Log.created_at))
)
if level is not None:
filters.append(Log.level == level)
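
The `filters` list built here is applied outside this hunk; presumably something like the following, given the `and_` import above:

```python
# Presumed continuation (not shown in this hunk):
if filters:
    query = query.filter(and_(*filters))
result = await session.execute(query.offset(skip).limit(limit))
logs = result.scalars().all()
```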
@ -104,19 +138,26 @@ async def read_log(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""Get a specific log by ID."""
"""
Get a specific log by ID.
Requires LOGS_READ permission for the search space.
"""
try:
# Get log and verify user owns the search space
result = await session.execute(
select(Log)
.join(SearchSpace)
.filter(Log.id == log_id, SearchSpace.user_id == user.id)
)
result = await session.execute(select(Log).filter(Log.id == log_id))
log = result.scalars().first()
if not log:
raise HTTPException(status_code=404, detail="Log not found")
# Check permission for the search space
await check_permission(
session,
user,
log.search_space_id,
Permission.LOGS_READ.value,
"You don't have permission to read logs in this search space",
)
return log
except HTTPException:
raise
@ -133,19 +174,26 @@ async def update_log(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""Update a log entry."""
"""
Update a log entry.
Requires LOGS_READ permission (logs are typically updated by the system).
"""
try:
# Get log and verify user owns the search space
result = await session.execute(
select(Log)
.join(SearchSpace)
.filter(Log.id == log_id, SearchSpace.user_id == user.id)
)
result = await session.execute(select(Log).filter(Log.id == log_id))
db_log = result.scalars().first()
if not db_log:
raise HTTPException(status_code=404, detail="Log not found")
# Check permission for the search space
await check_permission(
session,
user,
db_log.search_space_id,
Permission.LOGS_READ.value,
"You don't have permission to access logs in this search space",
)
# Update only provided fields
update_data = log_update.model_dump(exclude_unset=True)
for field, value in update_data.items():
@ -169,19 +217,26 @@ async def delete_log(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""Delete a log entry."""
"""
Delete a log entry.
Requires LOGS_DELETE permission for the search space.
"""
try:
# Get log and verify user owns the search space
result = await session.execute(
select(Log)
.join(SearchSpace)
.filter(Log.id == log_id, SearchSpace.user_id == user.id)
)
result = await session.execute(select(Log).filter(Log.id == log_id))
db_log = result.scalars().first()
if not db_log:
raise HTTPException(status_code=404, detail="Log not found")
# Check permission for the search space
await check_permission(
session,
user,
db_log.search_space_id,
Permission.LOGS_DELETE.value,
"You don't have permission to delete logs in this search space",
)
await session.delete(db_log)
await session.commit()
return {"message": "Log deleted successfully"}
@ -201,10 +256,19 @@ async def get_logs_summary(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""Get a summary of logs for a search space in the last X hours."""
"""
Get a summary of logs for a search space in the last X hours.
Requires LOGS_READ permission for the search space.
"""
try:
# Check ownership
await check_ownership(session, SearchSpace, search_space_id, user)
# Check permission
await check_permission(
session,
user,
search_space_id,
Permission.LOGS_READ.value,
"You don't have permission to read logs in this search space",
)
# Calculate time window
since = datetime.utcnow().replace(microsecond=0) - timedelta(hours=hours)

View file

@ -7,7 +7,15 @@ from sqlalchemy.exc import IntegrityError, SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from app.db import Chat, Podcast, SearchSpace, User, get_async_session
from app.db import (
Chat,
Permission,
Podcast,
SearchSpace,
SearchSpaceMembership,
User,
get_async_session,
)
from app.schemas import (
PodcastCreate,
PodcastGenerateRequest,
@ -16,7 +24,7 @@ from app.schemas import (
)
from app.tasks.podcast_tasks import generate_chat_podcast
from app.users import current_active_user
from app.utils.check_ownership import check_ownership
from app.utils.rbac import check_permission
router = APIRouter()
@ -27,8 +35,18 @@ async def create_podcast(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""
Create a new podcast.
Requires PODCASTS_CREATE permission.
"""
try:
await check_ownership(session, SearchSpace, podcast.search_space_id, user)
await check_permission(
session,
user,
podcast.search_space_id,
Permission.PODCASTS_CREATE.value,
"You don't have permission to create podcasts in this search space",
)
db_podcast = Podcast(**podcast.model_dump())
session.add(db_podcast)
await session.commit()
@ -58,20 +76,45 @@ async def create_podcast(
async def read_podcasts(
skip: int = 0,
limit: int = 100,
search_space_id: int | None = None,
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""
List podcasts the user has access to.
Requires PODCASTS_READ permission for the search space(s).
"""
if skip < 0 or limit < 1:
raise HTTPException(status_code=400, detail="Invalid pagination parameters")
try:
result = await session.execute(
select(Podcast)
.join(SearchSpace)
.filter(SearchSpace.user_id == user.id)
.offset(skip)
.limit(limit)
)
if search_space_id is not None:
# Check permission for specific search space
await check_permission(
session,
user,
search_space_id,
Permission.PODCASTS_READ.value,
"You don't have permission to read podcasts in this search space",
)
result = await session.execute(
select(Podcast)
.filter(Podcast.search_space_id == search_space_id)
.offset(skip)
.limit(limit)
)
else:
# Get podcasts from all search spaces user has membership in
result = await session.execute(
select(Podcast)
.join(SearchSpace)
.join(SearchSpaceMembership)
.filter(SearchSpaceMembership.user_id == user.id)
.offset(skip)
.limit(limit)
)
return result.scalars().all()
except HTTPException:
raise
except SQLAlchemyError:
raise HTTPException(
status_code=500, detail="Database error occurred while fetching podcasts"
@ -84,18 +127,29 @@ async def read_podcast(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""
Get a specific podcast by ID.
Requires PODCASTS_READ permission for the search space.
"""
try:
result = await session.execute(
select(Podcast)
.join(SearchSpace)
.filter(Podcast.id == podcast_id, SearchSpace.user_id == user.id)
)
result = await session.execute(select(Podcast).filter(Podcast.id == podcast_id))
podcast = result.scalars().first()
if not podcast:
raise HTTPException(
status_code=404,
detail="Podcast not found or you don't have permission to access it",
detail="Podcast not found",
)
# Check permission for the search space
await check_permission(
session,
user,
podcast.search_space_id,
Permission.PODCASTS_READ.value,
"You don't have permission to read podcasts in this search space",
)
return podcast
except HTTPException as he:
raise he
@ -112,8 +166,26 @@ async def update_podcast(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""
Update a podcast.
Requires PODCASTS_UPDATE permission for the search space.
"""
try:
db_podcast = await read_podcast(podcast_id, session, user)
result = await session.execute(select(Podcast).filter(Podcast.id == podcast_id))
db_podcast = result.scalars().first()
if not db_podcast:
raise HTTPException(status_code=404, detail="Podcast not found")
# Check permission for the search space
await check_permission(
session,
user,
db_podcast.search_space_id,
Permission.PODCASTS_UPDATE.value,
"You don't have permission to update podcasts in this search space",
)
update_data = podcast_update.model_dump(exclude_unset=True)
for key, value in update_data.items():
setattr(db_podcast, key, value)
@ -140,8 +212,26 @@ async def delete_podcast(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""
Delete a podcast.
Requires PODCASTS_DELETE permission for the search space.
"""
try:
db_podcast = await read_podcast(podcast_id, session, user)
result = await session.execute(select(Podcast).filter(Podcast.id == podcast_id))
db_podcast = result.scalars().first()
if not db_podcast:
raise HTTPException(status_code=404, detail="Podcast not found")
# Check permission for the search space
await check_permission(
session,
user,
db_podcast.search_space_id,
Permission.PODCASTS_DELETE.value,
"You don't have permission to delete podcasts in this search space",
)
await session.delete(db_podcast)
await session.commit()
return {"message": "Podcast deleted successfully"}
@ -181,9 +271,19 @@ async def generate_podcast(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""
Generate a podcast from a chat or document.
Requires PODCASTS_CREATE permission.
"""
try:
# Check if the user owns the search space
await check_ownership(session, SearchSpace, request.search_space_id, user)
# Check if the user has permission to create podcasts
await check_permission(
session,
user,
request.search_space_id,
Permission.PODCASTS_CREATE.value,
"You don't have permission to create podcasts in this search space",
)
if request.type == "CHAT":
# Verify that all chat IDs belong to this user and search space
@ -251,22 +351,29 @@ async def stream_podcast(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""Stream a podcast audio file."""
"""
Stream a podcast audio file.
Requires PODCASTS_READ permission for the search space.
"""
try:
# Get the podcast and check if user has access
result = await session.execute(
select(Podcast)
.join(SearchSpace)
.filter(Podcast.id == podcast_id, SearchSpace.user_id == user.id)
)
result = await session.execute(select(Podcast).filter(Podcast.id == podcast_id))
podcast = result.scalars().first()
if not podcast:
raise HTTPException(
status_code=404,
detail="Podcast not found or you don't have permission to access it",
detail="Podcast not found",
)
# Check permission for the search space
await check_permission(
session,
user,
podcast.search_space_id,
Permission.PODCASTS_READ.value,
"You don't have permission to access podcasts in this search space",
)
# Get the file path
file_path = podcast.file_location
@ -303,12 +410,30 @@ async def get_podcast_by_chat_id(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""
Get a podcast by its associated chat ID.
Requires PODCASTS_READ permission for the search space.
"""
try:
# Get the podcast and check if user has access
# First get the chat to find its search space
chat_result = await session.execute(select(Chat).filter(Chat.id == chat_id))
chat = chat_result.scalars().first()
if not chat:
return None
# Check permission for the search space
await check_permission(
session,
user,
chat.search_space_id,
Permission.PODCASTS_READ.value,
"You don't have permission to read podcasts in this search space",
)
# Get the podcast
result = await session.execute(
select(Podcast)
.join(SearchSpace)
.filter(Podcast.chat_id == chat_id, SearchSpace.user_id == user.id)
select(Podcast).filter(Podcast.chat_id == chat_id)
)
podcast = result.scalars().first()

File diff suppressed because it is too large

View file

@ -22,9 +22,9 @@ from sqlalchemy.future import select
from app.connectors.github_connector import GitHubConnector
from app.db import (
Permission,
SearchSourceConnector,
SearchSourceConnectorType,
SearchSpace,
User,
async_session_maker,
get_async_session,
@ -39,6 +39,7 @@ from app.tasks.connector_indexers import (
index_airtable_records,
index_clickup_tasks,
index_confluence_pages,
index_crawled_urls,
index_discord_messages,
index_elasticsearch_documents,
index_github_repos,
@ -51,12 +52,12 @@ from app.tasks.connector_indexers import (
index_slack_messages,
)
from app.users import current_active_user
from app.utils.check_ownership import check_ownership
from app.utils.periodic_scheduler import (
create_periodic_schedule,
delete_periodic_schedule,
update_periodic_schedule,
)
from app.utils.rbac import check_permission
# Set up logging
logger = logging.getLogger(__name__)
@ -107,19 +108,25 @@ async def create_search_source_connector(
):
"""
Create a new search source connector.
Requires CONNECTORS_CREATE permission.
Each search space can have only one connector of each type per user (based on search_space_id, user_id, and connector_type).
Each search space can have only one connector of each type (based on search_space_id and connector_type).
The config must contain the appropriate keys for the connector type.
"""
try:
# Check if the search space belongs to the user
await check_ownership(session, SearchSpace, search_space_id, user)
# Check if user has permission to create connectors
await check_permission(
session,
user,
search_space_id,
Permission.CONNECTORS_CREATE.value,
"You don't have permission to create connectors in this search space",
)
# Check if a connector with the same type already exists for this search space and user
# Check if a connector with the same type already exists for this search space
result = await session.execute(
select(SearchSourceConnector).filter(
SearchSourceConnector.search_space_id == search_space_id,
SearchSourceConnector.user_id == user.id,
SearchSourceConnector.connector_type == connector.connector_type,
)
)
@ -127,7 +134,7 @@ async def create_search_source_connector(
if existing_connector:
raise HTTPException(
status_code=409,
detail=f"A connector with type {connector.connector_type} already exists in this search space. Each search space can have only one connector of each type per user.",
detail=f"A connector with type {connector.connector_type} already exists in this search space.",
)
# Prepare connector data
@ -197,22 +204,34 @@ async def read_search_source_connectors(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""List all search source connectors for the current user, optionally filtered by search space."""
"""
List all search source connectors for a search space.
Requires CONNECTORS_READ permission.
"""
try:
query = select(SearchSourceConnector).filter(
SearchSourceConnector.user_id == user.id
if search_space_id is None:
raise HTTPException(
status_code=400,
detail="search_space_id is required",
)
# Check if user has permission to read connectors
await check_permission(
session,
user,
search_space_id,
Permission.CONNECTORS_READ.value,
"You don't have permission to view connectors in this search space",
)
# Filter by search_space_id if provided
if search_space_id is not None:
# Verify the search space belongs to the user
await check_ownership(session, SearchSpace, search_space_id, user)
query = query.filter(
SearchSourceConnector.search_space_id == search_space_id
)
query = select(SearchSourceConnector).filter(
SearchSourceConnector.search_space_id == search_space_id
)
result = await session.execute(query.offset(skip).limit(limit))
return result.scalars().all()
except HTTPException:
raise
except Exception as e:
raise HTTPException(
status_code=500,
@ -228,9 +247,32 @@ async def read_search_source_connector(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""Get a specific search source connector by ID."""
"""
Get a specific search source connector by ID.
Requires CONNECTORS_READ permission.
"""
try:
return await check_ownership(session, SearchSourceConnector, connector_id, user)
# Get the connector first
result = await session.execute(
select(SearchSourceConnector).filter(
SearchSourceConnector.id == connector_id
)
)
connector = result.scalars().first()
if not connector:
raise HTTPException(status_code=404, detail="Connector not found")
# Check permission
await check_permission(
session,
user,
connector.search_space_id,
Permission.CONNECTORS_READ.value,
"You don't have permission to view this connector",
)
return connector
except HTTPException:
raise
except Exception as e:
@ -250,10 +292,25 @@ async def update_search_source_connector(
):
"""
Update a search source connector.
Requires CONNECTORS_UPDATE permission.
Handles partial updates, including merging changes into the 'config' field.
"""
db_connector = await check_ownership(
session, SearchSourceConnector, connector_id, user
# Get the connector first
result = await session.execute(
select(SearchSourceConnector).filter(SearchSourceConnector.id == connector_id)
)
db_connector = result.scalars().first()
if not db_connector:
raise HTTPException(status_code=404, detail="Connector not found")
# Check permission
await check_permission(
session,
user,
db_connector.search_space_id,
Permission.CONNECTORS_UPDATE.value,
"You don't have permission to update this connector",
)
# Convert the sparse update data (only fields present in request) to a dict
@ -348,20 +405,19 @@ async def update_search_source_connector(
for key, value in update_data.items():
# Prevent changing connector_type if it causes a duplicate (check moved here)
if key == "connector_type" and value != db_connector.connector_type:
result = await session.execute(
check_result = await session.execute(
select(SearchSourceConnector).filter(
SearchSourceConnector.search_space_id
== db_connector.search_space_id,
SearchSourceConnector.user_id == user.id,
SearchSourceConnector.connector_type == value,
SearchSourceConnector.id != connector_id,
)
)
existing_connector = result.scalars().first()
existing_connector = check_result.scalars().first()
if existing_connector:
raise HTTPException(
status_code=409,
detail=f"A connector with type {value} already exists in this search space. Each search space can have only one connector of each type per user.",
detail=f"A connector with type {value} already exists in this search space.",
)
setattr(db_connector, key, value)
@ -424,10 +480,29 @@ async def delete_search_source_connector(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""Delete a search source connector."""
"""
Delete a search source connector.
Requires CONNECTORS_DELETE permission.
"""
try:
db_connector = await check_ownership(
session, SearchSourceConnector, connector_id, user
# Get the connector first
result = await session.execute(
select(SearchSourceConnector).filter(
SearchSourceConnector.id == connector_id
)
)
db_connector = result.scalars().first()
if not db_connector:
raise HTTPException(status_code=404, detail="Connector not found")
# Check permission
await check_permission(
session,
user,
db_connector.search_space_id,
Permission.CONNECTORS_DELETE.value,
"You don't have permission to delete this connector",
)
# Delete any periodic schedule associated with this connector
@ -472,6 +547,7 @@ async def index_connector_content(
):
"""
Index content from a connector to a search space.
Requires CONNECTORS_UPDATE permission (to trigger indexing).
Currently supports:
- SLACK_CONNECTOR: Indexes messages from all accessible Slack channels
@ -482,24 +558,34 @@ async def index_connector_content(
- DISCORD_CONNECTOR: Indexes messages from all accessible Discord channels
- LUMA_CONNECTOR: Indexes events from Luma
- ELASTICSEARCH_CONNECTOR: Indexes documents from Elasticsearch
- WEBCRAWLER_CONNECTOR: Indexes web pages from crawled websites
Args:
connector_id: ID of the connector to use
search_space_id: ID of the search space to store indexed content
background_tasks: FastAPI background tasks
Returns:
Dictionary with indexing status
"""
try:
# Check if the connector belongs to the user
connector = await check_ownership(
session, SearchSourceConnector, connector_id, user
# Get the connector first
result = await session.execute(
select(SearchSourceConnector).filter(
SearchSourceConnector.id == connector_id
)
)
connector = result.scalars().first()
# Check if the search space belongs to the user
_search_space = await check_ownership(
session, SearchSpace, search_space_id, user
if not connector:
raise HTTPException(status_code=404, detail="Connector not found")
# Check if user has permission to update connectors (indexing is an update operation)
await check_permission(
session,
user,
search_space_id,
Permission.CONNECTORS_UPDATE.value,
"You don't have permission to index content in this search space",
)
# Handle different connector types
@ -688,6 +774,17 @@ async def index_connector_content(
)
response_message = "Elasticsearch indexing started in the background."
elif connector.connector_type == SearchSourceConnectorType.WEBCRAWLER_CONNECTOR:
from app.tasks.celery_tasks.connector_tasks import index_crawled_urls_task
logger.info(
f"Triggering web pages indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}"
)
index_crawled_urls_task.delay(
connector_id, search_space_id, str(user.id), indexing_from, indexing_to
)
response_message = "Web page indexing started in the background."
else:
raise HTTPException(
status_code=400,
@ -1523,3 +1620,64 @@ async def run_elasticsearch_indexing(
f"Critical error in run_elasticsearch_indexing for connector {connector_id}: {e}",
exc_info=True,
)
# Add new helper functions for crawled web page indexing
async def run_web_page_indexing_with_new_session(
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""
Create a new session and run the web page indexing task.
This prevents session leaks by creating a dedicated session for the background task.
"""
async with async_session_maker() as session:
await run_web_page_indexing(
session, connector_id, search_space_id, user_id, start_date, end_date
)
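As a usage sketch (assuming a FastAPI handler that has already resolved the connector, search space, and date bounds), the session-per-task wrapper would be scheduled like this rather than reusing the request session:

```python
# Hypothetical caller; the route in this commit actually dispatches the
# Celery task, but the wrapper also works with plain FastAPI background tasks.
from fastapi import BackgroundTasks


def schedule_web_page_indexing(
    background_tasks: BackgroundTasks,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    indexing_from: str,
    indexing_to: str,
) -> None:
    # The task opens its own session, so the request session can close freely.
    background_tasks.add_task(
        run_web_page_indexing_with_new_session,
        connector_id,
        search_space_id,
        user_id,
        indexing_from,
        indexing_to,
    )
```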
async def run_web_page_indexing(
session: AsyncSession,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""
Background task to run web page indexing.
Args:
session: Database session
connector_id: ID of the webcrawler connector
search_space_id: ID of the search space
user_id: ID of the user
start_date: Start date for indexing
end_date: End date for indexing
"""
try:
documents_processed, error_or_warning = await index_crawled_urls(
session=session,
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
start_date=start_date,
end_date=end_date,
update_last_indexed=False, # Don't update timestamp in the indexing function
)
# Only update last_indexed_at if indexing was successful (either new docs or updated docs)
if documents_processed > 0:
await update_connector_last_indexed(session, connector_id)
logger.info(
f"Web page indexing completed successfully: {documents_processed} documents processed"
)
else:
logger.error(
f"Web page indexing failed or no documents processed: {error_or_warning}"
)
except Exception as e:
logger.error(f"Error in background Web page indexing task: {e!s}")

View file

@ -1,18 +1,77 @@
import logging
from pathlib import Path
import yaml
from fastapi import APIRouter, Depends, HTTPException
from sqlalchemy import func
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from app.db import SearchSpace, User, get_async_session
from app.schemas import SearchSpaceCreate, SearchSpaceRead, SearchSpaceUpdate
from app.db import (
Permission,
SearchSpace,
SearchSpaceMembership,
SearchSpaceRole,
User,
get_async_session,
get_default_roles_config,
)
from app.schemas import (
SearchSpaceCreate,
SearchSpaceRead,
SearchSpaceUpdate,
SearchSpaceWithStats,
)
from app.users import current_active_user
from app.utils.check_ownership import check_ownership
from app.utils.rbac import check_permission, check_search_space_access
logger = logging.getLogger(__name__)
router = APIRouter()
async def create_default_roles_and_membership(
session: AsyncSession,
search_space_id: int,
owner_user_id,
) -> None:
"""
Create default system roles for a search space and add the owner as a member.
Args:
session: Database session
search_space_id: The ID of the newly created search space
owner_user_id: The UUID of the user who created the search space
"""
# Create default roles
default_roles = get_default_roles_config()
owner_role_id = None
for role_config in default_roles:
db_role = SearchSpaceRole(
name=role_config["name"],
description=role_config["description"],
permissions=role_config["permissions"],
is_default=role_config["is_default"],
is_system_role=role_config["is_system_role"],
search_space_id=search_space_id,
)
session.add(db_role)
await session.flush() # Get the ID
if role_config["name"] == "Owner":
owner_role_id = db_role.id
# Create owner membership
owner_membership = SearchSpaceMembership(
user_id=owner_user_id,
search_space_id=search_space_id,
role_id=owner_role_id,
is_owner=True,
)
session.add(owner_membership)
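`get_default_roles_config()` is imported from `app.db` and not shown in this diff. Based on the roles this commit names (Owner, Admin, Editor, Viewer), it plausibly returns a list of dicts shaped like the sketch below; the exact permission strings are assumptions:

```python
# Hypothetical return value of get_default_roles_config(); the keys match how
# role_config is consumed above, but the permission values are illustrative.
def get_default_roles_config() -> list[dict]:
    return [
        {
            "name": "Owner",
            "description": "Full control of the search space",
            "permissions": ["*"],  # assumed: owners bypass permission checks
            "is_default": False,
            "is_system_role": True,
        },
        {
            "name": "Viewer",
            "description": "Read-only access",
            "permissions": ["documents:read", "chats:read", "podcasts:read"],
            "is_default": True,
            "is_system_role": True,
        },
        # ... Admin and Editor omitted for brevity
    ]
```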
@router.post("/searchspaces", response_model=SearchSpaceRead)
async def create_search_space(
search_space: SearchSpaceCreate,
@ -27,6 +86,11 @@ async def create_search_space(
db_search_space = SearchSpace(**search_space_data, user_id=user.id)
session.add(db_search_space)
await session.flush() # Get the search space ID
# Create default roles and owner membership
await create_default_roles_and_membership(session, db_search_space.id, user.id)
await session.commit()
await session.refresh(db_search_space)
return db_search_space
@ -34,26 +98,86 @@ async def create_search_space(
raise
except Exception as e:
await session.rollback()
logger.error(f"Failed to create search space: {e!s}", exc_info=True)
raise HTTPException(
status_code=500, detail=f"Failed to create search space: {e!s}"
) from e
@router.get("/searchspaces", response_model=list[SearchSpaceRead])
@router.get("/searchspaces", response_model=list[SearchSpaceWithStats])
async def read_search_spaces(
skip: int = 0,
limit: int = 200,
owned_only: bool = False,
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""
Get all search spaces the user has access to, with member count and ownership info.
Args:
skip: Number of items to skip
limit: Maximum number of items to return
owned_only: If True, only return search spaces owned by the user.
If False (default), return all search spaces the user has access to.
"""
try:
result = await session.execute(
select(SearchSpace)
.filter(SearchSpace.user_id == user.id)
.offset(skip)
.limit(limit)
)
return result.scalars().all()
if owned_only:
# Return only search spaces where user is the original creator (user_id)
result = await session.execute(
select(SearchSpace)
.filter(SearchSpace.user_id == user.id)
.offset(skip)
.limit(limit)
)
else:
# Return all search spaces the user has membership in
result = await session.execute(
select(SearchSpace)
.join(SearchSpaceMembership)
.filter(SearchSpaceMembership.user_id == user.id)
.offset(skip)
.limit(limit)
)
search_spaces = result.scalars().all()
# Get member counts and ownership info for each search space
search_spaces_with_stats = []
for space in search_spaces:
# Get member count
count_result = await session.execute(
select(func.count(SearchSpaceMembership.id)).filter(
SearchSpaceMembership.search_space_id == space.id
)
)
member_count = count_result.scalar() or 1
# Check if current user is owner
ownership_result = await session.execute(
select(SearchSpaceMembership).filter(
SearchSpaceMembership.search_space_id == space.id,
SearchSpaceMembership.user_id == user.id,
SearchSpaceMembership.is_owner == True, # noqa: E712
)
)
is_owner = ownership_result.scalars().first() is not None
search_spaces_with_stats.append(
SearchSpaceWithStats(
id=space.id,
name=space.name,
description=space.description,
created_at=space.created_at,
user_id=space.user_id,
citations_enabled=space.citations_enabled,
qna_custom_instructions=space.qna_custom_instructions,
member_count=member_count,
is_owner=is_owner,
)
)
return search_spaces_with_stats
except Exception as e:
raise HTTPException(
status_code=500, detail=f"Failed to fetch search spaces: {e!s}"
@ -97,10 +221,22 @@ async def read_search_space(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""
Get a specific search space by ID.
Requires SETTINGS_VIEW permission or membership.
"""
try:
search_space = await check_ownership(
session, SearchSpace, search_space_id, user
# Check if user has access (is a member)
await check_search_space_access(session, user, search_space_id)
result = await session.execute(
select(SearchSpace).filter(SearchSpace.id == search_space_id)
)
search_space = result.scalars().first()
if not search_space:
raise HTTPException(status_code=404, detail="Search space not found")
return search_space
except HTTPException:
@ -118,10 +254,28 @@ async def update_search_space(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""
Update a search space.
Requires SETTINGS_UPDATE permission.
"""
try:
db_search_space = await check_ownership(
session, SearchSpace, search_space_id, user
# Check permission
await check_permission(
session,
user,
search_space_id,
Permission.SETTINGS_UPDATE.value,
"You don't have permission to update this search space",
)
result = await session.execute(
select(SearchSpace).filter(SearchSpace.id == search_space_id)
)
db_search_space = result.scalars().first()
if not db_search_space:
raise HTTPException(status_code=404, detail="Search space not found")
update_data = search_space_update.model_dump(exclude_unset=True)
for key, value in update_data.items():
setattr(db_search_space, key, value)
@ -143,10 +297,28 @@ async def delete_search_space(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""
Delete a search space.
Requires SETTINGS_DELETE permission (only owners have this by default).
"""
try:
db_search_space = await check_ownership(
session, SearchSpace, search_space_id, user
# Check permission - only those with SETTINGS_DELETE can delete
await check_permission(
session,
user,
search_space_id,
Permission.SETTINGS_DELETE.value,
"You don't have permission to delete this search space",
)
result = await session.execute(
select(SearchSpace).filter(SearchSpace.id == search_space_id)
)
db_search_space = result.scalars().first()
if not db_search_space:
raise HTTPException(status_code=404, detail="Search space not found")
await session.delete(db_search_space)
await session.commit()
return {"message": "Search space deleted successfully"}

View file

@ -27,6 +27,23 @@ from .podcasts import (
PodcastRead,
PodcastUpdate,
)
from .rbac_schemas import (
InviteAcceptRequest,
InviteAcceptResponse,
InviteCreate,
InviteInfoResponse,
InviteRead,
InviteUpdate,
MembershipRead,
MembershipReadWithUser,
MembershipUpdate,
PermissionInfo,
PermissionsListResponse,
RoleCreate,
RoleRead,
RoleUpdate,
UserSearchSpaceAccess,
)
from .search_source_connector import (
SearchSourceConnectorBase,
SearchSourceConnectorCreate,
@ -38,6 +55,7 @@ from .search_space import (
SearchSpaceCreate,
SearchSpaceRead,
SearchSpaceUpdate,
SearchSpaceWithStats,
)
from .users import UserCreate, UserRead, UserUpdate
@ -60,6 +78,13 @@ __all__ = [
"ExtensionDocumentContent",
"ExtensionDocumentMetadata",
"IDModel",
# RBAC schemas
"InviteAcceptRequest",
"InviteAcceptResponse",
"InviteCreate",
"InviteInfoResponse",
"InviteRead",
"InviteUpdate",
"LLMConfigBase",
"LLMConfigCreate",
"LLMConfigRead",
@ -69,12 +94,20 @@ __all__ = [
"LogFilter",
"LogRead",
"LogUpdate",
"MembershipRead",
"MembershipReadWithUser",
"MembershipUpdate",
"PaginatedResponse",
"PermissionInfo",
"PermissionsListResponse",
"PodcastBase",
"PodcastCreate",
"PodcastGenerateRequest",
"PodcastRead",
"PodcastUpdate",
"RoleCreate",
"RoleRead",
"RoleUpdate",
"SearchSourceConnectorBase",
"SearchSourceConnectorCreate",
"SearchSourceConnectorRead",
@ -83,8 +116,10 @@ __all__ = [
"SearchSpaceCreate",
"SearchSpaceRead",
"SearchSpaceUpdate",
"SearchSpaceWithStats",
"TimestampModel",
"UserCreate",
"UserRead",
"UserSearchSpaceAccess",
"UserUpdate",
]

View file

@ -0,0 +1,186 @@
"""
Pydantic schemas for RBAC (Role-Based Access Control) endpoints.
"""
from datetime import datetime
from uuid import UUID
from pydantic import BaseModel, Field
# ============ Role Schemas ============
class RoleBase(BaseModel):
"""Base schema for roles."""
name: str = Field(..., min_length=1, max_length=100)
description: str | None = Field(None, max_length=500)
permissions: list[str] = Field(default_factory=list)
is_default: bool = False
class RoleCreate(RoleBase):
"""Schema for creating a new role."""
pass
class RoleUpdate(BaseModel):
"""Schema for updating a role (partial update)."""
name: str | None = Field(None, min_length=1, max_length=100)
description: str | None = Field(None, max_length=500)
permissions: list[str] | None = None
is_default: bool | None = None
class RoleRead(RoleBase):
"""Schema for reading a role."""
id: int
search_space_id: int
is_system_role: bool
created_at: datetime
class Config:
from_attributes = True
# ============ Membership Schemas ============
class MembershipBase(BaseModel):
"""Base schema for memberships."""
pass
class MembershipUpdate(BaseModel):
"""Schema for updating a membership (change role)."""
role_id: int | None = None
class MembershipRead(BaseModel):
"""Schema for reading a membership."""
id: int
user_id: UUID
search_space_id: int
role_id: int | None
is_owner: bool
joined_at: datetime
created_at: datetime
# Nested role info
role: RoleRead | None = None
# User email (populated separately)
user_email: str | None = None
class Config:
from_attributes = True
class MembershipReadWithUser(MembershipRead):
"""Schema for reading a membership with user details."""
user_email: str | None = None
user_is_active: bool | None = None
# ============ Invite Schemas ============
class InviteBase(BaseModel):
"""Base schema for invites."""
name: str | None = Field(None, max_length=100)
role_id: int | None = None
expires_at: datetime | None = None
max_uses: int | None = Field(None, ge=1)
class InviteCreate(InviteBase):
"""Schema for creating a new invite."""
pass
class InviteUpdate(BaseModel):
"""Schema for updating an invite (partial update)."""
name: str | None = Field(None, max_length=100)
role_id: int | None = None
expires_at: datetime | None = None
max_uses: int | None = Field(None, ge=1)
is_active: bool | None = None
class InviteRead(InviteBase):
"""Schema for reading an invite."""
id: int
invite_code: str
search_space_id: int
created_by_id: UUID | None
uses_count: int
is_active: bool
created_at: datetime
# Nested role info
role: RoleRead | None = None
class Config:
from_attributes = True
class InviteAcceptRequest(BaseModel):
"""Schema for accepting an invite."""
invite_code: str = Field(..., min_length=1)
class InviteAcceptResponse(BaseModel):
"""Response schema for accepting an invite."""
message: str
search_space_id: int
search_space_name: str
role_name: str | None
class InviteInfoResponse(BaseModel):
"""Response schema for getting invite info (public endpoint)."""
search_space_name: str
role_name: str | None
is_valid: bool
message: str | None = None
# ============ Permission Schemas ============
class PermissionInfo(BaseModel):
"""Schema for permission information."""
value: str
name: str
category: str
class PermissionsListResponse(BaseModel):
"""Response schema for listing all available permissions."""
permissions: list[PermissionInfo]
# ============ User Access Info ============
class UserSearchSpaceAccess(BaseModel):
"""Schema for user's access info in a search space."""
search_space_id: int
search_space_name: str
is_owner: bool
role_name: str | None
permissions: list[str]
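For illustration, a custom role and an invite built from these schemas might look like the following; all field values are made up:

```python
# Illustrative payloads only; the role_id and permission strings are invented.
from datetime import datetime, timedelta

from app.schemas import InviteCreate, RoleCreate

role = RoleCreate(
    name="Reviewer",
    description="Can read documents and chats",
    permissions=["documents:read", "chats:read"],  # illustrative strings
    is_default=False,
)

invite = InviteCreate(
    name="Q1 onboarding link",
    role_id=42,  # hypothetical role ID
    expires_at=datetime.utcnow() + timedelta(days=7),
    max_uses=10,
)
```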

View file

@ -34,3 +34,10 @@ class SearchSpaceRead(SearchSpaceBase, IDModel, TimestampModel):
qna_custom_instructions: str | None = None
model_config = ConfigDict(from_attributes=True)
class SearchSpaceWithStats(SearchSpaceRead):
"""Extended search space info with member count and ownership status."""
member_count: int = 1
is_owner: bool = False

View file

@ -15,18 +15,17 @@ from app.db import (
Document,
SearchSourceConnector,
SearchSourceConnectorType,
SearchSpace,
)
from app.retriver.chunks_hybrid_search import ChucksHybridSearchRetriever
from app.retriver.documents_hybrid_search import DocumentHybridSearchRetriever
class ConnectorService:
def __init__(self, session: AsyncSession, user_id: str | None = None):
def __init__(self, session: AsyncSession, search_space_id: int | None = None):
self.session = session
self.chunk_retriever = ChucksHybridSearchRetriever(session)
self.document_retriever = DocumentHybridSearchRetriever(session)
self.user_id = user_id
self.search_space_id = search_space_id
self.source_id_counter = (
100000 # High starting value to avoid collisions with existing IDs
)
@ -36,23 +35,22 @@ class ConnectorService:
async def initialize_counter(self):
"""
Initialize the source_id_counter based on the total number of chunks for the user.
Initialize the source_id_counter based on the total number of chunks for the search space.
This ensures unique IDs across different sessions.
"""
if self.user_id:
if self.search_space_id:
try:
# Count total chunks for documents belonging to this user
# Count total chunks for documents belonging to this search space
result = await self.session.execute(
select(func.count(Chunk.id))
.join(Document)
.join(SearchSpace)
.filter(SearchSpace.user_id == self.user_id)
.filter(Document.search_space_id == self.search_space_id)
)
chunk_count = result.scalar() or 0
self.source_id_counter = chunk_count + 1
print(
f"Initialized source_id_counter to {self.source_id_counter} for user {self.user_id}"
f"Initialized source_id_counter to {self.source_id_counter} for search space {self.search_space_id}"
)
except Exception as e:
print(f"Error initializing source_id_counter: {e!s}")
@ -62,7 +60,6 @@ class ConnectorService:
async def search_crawled_urls(
self,
user_query: str,
user_id: str,
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
@ -70,6 +67,12 @@ class ConnectorService:
"""
Search for crawled URLs and return both the source information and langchain documents
Args:
user_query: The user's query
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
search_mode: Search mode (CHUNKS or DOCUMENTS)
Returns:
tuple: (sources_info, langchain_documents)
"""
@ -77,7 +80,6 @@ class ConnectorService:
crawled_urls_chunks = await self.chunk_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="CRAWLED_URL",
)
@ -85,7 +87,6 @@ class ConnectorService:
crawled_urls_chunks = await self.document_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="CRAWLED_URL",
)
@ -109,15 +110,43 @@ class ConnectorService:
document = chunk.get("document", {})
metadata = document.get("metadata", {})
# Create a source entry
# Extract webcrawler-specific metadata
url = metadata.get("source", metadata.get("url", ""))
title = document.get(
"title", metadata.get("title", "Untitled Document")
)
description = metadata.get("description", "")
language = metadata.get("language", "")
last_crawled_at = metadata.get("last_crawled_at", "")
# Build description with crawler info
content_preview = chunk.get("content", "")
if not description and content_preview:
# Use content preview if no description
description = content_preview[:200]
if len(content_preview) > 200:
description += "..."
# Add crawler metadata to description if available
info_parts = []
if language:
info_parts.append(f"Language: {language}")
if last_crawled_at:
info_parts.append(f"Last crawled: {last_crawled_at}")
if info_parts:
if description:
description += f" | {' | '.join(info_parts)}"
else:
description = " | ".join(info_parts)
source = {
"id": chunk.get("chunk_id", self.source_id_counter),
"title": document.get("title", "Untitled Document"),
"description": metadata.get(
"og:description",
metadata.get("ogDescription", chunk.get("content", "")),
),
"url": metadata.get("url", ""),
"title": title,
"description": description,
"url": url,
"language": language,
"last_crawled_at": last_crawled_at,
}
self.source_id_counter += 1
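As a concrete illustration of the description assembly above (values invented): a chunk whose metadata carries `language: "en"` and `last_crawled_at: "2025-11-28"` but no page description ends up with the first 200 characters of the chunk content, an ellipsis if truncated, then the crawler info appended, e.g. `"...content preview... | Language: en | Last crawled: 2025-11-28"`.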
@ -136,7 +165,6 @@ class ConnectorService:
async def search_files(
self,
user_query: str,
user_id: str,
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
@ -151,7 +179,6 @@ class ConnectorService:
files_chunks = await self.chunk_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="FILE",
)
@ -159,7 +186,6 @@ class ConnectorService:
files_chunks = await self.document_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="FILE",
)
@ -239,43 +265,35 @@ class ConnectorService:
async def get_connector_by_type(
self,
user_id: str,
connector_type: SearchSourceConnectorType,
search_space_id: int | None = None,
search_space_id: int,
) -> SearchSourceConnector | None:
"""
Get a connector by type for a specific user and optionally a search space
Get a connector by type for a specific search space
Args:
user_id: The user's ID
connector_type: The connector type to retrieve
search_space_id: Optional search space ID to filter by
search_space_id: The search space ID to filter by
Returns:
SearchSourceConnector | None: The connector if found, None otherwise
"""
query = select(SearchSourceConnector).filter(
SearchSourceConnector.user_id == user_id,
SearchSourceConnector.search_space_id == search_space_id,
SearchSourceConnector.connector_type == connector_type,
)
if search_space_id is not None:
query = query.filter(
SearchSourceConnector.search_space_id == search_space_id
)
result = await self.session.execute(query)
return result.scalars().first()
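After this change a caller resolves connectors purely by search space, for example (a hypothetical call site; the module path for `ConnectorService` is assumed from context):

```python
# Hypothetical usage of the search-space-scoped service.
from app.db import SearchSourceConnectorType


async def find_tavily_connector(session, search_space_id: int):
    service = ConnectorService(session, search_space_id=search_space_id)
    await service.initialize_counter()
    # No user_id argument anymore; connectors belong to the search space.
    return await service.get_connector_by_type(
        SearchSourceConnectorType.TAVILY_API, search_space_id
    )
```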
async def search_tavily(
self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20
self, user_query: str, search_space_id: int, top_k: int = 20
) -> tuple:
"""
Search using Tavily API and return both the source information and documents
Args:
user_query: The user's query
user_id: The user's ID
search_space_id: The search space ID
top_k: Maximum number of results to return
@ -284,7 +302,7 @@ class ConnectorService:
"""
# Get Tavily connector configuration
tavily_connector = await self.get_connector_by_type(
user_id, SearchSourceConnectorType.TAVILY_API, search_space_id
SearchSourceConnectorType.TAVILY_API, search_space_id
)
if not tavily_connector:
@ -377,7 +395,6 @@ class ConnectorService:
async def search_searxng(
self,
user_query: str,
user_id: str,
search_space_id: int,
top_k: int = 20,
) -> tuple:
@ -385,7 +402,7 @@ class ConnectorService:
Search using a configured SearxNG instance and return both sources and documents.
"""
searx_connector = await self.get_connector_by_type(
user_id, SearchSourceConnectorType.SEARXNG_API, search_space_id
SearchSourceConnectorType.SEARXNG_API, search_space_id
)
if not searx_connector:
@ -563,7 +580,6 @@ class ConnectorService:
async def search_baidu(
self,
user_query: str,
user_id: str,
search_space_id: int,
top_k: int = 20,
) -> tuple:
@ -575,7 +591,6 @@ class ConnectorService:
Args:
user_query: User's search query
user_id: User ID
search_space_id: Search space ID
top_k: Maximum number of results to return
@ -584,7 +599,7 @@ class ConnectorService:
"""
# Get Baidu connector configuration
baidu_connector = await self.get_connector_by_type(
user_id, SearchSourceConnectorType.BAIDU_SEARCH_API, search_space_id
SearchSourceConnectorType.BAIDU_SEARCH_API, search_space_id
)
if not baidu_connector:
@ -789,7 +804,6 @@ class ConnectorService:
async def search_slack(
self,
user_query: str,
user_id: str,
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
@ -804,7 +818,6 @@ class ConnectorService:
slack_chunks = await self.chunk_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="SLACK_CONNECTOR",
)
@ -812,7 +825,6 @@ class ConnectorService:
slack_chunks = await self.document_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="SLACK_CONNECTOR",
)
@ -877,7 +889,6 @@ class ConnectorService:
async def search_notion(
self,
user_query: str,
user_id: str,
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
@ -887,7 +898,6 @@ class ConnectorService:
Args:
user_query: The user's query
user_id: The user's ID
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
@ -898,7 +908,6 @@ class ConnectorService:
notion_chunks = await self.chunk_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="NOTION_CONNECTOR",
)
@ -906,7 +915,6 @@ class ConnectorService:
notion_chunks = await self.document_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="NOTION_CONNECTOR",
)
@ -974,7 +982,6 @@ class ConnectorService:
async def search_extension(
self,
user_query: str,
user_id: str,
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
@ -984,7 +991,6 @@ class ConnectorService:
Args:
user_query: The user's query
user_id: The user's ID
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
@ -995,7 +1001,6 @@ class ConnectorService:
extension_chunks = await self.chunk_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="EXTENSION",
)
@ -1003,7 +1008,6 @@ class ConnectorService:
extension_chunks = await self.document_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="EXTENSION",
)
@ -1095,7 +1099,6 @@ class ConnectorService:
async def search_youtube(
self,
user_query: str,
user_id: str,
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
@ -1105,7 +1108,6 @@ class ConnectorService:
Args:
user_query: The user's query
user_id: The user's ID
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
@ -1116,7 +1118,6 @@ class ConnectorService:
youtube_chunks = await self.chunk_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="YOUTUBE_VIDEO",
)
@ -1124,7 +1125,6 @@ class ConnectorService:
youtube_chunks = await self.document_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="YOUTUBE_VIDEO",
)
@ -1192,7 +1192,6 @@ class ConnectorService:
async def search_github(
self,
user_query: str,
user_id: int,
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
@ -1207,7 +1206,6 @@ class ConnectorService:
github_chunks = await self.chunk_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="GITHUB_CONNECTOR",
)
@ -1215,7 +1213,6 @@ class ConnectorService:
github_chunks = await self.document_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="GITHUB_CONNECTOR",
)
@ -1267,7 +1264,6 @@ class ConnectorService:
async def search_linear(
self,
user_query: str,
user_id: str,
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
@ -1277,7 +1273,6 @@ class ConnectorService:
Args:
user_query: The user's query
user_id: The user's ID
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
@ -1288,7 +1283,6 @@ class ConnectorService:
linear_chunks = await self.chunk_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="LINEAR_CONNECTOR",
)
@ -1296,7 +1290,6 @@ class ConnectorService:
linear_chunks = await self.document_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="LINEAR_CONNECTOR",
)
@ -1376,7 +1369,6 @@ class ConnectorService:
async def search_jira(
self,
user_query: str,
user_id: str,
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
@ -1386,7 +1378,6 @@ class ConnectorService:
Args:
user_query: The user's query
user_id: The user's ID
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
search_mode: Search mode (CHUNKS or DOCUMENTS)
@ -1398,7 +1389,6 @@ class ConnectorService:
jira_chunks = await self.chunk_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="JIRA_CONNECTOR",
)
@ -1406,7 +1396,6 @@ class ConnectorService:
jira_chunks = await self.document_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="JIRA_CONNECTOR",
)
@ -1497,7 +1486,6 @@ class ConnectorService:
async def search_google_calendar(
self,
user_query: str,
user_id: str,
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
@ -1507,7 +1495,6 @@ class ConnectorService:
Args:
user_query: The user's query
user_id: The user's ID
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
search_mode: Search mode (CHUNKS or DOCUMENTS)
@ -1519,7 +1506,6 @@ class ConnectorService:
calendar_chunks = await self.chunk_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="GOOGLE_CALENDAR_CONNECTOR",
)
@ -1527,7 +1513,6 @@ class ConnectorService:
calendar_chunks = await self.document_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="GOOGLE_CALENDAR_CONNECTOR",
)
@ -1630,7 +1615,6 @@ class ConnectorService:
async def search_airtable(
self,
user_query: str,
user_id: str,
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
@ -1640,7 +1624,6 @@ class ConnectorService:
Args:
user_query: The user's query
user_id: The user's ID
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
search_mode: Search mode (CHUNKS or DOCUMENTS)
@ -1652,7 +1635,6 @@ class ConnectorService:
airtable_chunks = await self.chunk_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="AIRTABLE_CONNECTOR",
)
@ -1660,7 +1642,6 @@ class ConnectorService:
airtable_chunks = await self.document_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="AIRTABLE_CONNECTOR",
)
@ -1718,7 +1699,6 @@ class ConnectorService:
async def search_google_gmail(
self,
user_query: str,
user_id: str,
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
@ -1728,7 +1708,6 @@ class ConnectorService:
Args:
user_query: The user's query
user_id: The user's ID
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
search_mode: Search mode (CHUNKS or DOCUMENTS)
@ -1740,7 +1719,6 @@ class ConnectorService:
gmail_chunks = await self.chunk_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="GOOGLE_GMAIL_CONNECTOR",
)
@ -1748,7 +1726,6 @@ class ConnectorService:
gmail_chunks = await self.document_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="GOOGLE_GMAIL_CONNECTOR",
)
@ -1842,7 +1819,6 @@ class ConnectorService:
async def search_confluence(
self,
user_query: str,
user_id: str,
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
@ -1852,7 +1828,6 @@ class ConnectorService:
Args:
user_query: The user's query
user_id: The user's ID
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
search_mode: Search mode (CHUNKS or DOCUMENTS)
@ -1864,7 +1839,6 @@ class ConnectorService:
confluence_chunks = await self.chunk_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="CONFLUENCE_CONNECTOR",
)
@ -1872,7 +1846,6 @@ class ConnectorService:
confluence_chunks = await self.document_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="CONFLUENCE_CONNECTOR",
)
@ -1937,7 +1910,6 @@ class ConnectorService:
async def search_clickup(
self,
user_query: str,
user_id: str,
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
@ -1947,7 +1919,6 @@ class ConnectorService:
Args:
user_query: The user's query
user_id: The user's ID
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
search_mode: Search mode (CHUNKS or DOCUMENTS)
@ -1959,7 +1930,6 @@ class ConnectorService:
clickup_chunks = await self.chunk_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="CLICKUP_CONNECTOR",
)
@ -1967,7 +1937,6 @@ class ConnectorService:
clickup_chunks = await self.document_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="CLICKUP_CONNECTOR",
)
@ -2053,7 +2022,6 @@ class ConnectorService:
async def search_linkup(
self,
user_query: str,
user_id: str,
search_space_id: int,
mode: str = "standard",
) -> tuple:
@ -2062,7 +2030,6 @@ class ConnectorService:
Args:
user_query: The user's query
user_id: The user's ID
search_space_id: The search space ID
mode: Search depth mode, can be "standard" or "deep"
@ -2071,7 +2038,7 @@ class ConnectorService:
"""
# Get Linkup connector configuration
linkup_connector = await self.get_connector_by_type(
user_id, SearchSourceConnectorType.LINKUP_API, search_space_id
SearchSourceConnectorType.LINKUP_API, search_space_id
)
if not linkup_connector:
@ -2176,7 +2143,6 @@ class ConnectorService:
async def search_discord(
self,
user_query: str,
user_id: str,
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
@ -2186,7 +2152,6 @@ class ConnectorService:
Args:
user_query: The user's query
user_id: The user's ID
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
@ -2197,7 +2162,6 @@ class ConnectorService:
discord_chunks = await self.chunk_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="DISCORD_CONNECTOR",
)
@ -2205,7 +2169,6 @@ class ConnectorService:
discord_chunks = await self.document_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="DISCORD_CONNECTOR",
)
@ -2273,7 +2236,6 @@ class ConnectorService:
async def search_luma(
self,
user_query: str,
user_id: str,
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
@ -2283,7 +2245,6 @@ class ConnectorService:
Args:
user_query: The user's query
user_id: The user's ID
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
search_mode: Search mode (CHUNKS or DOCUMENTS)
@ -2295,7 +2256,6 @@ class ConnectorService:
luma_chunks = await self.chunk_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="LUMA_CONNECTOR",
)
@ -2303,7 +2263,6 @@ class ConnectorService:
luma_chunks = await self.document_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="LUMA_CONNECTOR",
)
@ -2431,7 +2390,6 @@ class ConnectorService:
async def search_elasticsearch(
self,
user_query: str,
user_id: str,
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
@ -2441,7 +2399,6 @@ class ConnectorService:
Args:
user_query: The user's query
user_id: The user's ID
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
search_mode: Search mode (CHUNKS or DOCUMENTS)
@ -2453,7 +2410,6 @@ class ConnectorService:
elasticsearch_chunks = await self.chunk_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="ELASTICSEARCH_CONNECTOR",
)
@ -2461,7 +2417,6 @@ class ConnectorService:
elasticsearch_chunks = await self.document_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="ELASTICSEARCH_CONNECTOR",
)

View file

@ -7,7 +7,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from app.config import config
from app.db import LLMConfig, UserSearchSpacePreference
from app.db import LLMConfig, SearchSpace
# Configure litellm to automatically drop unsupported parameters
litellm.drop_params = True
@ -144,15 +144,16 @@ async def validate_llm_config(
return False, error_msg
async def get_user_llm_instance(
session: AsyncSession, user_id: str, search_space_id: int, role: str
async def get_search_space_llm_instance(
session: AsyncSession, search_space_id: int, role: str
) -> ChatLiteLLM | None:
"""
Get a ChatLiteLLM instance for a specific user, search space, and role.
Get a ChatLiteLLM instance for a specific search space and role.
LLM preferences are stored at the search space level and shared by all members.
Args:
session: Database session
user_id: User ID
search_space_id: Search Space ID
role: LLM role ('long_context', 'fast', or 'strategic')
@ -160,37 +161,30 @@ async def get_user_llm_instance(
ChatLiteLLM instance or None if not found
"""
try:
# Get user's LLM preferences for this search space
# Get the search space with its LLM preferences
result = await session.execute(
select(UserSearchSpacePreference).where(
UserSearchSpacePreference.user_id == user_id,
UserSearchSpacePreference.search_space_id == search_space_id,
)
select(SearchSpace).where(SearchSpace.id == search_space_id)
)
preference = result.scalars().first()
search_space = result.scalars().first()
if not preference:
logger.error(
f"No LLM preferences found for user {user_id} in search space {search_space_id}"
)
if not search_space:
logger.error(f"Search space {search_space_id} not found")
return None
# Get the appropriate LLM config ID based on role
llm_config_id = None
if role == LLMRole.LONG_CONTEXT:
llm_config_id = preference.long_context_llm_id
llm_config_id = search_space.long_context_llm_id
elif role == LLMRole.FAST:
llm_config_id = preference.fast_llm_id
llm_config_id = search_space.fast_llm_id
elif role == LLMRole.STRATEGIC:
llm_config_id = preference.strategic_llm_id
llm_config_id = search_space.strategic_llm_id
else:
logger.error(f"Invalid LLM role: {role}")
return None
if not llm_config_id:
logger.error(
f"No {role} LLM configured for user {user_id} in search space {search_space_id}"
)
logger.error(f"No {role} LLM configured for search space {search_space_id}")
return None
# Check if this is a global config (negative ID)
@ -331,31 +325,63 @@ async def get_user_llm_instance(
except Exception as e:
logger.error(
f"Error getting LLM instance for user {user_id}, role {role}: {e!s}"
f"Error getting LLM instance for search space {search_space_id}, role {role}: {e!s}"
)
return None
async def get_long_context_llm(
session: AsyncSession, search_space_id: int
) -> ChatLiteLLM | None:
"""Get the search space's long context LLM instance."""
return await get_search_space_llm_instance(
session, search_space_id, LLMRole.LONG_CONTEXT
)
async def get_fast_llm(
session: AsyncSession, search_space_id: int
) -> ChatLiteLLM | None:
"""Get the search space's fast LLM instance."""
return await get_search_space_llm_instance(session, search_space_id, LLMRole.FAST)
async def get_strategic_llm(
session: AsyncSession, search_space_id: int
) -> ChatLiteLLM | None:
"""Get the search space's strategic LLM instance."""
return await get_search_space_llm_instance(
session, search_space_id, LLMRole.STRATEGIC
)
# Backward-compatible aliases (deprecated - will be removed in future versions)
async def get_user_llm_instance(
session: AsyncSession, user_id: str, search_space_id: int, role: str
) -> ChatLiteLLM | None:
"""
Deprecated: Use get_search_space_llm_instance instead.
LLM preferences are now stored at the search space level, not per-user.
"""
return await get_search_space_llm_instance(session, search_space_id, role)
async def get_user_long_context_llm(
session: AsyncSession, user_id: str, search_space_id: int
) -> ChatLiteLLM | None:
"""Get user's long context LLM instance for a specific search space."""
return await get_user_llm_instance(
session, user_id, search_space_id, LLMRole.LONG_CONTEXT
)
"""Deprecated: Use get_long_context_llm instead."""
return await get_long_context_llm(session, search_space_id)
async def get_user_fast_llm(
session: AsyncSession, user_id: str, search_space_id: int
) -> ChatLiteLLM | None:
"""Get user's fast LLM instance for a specific search space."""
return await get_user_llm_instance(session, user_id, search_space_id, LLMRole.FAST)
"""Deprecated: Use get_fast_llm instead."""
return await get_fast_llm(session, search_space_id)
async def get_user_strategic_llm(
session: AsyncSession, user_id: str, search_space_id: int
) -> ChatLiteLLM | None:
"""Get user's strategic LLM instance for a specific search space."""
return await get_user_llm_instance(
session, user_id, search_space_id, LLMRole.STRATEGIC
)
"""Deprecated: Use get_strategic_llm instead."""
return await get_strategic_llm(session, search_space_id)
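A call-site migration under the new API looks like this; the old form keeps working through the deprecated aliases above:

```python
# Sketch of migrating a call site from the per-user to the per-space getters.
from app.services.llm_service import get_fast_llm, get_user_fast_llm


async def pick_llm(session, user_id: str, search_space_id: int):
    # Before (per-user preferences, now a deprecated alias):
    legacy = await get_user_fast_llm(session, user_id, search_space_id)
    # After (preferences live on the search space itself):
    current = await get_fast_llm(session, search_space_id)
    return current or legacy
```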

View file

@ -4,7 +4,7 @@ from typing import Any
from langchain.schema import AIMessage, HumanMessage, SystemMessage
from sqlalchemy.ext.asyncio import AsyncSession
from app.services.llm_service import get_user_strategic_llm
from app.services.llm_service import get_strategic_llm
class QueryService:
@ -16,19 +16,17 @@ class QueryService:
async def reformulate_query_with_chat_history(
user_query: str,
session: AsyncSession,
user_id: str,
search_space_id: int,
chat_history_str: str | None = None,
) -> str:
"""
Reformulate the user query using the user's strategic LLM to make it more
Reformulate the user query using the search space's strategic LLM to make it more
effective for information retrieval and research purposes.
Args:
user_query: The original user query
session: Database session for accessing user LLM configs
user_id: User ID to get their specific LLM configuration
search_space_id: Search Space ID to get user's LLM preferences
session: Database session for accessing LLM configs
search_space_id: Search Space ID to get LLM preferences
chat_history_str: Optional chat history string
Returns:
@ -38,11 +36,11 @@ class QueryService:
return user_query
try:
# Get the user's strategic LLM instance
llm = await get_user_strategic_llm(session, user_id, search_space_id)
# Get the search space's strategic LLM instance
llm = await get_strategic_llm(session, search_space_id)
if not llm:
print(
f"Warning: No strategic LLM configured for user {user_id} in search space {search_space_id}. Using original query."
f"Warning: No strategic LLM configured for search space {search_space_id}. Using original query."
)
return user_query
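A hypothetical call site showing the slimmed-down signature (the query string and history value are invented):

```python
# Illustrative usage; assumes session and search_space_id are in scope.
reformulated = await QueryService.reformulate_query_with_chat_history(
    user_query="what changed in the billing module last week?",
    session=session,
    search_space_id=search_space_id,
    chat_history_str=chat_history_str,  # may be None on the first turn
)
```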

View file

@ -600,3 +600,46 @@ async def _index_elasticsearch_documents(
await run_elasticsearch_indexing(
session, connector_id, search_space_id, user_id, start_date, end_date
)
@celery_app.task(name="index_crawled_urls", bind=True)
def index_crawled_urls_task(
self,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""Celery task to index Web page Urls."""
import asyncio
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
loop.run_until_complete(
_index_crawled_urls(
connector_id, search_space_id, user_id, start_date, end_date
)
)
finally:
loop.close()
async def _index_crawled_urls(
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""Index Web page Urls with new session."""
from app.routes.search_source_connectors_routes import (
run_web_page_indexing,
)
async with get_celery_session_maker()() as session:
await run_web_page_indexing(
session, connector_id, search_space_id, user_id, start_date, end_date
)
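Dispatching from the API layer then reduces to a single `.delay(...)` call, mirroring the connector route shown earlier; for example (all argument values illustrative):

```python
# Hypothetical trigger; mirrors the route-side dispatch in this commit.
from app.tasks.celery_tasks.connector_tasks import index_crawled_urls_task

index_crawled_urls_task.delay(
    7,                                        # connector_id (illustrative)
    3,                                        # search_space_id (illustrative)
    "00000000-0000-0000-0000-000000000001",   # user UUID as a string
    "2025-11-01",                             # indexing_from
    "2025-11-30",                             # indexing_to
)
```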

View file

@ -9,7 +9,6 @@ from app.celery_app import celery_app
from app.config import config
from app.services.task_logging_service import TaskLoggingService
from app.tasks.document_processors import (
add_crawled_url_document,
add_extension_received_document,
add_youtube_video_document,
)
@ -120,71 +119,6 @@ async def _process_extension_document(
raise
@celery_app.task(name="process_crawled_url", bind=True)
def process_crawled_url_task(self, url: str, search_space_id: int, user_id: str):
"""
Celery task to process crawled URL.
Args:
url: URL to crawl and process
search_space_id: ID of the search space
user_id: ID of the user
"""
import asyncio
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
loop.run_until_complete(_process_crawled_url(url, search_space_id, user_id))
finally:
loop.close()
async def _process_crawled_url(url: str, search_space_id: int, user_id: str):
"""Process crawled URL with new session."""
async with get_celery_session_maker()() as session:
task_logger = TaskLoggingService(session, search_space_id)
log_entry = await task_logger.log_task_start(
task_name="process_crawled_url",
source="document_processor",
message=f"Starting URL crawling and processing for: {url}",
metadata={"document_type": "CRAWLED_URL", "url": url, "user_id": user_id},
)
try:
result = await add_crawled_url_document(
session, url, search_space_id, user_id
)
if result:
await task_logger.log_task_success(
log_entry,
f"Successfully crawled and processed URL: {url}",
{
"document_id": result.id,
"title": result.title,
"content_hash": result.content_hash,
},
)
else:
await task_logger.log_task_success(
log_entry,
f"URL document already exists (duplicate): {url}",
{"duplicate_detected": True},
)
except Exception as e:
await task_logger.log_task_failure(
log_entry,
f"Failed to crawl URL: {url}",
str(e),
{"error_type": type(e).__name__},
)
logger.error(f"Error processing crawled URL: {e!s}")
raise
@celery_app.task(name="process_youtube_video", bind=True)
def process_youtube_video_task(self, url: str, search_space_id: int, user_id: str):
"""


@ -67,6 +67,7 @@ async def _check_and_trigger_schedules():
index_airtable_records_task,
index_clickup_tasks_task,
index_confluence_pages_task,
index_crawled_urls_task,
index_discord_messages_task,
index_elasticsearch_documents_task,
index_github_repos_task,
@ -94,6 +95,7 @@ async def _check_and_trigger_schedules():
SearchSourceConnectorType.DISCORD_CONNECTOR: index_discord_messages_task,
SearchSourceConnectorType.LUMA_CONNECTOR: index_luma_events_task,
SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task,
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_crawled_urls_task,
}
# Trigger indexing for each due connector


@ -17,6 +17,7 @@ Available indexers:
- Google Gmail: Index messages from Google Gmail
- Google Calendar: Index events from Google Calendar
- Luma: Index events from Luma
- Webcrawler: Index crawled URLs
- Elasticsearch: Index documents from Elasticsearch instances
"""
@ -41,6 +42,7 @@ from .luma_indexer import index_luma_events
# Documentation and knowledge management
from .notion_indexer import index_notion_pages
from .slack_indexer import index_slack_messages
from .webcrawler_indexer import index_crawled_urls
__all__ = [ # noqa: RUF022
"index_airtable_records",
@ -58,6 +60,7 @@ __all__ = [ # noqa: RUF022
"index_linear_issues",
# Documentation and knowledge management
"index_notion_pages",
"index_crawled_urls",
# Communication platforms
"index_slack_messages",
"index_google_gmail_messages",


@ -0,0 +1,450 @@
"""
Webcrawler connector indexer.
"""
from datetime import datetime
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config
from app.connectors.webcrawler_connector import WebCrawlerConnector
from app.db import Document, DocumentType, SearchSourceConnectorType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
create_document_chunks,
generate_content_hash,
generate_document_summary,
generate_unique_identifier_hash,
)
from .base import (
check_document_by_unique_identifier,
get_connector_by_id,
logger,
update_connector_last_indexed,
)
async def index_crawled_urls(
session: AsyncSession,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str | None = None,
end_date: str | None = None,
update_last_indexed: bool = True,
) -> tuple[int, str | None]:
"""
Index web page URLs.
Args:
session: Database session
connector_id: ID of the webcrawler connector
search_space_id: ID of the search space to store documents in
user_id: User ID
start_date: Start date for filtering (YYYY-MM-DD format) - optional
end_date: End date for filtering (YYYY-MM-DD format) - optional
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
Returns:
Tuple containing (number of documents indexed, error message or None)
"""
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="crawled_url_indexing",
source="connector_indexing_task",
message=f"Starting web page URL indexing for connector {connector_id}",
metadata={
"connector_id": connector_id,
"user_id": str(user_id),
"start_date": start_date,
"end_date": end_date,
},
)
try:
# Get the connector
await task_logger.log_task_progress(
log_entry,
f"Retrieving webcrawler connector {connector_id} from database",
{"stage": "connector_retrieval"},
)
# Get the connector from the database
connector = await get_connector_by_id(
session, connector_id, SearchSourceConnectorType.WEBCRAWLER_CONNECTOR
)
if not connector:
await task_logger.log_task_failure(
log_entry,
f"Connector with ID {connector_id} not found or is not a webcrawler connector",
"Connector not found",
{"error_type": "ConnectorNotFound"},
)
return (
0,
f"Connector with ID {connector_id} not found or is not a webcrawler connector",
)
# Get the Firecrawl API key from the connector config (optional)
api_key = connector.config.get("FIRECRAWL_API_KEY")
# Get URLs from connector config
initial_urls = connector.config.get("INITIAL_URLS", "")
if isinstance(initial_urls, str):
urls = [url.strip() for url in initial_urls.split("\n") if url.strip()]
elif isinstance(initial_urls, list):
urls = [url.strip() for url in initial_urls if url.strip()]
else:
urls = []
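# e.g. an INITIAL_URLS value of "https://a.example\nhttps://b.example\n"
# (or the equivalent list form) normalizes to
# ["https://a.example", "https://b.example"].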
logger.info(
f"Starting crawled web page indexing for connector {connector_id} with {len(urls)} URLs"
)
# Initialize webcrawler client
await task_logger.log_task_progress(
log_entry,
f"Initializing webcrawler client for connector {connector_id}",
{
"stage": "client_initialization",
"use_firecrawl": bool(api_key),
},
)
crawler = WebCrawlerConnector(firecrawl_api_key=api_key)
# Validate URLs
if not urls:
await task_logger.log_task_failure(
log_entry,
"No URLs provided for indexing",
"Empty URL list",
{"error_type": "ValidationError"},
)
return 0, "No URLs provided for indexing"
await task_logger.log_task_progress(
log_entry,
f"Starting to crawl {len(urls)} URLs",
{
"stage": "crawling",
"total_urls": len(urls),
},
)
documents_indexed = 0
documents_updated = 0
documents_skipped = 0
failed_urls = []
for idx, url in enumerate(urls, 1):
try:
logger.info(f"Processing URL {idx}/{len(urls)}: {url}")
await task_logger.log_task_progress(
log_entry,
f"Crawling URL {idx}/{len(urls)}: {url}",
{
"stage": "crawling_url",
"url_index": idx,
"url": url,
},
)
# Crawl the URL
crawl_result, error = await crawler.crawl_url(url)
if error or not crawl_result:
logger.warning(f"Failed to crawl URL {url}: {error}")
failed_urls.append((url, error or "Unknown error"))
continue
# Extract content and metadata
content = crawl_result.get("content", "")
metadata = crawl_result.get("metadata", {})
crawler_type = crawl_result.get("crawler_type", "unknown")
if not content.strip():
logger.warning(f"Skipping URL with no content: {url}")
failed_urls.append((url, "No content extracted"))
documents_skipped += 1
continue
# Format content as structured document
structured_document = crawler.format_to_structured_document(
crawl_result
)
# Generate unique identifier hash for this URL
unique_identifier_hash = generate_unique_identifier_hash(
DocumentType.CRAWLED_URL, url, search_space_id
)
# Generate content hash
# TODO: Fix change detection by excluding dynamic content (date, time, etc.) from the hash input.
content_hash = generate_content_hash(
structured_document, search_space_id
)
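# The unique identifier hash keys the URL itself and stays stable across
# recrawls (used to find the existing document), while the content hash
# changes whenever the crawled content changes (used to decide updates).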
# Check if document with this unique identifier already exists
existing_document = await check_document_by_unique_identifier(
session, unique_identifier_hash
)
# Extract useful metadata
title = metadata.get("title", url)
description = metadata.get("description", "")
language = metadata.get("language", "")
if existing_document:
# Document exists - check if content has changed
if existing_document.content_hash == content_hash:
logger.info(f"Document for URL {url} unchanged. Skipping.")
documents_skipped += 1
continue
else:
# Content has changed - update the existing document
logger.info(
f"Content changed for URL {url}. Updating document."
)
# Generate summary with metadata
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
"url": url,
"title": title,
"description": description,
"language": language,
"document_type": "Crawled URL",
"crawler_type": crawler_type,
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
structured_document, user_llm, document_metadata
)
else:
# Fallback to simple summary if no LLM configured
summary_content = f"Crawled URL: {title}\n\n"
summary_content += f"URL: {url}\n"
if description:
summary_content += f"Description: {description}\n"
if language:
summary_content += f"Language: {language}\n"
summary_content += f"Crawler: {crawler_type}\n\n"
# Add content preview
content_preview = content[:1000]
if len(content) > 1000:
content_preview += "..."
summary_content += f"Content Preview:\n{content_preview}\n"
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
# Process chunks
chunks = await create_document_chunks(content)
# Update existing document
existing_document.title = title
existing_document.content = summary_content
existing_document.content_hash = content_hash
existing_document.embedding = summary_embedding
existing_document.document_metadata = {
**metadata,
"crawler_type": crawler_type,
"last_crawled_at": datetime.now().strftime(
"%Y-%m-%d %H:%M:%S"
),
}
existing_document.chunks = chunks
documents_updated += 1
logger.info(f"Successfully updated URL {url}")
continue
# Document doesn't exist - create new one
# Generate summary with metadata
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
"url": url,
"title": title,
"description": description,
"language": language,
"document_type": "Crawled URL",
"crawler_type": crawler_type,
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
structured_document, user_llm, document_metadata
)
else:
# Fallback to simple summary if no LLM configured
summary_content = f"Crawled URL: {title}\n\n"
summary_content += f"URL: {url}\n"
if description:
summary_content += f"Description: {description}\n"
if language:
summary_content += f"Language: {language}\n"
summary_content += f"Crawler: {crawler_type}\n\n"
# Add content preview
content_preview = content[:1000]
if len(content) > 1000:
content_preview += "..."
summary_content += f"Content Preview:\n{content_preview}\n"
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
chunks = await create_document_chunks(content)
document = Document(
search_space_id=search_space_id,
title=title,
document_type=DocumentType.CRAWLED_URL,
document_metadata={
**metadata,
"crawler_type": crawler_type,
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
},
content=summary_content,
content_hash=content_hash,
unique_identifier_hash=unique_identifier_hash,
embedding=summary_embedding,
chunks=chunks,
)
session.add(document)
documents_indexed += 1
logger.info(f"Successfully indexed new URL {url}")
# Batch commit every 10 documents
if (documents_indexed + documents_updated) % 10 == 0:
logger.info(
f"Committing batch: {documents_indexed + documents_updated} URLs processed so far"
)
await session.commit()
except Exception as e:
logger.error(
f"Error processing URL {url}: {e!s}",
exc_info=True,
)
failed_urls.append((url, str(e)))
continue
total_processed = documents_indexed + documents_updated
if total_processed > 0:
await update_connector_last_indexed(session, connector, update_last_indexed)
# Final commit for any remaining documents not yet committed in batches
logger.info(
f"Final commit: Total {documents_indexed} new, {documents_updated} updated URLs processed"
)
await session.commit()
# Build result message
result_message = None
if failed_urls:
failed_summary = "; ".join(
[f"{url}: {error}" for url, error in failed_urls[:5]]
)
if len(failed_urls) > 5:
failed_summary += f" (and {len(failed_urls) - 5} more)"
result_message = (
f"Completed with {len(failed_urls)} failures: {failed_summary}"
)
await task_logger.log_task_success(
log_entry,
f"Successfully completed crawled web page indexing for connector {connector_id}",
{
"urls_processed": total_processed,
"documents_indexed": documents_indexed,
"documents_updated": documents_updated,
"documents_skipped": documents_skipped,
"failed_urls_count": len(failed_urls),
},
)
logger.info(
f"Web page indexing completed: {documents_indexed} new, "
f"{documents_updated} updated, {documents_skipped} skipped, "
f"{len(failed_urls)} failed"
)
return total_processed, result_message
except SQLAlchemyError as db_error:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Database error during web page indexing for connector {connector_id}",
str(db_error),
{"error_type": "SQLAlchemyError"},
)
logger.error(f"Database error: {db_error!s}", exc_info=True)
return 0, f"Database error: {db_error!s}"
except Exception as e:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Failed to index web page URLs for connector {connector_id}",
str(e),
{"error_type": type(e).__name__},
)
logger.error(f"Failed to index web page URLs: {e!s}", exc_info=True)
return 0, f"Failed to index web page URLs: {e!s}"
async def get_crawled_url_documents(
session: AsyncSession,
search_space_id: int,
connector_id: int | None = None,
) -> list[Document]:
"""
Get all crawled URL documents for a search space.
Args:
session: Database session
search_space_id: ID of the search space
connector_id: Optional connector ID to filter by
Returns:
List of Document objects
"""
from sqlalchemy import select
query = select(Document).filter(
Document.search_space_id == search_space_id,
Document.document_type == DocumentType.CRAWLED_URL,
)
if connector_id:
# Filtering by connector is not yet supported: Document has no connector_id
# column, so this parameter is currently a no-op.
pass
result = await session.execute(query)
documents = result.scalars().all()
return list(documents)


@ -6,7 +6,6 @@ and sources. Each processor is responsible for handling a specific type of docum
processing task in the background.
Available processors:
- URL crawler: Process web pages from URLs
- Extension processor: Handle documents from browser extension
- Markdown processor: Process markdown files
- File processors: Handle files using different ETL services (Unstructured, LlamaCloud, Docling)
@ -26,14 +25,11 @@ from .file_processors import (
# Markdown processor
from .markdown_processor import add_received_markdown_file_document
from .url_crawler import add_crawled_url_document
# YouTube processor
from .youtube_processor import add_youtube_video_document
__all__ = [
# URL processing
"add_crawled_url_document",
# Extension processing
"add_extension_received_document",
"add_received_file_document_using_docling",


@ -1,342 +0,0 @@
"""
URL crawler document processor.
"""
import logging
import validators
from firecrawl import AsyncFirecrawlApp
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_core.documents import Document as LangchainDocument
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config
from app.db import Document, DocumentType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
create_document_chunks,
generate_content_hash,
generate_document_summary,
generate_unique_identifier_hash,
)
from .base import (
check_document_by_unique_identifier,
md,
)
async def add_crawled_url_document(
session: AsyncSession, url: str, search_space_id: int, user_id: str
) -> Document | None:
"""
Process and store a document from a crawled URL.
Args:
session: Database session
url: URL to crawl
search_space_id: ID of the search space
user_id: ID of the user
Returns:
Document object if successful, None if failed
"""
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="crawl_url_document",
source="background_task",
message=f"Starting URL crawling process for: {url}",
metadata={"url": url, "user_id": str(user_id)},
)
try:
# URL validation step
await task_logger.log_task_progress(
log_entry, f"Validating URL: {url}", {"stage": "validation"}
)
if not validators.url(url):
raise ValueError(f"Url {url} is not a valid URL address")
# Set up crawler
await task_logger.log_task_progress(
log_entry,
f"Setting up crawler for URL: {url}",
{
"stage": "crawler_setup",
"firecrawl_available": bool(config.FIRECRAWL_API_KEY),
},
)
use_firecrawl = bool(config.FIRECRAWL_API_KEY)
if use_firecrawl:
# Use Firecrawl SDK directly
firecrawl_app = AsyncFirecrawlApp(api_key=config.FIRECRAWL_API_KEY)
else:
crawl_loader = AsyncChromiumLoader(urls=[url], headless=True)
# Perform crawling
await task_logger.log_task_progress(
log_entry,
f"Crawling URL content: {url}",
{
"stage": "crawling",
"crawler_type": "AsyncFirecrawlApp"
if use_firecrawl
else "AsyncChromiumLoader",
},
)
if use_firecrawl:
# Use async Firecrawl SDK with v1 API - properly awaited
scrape_result = await firecrawl_app.scrape_url(
url=url, formats=["markdown"]
)
# scrape_result is a Pydantic ScrapeResponse object
# Access attributes directly
if scrape_result and scrape_result.success:
# Extract markdown content
markdown_content = scrape_result.markdown or ""
# Extract metadata - this is a DICT
metadata = scrape_result.metadata if scrape_result.metadata else {}
# Convert to LangChain Document format
url_crawled = [
LangchainDocument(
page_content=markdown_content,
metadata={
"source": url,
"title": metadata.get("title", url),
"description": metadata.get("description", ""),
"language": metadata.get("language", ""),
"sourceURL": metadata.get("sourceURL", url),
**metadata, # Include all other metadata fields
},
)
]
content_in_markdown = url_crawled[0].page_content
else:
error_msg = (
scrape_result.error
if scrape_result and hasattr(scrape_result, "error")
else "Unknown error"
)
raise ValueError(f"Firecrawl failed to scrape URL: {error_msg}")
else:
# Use AsyncChromiumLoader as fallback
url_crawled = await crawl_loader.aload()
content_in_markdown = md.transform_documents(url_crawled)[0].page_content
# Format document
await task_logger.log_task_progress(
log_entry,
f"Processing crawled content from: {url}",
{"stage": "content_processing", "content_length": len(content_in_markdown)},
)
# Format document metadata in a more maintainable way
metadata_sections = [
(
"METADATA",
[
f"{key.upper()}: {value}"
for key, value in url_crawled[0].metadata.items()
],
),
(
"CONTENT",
["FORMAT: markdown", "TEXT_START", content_in_markdown, "TEXT_END"],
),
]
# Build the document string more efficiently
document_parts = []
document_parts.append("<DOCUMENT>")
for section_title, section_content in metadata_sections:
document_parts.append(f"<{section_title}>")
document_parts.extend(section_content)
document_parts.append(f"</{section_title}>")
document_parts.append("</DOCUMENT>")
combined_document_string = "\n".join(document_parts)
# Generate unique identifier hash for this URL
unique_identifier_hash = generate_unique_identifier_hash(
DocumentType.CRAWLED_URL, url, search_space_id
)
# Generate content hash
content_hash = generate_content_hash(combined_document_string, search_space_id)
# Check if document with this unique identifier already exists
await task_logger.log_task_progress(
log_entry,
f"Checking for existing URL: {url}",
{"stage": "duplicate_check", "url": url},
)
existing_document = await check_document_by_unique_identifier(
session, unique_identifier_hash
)
if existing_document:
# Document exists - check if content has changed
if existing_document.content_hash == content_hash:
await task_logger.log_task_success(
log_entry,
f"URL document unchanged: {url}",
{
"duplicate_detected": True,
"existing_document_id": existing_document.id,
},
)
logging.info(f"Document for URL {url} unchanged. Skipping.")
return existing_document
else:
# Content has changed - update the existing document
logging.info(f"Content changed for URL {url}. Updating document.")
await task_logger.log_task_progress(
log_entry,
f"Updating URL document: {url}",
{"stage": "document_update", "url": url},
)
# Get LLM for summary generation (needed for both create and update)
await task_logger.log_task_progress(
log_entry,
f"Preparing for summary generation: {url}",
{"stage": "llm_setup"},
)
# Get user's long context LLM
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
if not user_llm:
raise RuntimeError(
f"No long context LLM configured for user {user_id} in search space {search_space_id}"
)
# Generate summary
await task_logger.log_task_progress(
log_entry,
f"Generating summary for URL content: {url}",
{"stage": "summary_generation"},
)
# Generate summary with metadata
document_metadata = {
"url": url,
"title": url_crawled[0].metadata.get("title", url),
"document_type": "Crawled URL Document",
"crawler_type": "FirecrawlApp" if use_firecrawl else "AsyncChromiumLoader",
}
summary_content, summary_embedding = await generate_document_summary(
combined_document_string, user_llm, document_metadata
)
# Process chunks
await task_logger.log_task_progress(
log_entry,
f"Processing content chunks for URL: {url}",
{"stage": "chunk_processing"},
)
from app.utils.blocknote_converter import convert_markdown_to_blocknote
# Convert markdown to BlockNote JSON
blocknote_json = await convert_markdown_to_blocknote(combined_document_string)
if not blocknote_json:
logging.warning(
f"Failed to convert crawled URL '{url}' to BlockNote JSON, "
"document will not be editable"
)
chunks = await create_document_chunks(content_in_markdown)
# Update or create document
if existing_document:
# Update existing document
await task_logger.log_task_progress(
log_entry,
f"Updating document in database for URL: {url}",
{"stage": "document_update", "chunks_count": len(chunks)},
)
existing_document.title = url_crawled[0].metadata.get(
"title", url_crawled[0].metadata.get("source", url)
)
existing_document.content = summary_content
existing_document.content_hash = content_hash
existing_document.embedding = summary_embedding
existing_document.document_metadata = url_crawled[0].metadata
existing_document.chunks = chunks
existing_document.blocknote_document = blocknote_json
document = existing_document
else:
# Create new document
await task_logger.log_task_progress(
log_entry,
f"Creating document in database for URL: {url}",
{"stage": "document_creation", "chunks_count": len(chunks)},
)
document = Document(
search_space_id=search_space_id,
title=url_crawled[0].metadata.get(
"title", url_crawled[0].metadata.get("source", url)
),
document_type=DocumentType.CRAWLED_URL,
document_metadata=url_crawled[0].metadata,
content=summary_content,
embedding=summary_embedding,
chunks=chunks,
content_hash=content_hash,
unique_identifier_hash=unique_identifier_hash,
blocknote_document=blocknote_json,
)
session.add(document)
await session.commit()
await session.refresh(document)
# Log success
await task_logger.log_task_success(
log_entry,
f"Successfully crawled and processed URL: {url}",
{
"document_id": document.id,
"title": document.title,
"content_hash": content_hash,
"chunks_count": len(chunks),
"summary_length": len(summary_content),
},
)
return document
except SQLAlchemyError as db_error:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Database error while processing URL: {url}",
str(db_error),
{"error_type": "SQLAlchemyError"},
)
raise db_error
except Exception as e:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Failed to crawl URL: {url}",
str(e),
{"error_type": type(e).__name__},
)
raise RuntimeError(f"Failed to crawl URL: {e!s}") from e


@ -1,19 +0,0 @@
from fastapi import HTTPException
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from app.db import User
# Helper function to check user ownership
async def check_ownership(session: AsyncSession, model, item_id: int, user: User):
item = await session.execute(
select(model).filter(model.id == item_id, model.user_id == user.id)
)
item = item.scalars().first()
if not item:
raise HTTPException(
status_code=404,
detail="Item not found or you don't have permission to access it",
)
return item


@ -31,6 +31,7 @@ CONNECTOR_TASK_MAP = {
SearchSourceConnectorType.DISCORD_CONNECTOR: "index_discord_messages",
SearchSourceConnectorType.LUMA_CONNECTOR: "index_luma_events",
SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: "index_elasticsearch_documents",
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: "index_crawled_urls",
}
@ -69,6 +70,7 @@ def create_periodic_schedule(
index_airtable_records_task,
index_clickup_tasks_task,
index_confluence_pages_task,
index_crawled_urls_task,
index_discord_messages_task,
index_elasticsearch_documents_task,
index_github_repos_task,
@ -96,6 +98,7 @@ def create_periodic_schedule(
SearchSourceConnectorType.DISCORD_CONNECTOR: index_discord_messages_task,
SearchSourceConnectorType.LUMA_CONNECTOR: index_luma_events_task,
SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task,
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_crawled_urls_task,
}
# Trigger the first run immediately


@ -0,0 +1,274 @@
"""
RBAC (Role-Based Access Control) utility functions.
Provides helpers for checking user permissions in search spaces.
"""
import secrets
from uuid import UUID
from fastapi import HTTPException
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from sqlalchemy.orm import selectinload
from app.db import (
Permission,
SearchSpace,
SearchSpaceMembership,
SearchSpaceRole,
User,
has_permission,
)
async def get_user_membership(
session: AsyncSession,
user_id: UUID,
search_space_id: int,
) -> SearchSpaceMembership | None:
"""
Get the user's membership in a search space.
Args:
session: Database session
user_id: User UUID
search_space_id: Search space ID
Returns:
SearchSpaceMembership if found, None otherwise
"""
result = await session.execute(
select(SearchSpaceMembership)
.options(selectinload(SearchSpaceMembership.role))
.filter(
SearchSpaceMembership.user_id == user_id,
SearchSpaceMembership.search_space_id == search_space_id,
)
)
return result.scalars().first()
async def get_user_permissions(
session: AsyncSession,
user_id: UUID,
search_space_id: int,
) -> list[str]:
"""
Get the user's permissions in a search space.
Args:
session: Database session
user_id: User UUID
search_space_id: Search space ID
Returns:
List of permission strings
"""
membership = await get_user_membership(session, user_id, search_space_id)
if not membership:
return []
# Owners always have full access
if membership.is_owner:
return [Permission.FULL_ACCESS.value]
# Get permissions from role
if membership.role:
return membership.role.permissions or []
return []
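# Resolution order: owners get FULL_ACCESS, other members inherit their
# role's permission list, and members without a role get nothing.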
async def check_permission(
session: AsyncSession,
user: User,
search_space_id: int,
required_permission: str,
error_message: str = "You don't have permission to perform this action",
) -> SearchSpaceMembership:
"""
Check if a user has a specific permission in a search space.
Raises HTTPException if permission is denied.
Args:
session: Database session
user: User object
search_space_id: Search space ID
required_permission: Permission string to check
error_message: Custom error message for permission denied
Returns:
SearchSpaceMembership if permission granted
Raises:
HTTPException: If user doesn't have access or permission
"""
membership = await get_user_membership(session, user.id, search_space_id)
if not membership:
raise HTTPException(
status_code=403,
detail="You don't have access to this search space",
)
# Get user's permissions
if membership.is_owner:
permissions = [Permission.FULL_ACCESS.value]
elif membership.role:
permissions = membership.role.permissions or []
else:
permissions = []
if not has_permission(permissions, required_permission):
raise HTTPException(status_code=403, detail=error_message)
return membership
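# Illustrative route-level usage (a sketch; FULL_ACCESS is the only
# permission referenced in this module):
#
#     membership = await check_permission(
#         session, user, search_space_id, Permission.FULL_ACCESS.value,
#         error_message="You cannot modify this search space",
#     )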
async def check_search_space_access(
session: AsyncSession,
user: User,
search_space_id: int,
) -> SearchSpaceMembership:
"""
Check if a user has any access to a search space.
This is used for basic access control (user is a member).
Args:
session: Database session
user: User object
search_space_id: Search space ID
Returns:
SearchSpaceMembership if user has access
Raises:
HTTPException: If user doesn't have access
"""
membership = await get_user_membership(session, user.id, search_space_id)
if not membership:
raise HTTPException(
status_code=403,
detail="You don't have access to this search space",
)
return membership
async def is_search_space_owner(
session: AsyncSession,
user_id: UUID,
search_space_id: int,
) -> bool:
"""
Check if a user is the owner of a search space.
Args:
session: Database session
user_id: User UUID
search_space_id: Search space ID
Returns:
True if user is the owner, False otherwise
"""
membership = await get_user_membership(session, user_id, search_space_id)
return membership is not None and membership.is_owner
async def get_search_space_with_access_check(
session: AsyncSession,
user: User,
search_space_id: int,
required_permission: str | None = None,
) -> tuple[SearchSpace, SearchSpaceMembership]:
"""
Get a search space with access and optional permission check.
Args:
session: Database session
user: User object
search_space_id: Search space ID
required_permission: Optional permission to check
Returns:
Tuple of (SearchSpace, SearchSpaceMembership)
Raises:
HTTPException: If search space not found or user lacks access/permission
"""
# Get the search space
result = await session.execute(
select(SearchSpace).filter(SearchSpace.id == search_space_id)
)
search_space = result.scalars().first()
if not search_space:
raise HTTPException(status_code=404, detail="Search space not found")
# Check access
if required_permission:
membership = await check_permission(
session, user, search_space_id, required_permission
)
else:
membership = await check_search_space_access(session, user, search_space_id)
return search_space, membership
def generate_invite_code() -> str:
"""
Generate a unique invite code for search space invites.
Returns:
A 32-character URL-safe invite code
"""
return secrets.token_urlsafe(24)
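# secrets.token_urlsafe(24) encodes 24 random bytes as URL-safe base64,
# which yields exactly the 32 characters documented above.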
async def get_default_role(
session: AsyncSession,
search_space_id: int,
) -> SearchSpaceRole | None:
"""
Get the default role for a search space (used when accepting invites without a specific role).
Args:
session: Database session
search_space_id: Search space ID
Returns:
Default SearchSpaceRole or None
"""
result = await session.execute(
select(SearchSpaceRole).filter(
SearchSpaceRole.search_space_id == search_space_id,
SearchSpaceRole.is_default == True, # noqa: E712
)
)
return result.scalars().first()
async def get_owner_role(
session: AsyncSession,
search_space_id: int,
) -> SearchSpaceRole | None:
"""
Get the Owner role for a search space.
Args:
session: Database session
search_space_id: Search space ID
Returns:
Owner SearchSpaceRole or None
"""
result = await session.execute(
select(SearchSpaceRole).filter(
SearchSpaceRole.search_space_id == search_space_id,
SearchSpaceRole.name == "Owner",
)
)
return result.scalars().first()


@ -469,6 +469,22 @@ def validate_connector_config(
if not isinstance(value, list) or not value:
raise ValueError(f"{field_name} must be a non-empty list of strings")
def validate_firecrawl_api_key_format() -> None:
"""Validate Firecrawl API key format if provided."""
api_key = config.get("FIRECRAWL_API_KEY", "")
if api_key and api_key.strip() and not api_key.strip().startswith("fc-"):
raise ValueError(
"Firecrawl API key should start with 'fc-'. Please verify your API key."
)
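# e.g. "fc-1234abcd" passes, "sk-1234abcd" raises ValueError, and an
# empty/absent key is accepted since the connector can fall back to
# AsyncChromiumLoader.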
def validate_initial_urls() -> None:
initial_urls = config.get("INITIAL_URLS", "")
if initial_urls and initial_urls.strip():
urls = [url.strip() for url in initial_urls.split("\n") if url.strip()]
for url in urls:
if not validators.url(url):
raise ValueError(f"Invalid URL format in INITIAL_URLS: {url}")
# Lookup table for connector validation rules
connector_rules = {
"SERPER_API": {"required": ["SERPER_API_KEY"], "validators": {}},
@ -550,6 +566,14 @@ def validate_connector_config(
# "validators": {}
# },
"LUMA_CONNECTOR": {"required": ["LUMA_API_KEY"], "validators": {}},
"WEBCRAWLER_CONNECTOR": {
"required": [], # No required fields - API key is optional
"optional": ["FIRECRAWL_API_KEY", "INITIAL_URLS"],
"validators": {
"FIRECRAWL_API_KEY": lambda: validate_firecrawl_api_key_format(),
"INITIAL_URLS": lambda: validate_initial_urls(),
},
},
}
rules = connector_rules.get(connector_type_str)


@ -11,7 +11,6 @@ dependencies = [
"docling>=2.15.0",
"fastapi>=0.115.8",
"fastapi-users[oauth,sqlalchemy]>=14.0.1",
"firecrawl-py>=1.12.0",
"github3.py==4.0.1",
"google-api-python-client>=2.156.0",
"google-auth-oauthlib>=1.2.1",
@ -49,6 +48,7 @@ dependencies = [
"flower>=2.0.1",
"redis>=5.2.1",
"chonkie[all]>=1.4.0",
"firecrawl-py>=4.9.0",
]
[dependency-groups]


@ -1541,19 +1541,20 @@ wheels = [
[[package]]
name = "firecrawl-py"
version = "2.8.0"
version = "4.9.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "aiohttp" },
{ name = "httpx" },
{ name = "nest-asyncio" },
{ name = "pydantic" },
{ name = "python-dotenv" },
{ name = "requests" },
{ name = "websockets" },
]
sdist = { url = "https://files.pythonhosted.org/packages/11/83/64127a0faafb027c2870c3919aae13fd6f8f8066d000bea93c880ab9772a/firecrawl_py-2.8.0.tar.gz", hash = "sha256:657795b6ddd63f0bd38b38bf0571187e0a66becda23d97c032801895257403c9", size = 37941 }
sdist = { url = "https://files.pythonhosted.org/packages/a5/2e/e4112ebd229bc03202584f5ad2ece81c26cb2a7bad0cd4773b8705d996e9/firecrawl_py-4.9.0.tar.gz", hash = "sha256:8e5740ed923c89e6066dfd63b0449f049bbd274652dfac3d735c9ae0572c4b0c", size = 153395 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/74/e6/e69bd2156856f2b1849244ca3b1d993676175b16acbf704ad85580ebaa3c/firecrawl_py-2.8.0-py3-none-any.whl", hash = "sha256:f2e148086aa1ca42f603a56009577b4f66a2c23893eaa71f7c9c0082b4fdcf60", size = 173118 },
{ url = "https://files.pythonhosted.org/packages/3a/cf/99848233303ca9c9d84cf22de08adc1051e8b6df672aeed14f32272df86b/firecrawl_py-4.9.0-py3-none-any.whl", hash = "sha256:adb027ed8bdda712201dc9727ead1a051dc3d114c2a0051de1f159c420703684", size = 190971 },
]
[[package]]
@ -5926,7 +5927,7 @@ requires-dist = [
{ name = "fastapi", specifier = ">=0.115.8" },
{ name = "fastapi-users", extras = ["oauth", "sqlalchemy"], specifier = ">=14.0.1" },
{ name = "faster-whisper", specifier = ">=1.1.0" },
{ name = "firecrawl-py", specifier = ">=1.12.0" },
{ name = "firecrawl-py", specifier = ">=4.9.0" },
{ name = "flower", specifier = ">=2.0.1" },
{ name = "github3-py", specifier = "==4.0.1" },
{ name = "google-api-python-client", specifier = ">=2.156.0" },


@ -4,6 +4,15 @@
"version": "0.0.8",
"description": "Extension to collect Browsing History for SurfSense.",
"author": "https://github.com/MODSetter",
"engines": {
"node": ">=18.0.0 <23.0.0",
"pnpm": ">=8.0.0"
},
"pnpm": {
"overrides": {
"sharp": "^0.33.5"
}
},
"scripts": {
"dev": "plasmo dev",
"build": "plasmo build",
@ -24,13 +33,14 @@
"dom-to-semantic-markdown": "^1.2.11",
"linkedom": "0.1.34",
"lucide-react": "^0.454.0",
"plasmo": "0.89.4",
"plasmo": "0.90.5",
"postcss-loader": "^8.1.1",
"radix-ui": "^1.0.1",
"react": "18.2.0",
"react-dom": "18.2.0",
"react-hooks-global-state": "^2.1.0",
"react-router-dom": "^6.26.1",
"sharp": "^0.33.5",
"tailwind-merge": "^2.5.4",
"tailwindcss-animate": "^1.0.7"
},

File diff suppressed because it is too large

@ -18,6 +18,7 @@ import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/com
import { Separator } from "@/components/ui/separator";
import { SidebarInset, SidebarProvider, SidebarTrigger } from "@/components/ui/sidebar";
import { useLLMPreferences } from "@/hooks/use-llm-configs";
import { useUserAccess } from "@/hooks/use-rbac";
import { cn } from "@/lib/utils";
export function DashboardClientLayout({
@ -60,11 +61,15 @@ export function DashboardClientLayout({
}, [activeChatId, isChatPannelOpen]);
const { loading, error, isOnboardingComplete } = useLLMPreferences(searchSpaceIdNum);
const { access, loading: accessLoading } = useUserAccess(searchSpaceIdNum);
const [hasCheckedOnboarding, setHasCheckedOnboarding] = useState(false);
// Skip onboarding check if we're already on the onboarding page
const isOnboardingPage = pathname?.includes("/onboard");
// Only owners should see onboarding - invited members use existing config
const isOwner = access?.is_owner ?? false;
// Translate navigation items
const tNavMenu = useTranslations("nav_menu");
const translatedNavMain = useMemo(() => {
@ -102,11 +107,13 @@ export function DashboardClientLayout({
return;
}
// Only check once after preferences have loaded
if (!loading && !hasCheckedOnboarding) {
// Wait for both preferences and access data to load
if (!loading && !accessLoading && !hasCheckedOnboarding) {
const onboardingComplete = isOnboardingComplete();
if (!onboardingComplete) {
// Only redirect to onboarding if user is the owner and onboarding is not complete
// Invited members (non-owners) should skip onboarding and use existing config
if (!onboardingComplete && isOwner) {
router.push(`/dashboard/${searchSpaceId}/onboard`);
}
@ -114,8 +121,10 @@ export function DashboardClientLayout({
}
}, [
loading,
accessLoading,
isOnboardingComplete,
isOnboardingPage,
isOwner,
router,
searchSpaceId,
hasCheckedOnboarding,
@ -145,7 +154,7 @@ export function DashboardClientLayout({
}, [chat_id, search_space_id]);
// Show loading screen while checking onboarding status (only on first load)
if (!hasCheckedOnboarding && loading && !isOnboardingPage) {
if (!hasCheckedOnboarding && (loading || accessLoading) && !isOnboardingPage) {
return (
<div className="flex flex-col items-center justify-center min-h-screen space-y-4">
<Card className="w-[350px] bg-background/60 backdrop-blur-sm">


@ -18,7 +18,16 @@ import {
CardHeader,
CardTitle,
} from "@/components/ui/card";
import { Form } from "@/components/ui/form";
import {
Form,
FormControl,
FormDescription,
FormField,
FormItem,
FormLabel,
FormMessage,
} from "@/components/ui/form";
import { Textarea } from "@/components/ui/textarea";
import { getConnectorIcon } from "@/contracts/enums/connectorIcons";
import { useConnectorEditPage } from "@/hooks/use-connector-edit-page";
// Import Utils, Types, Hook, and Components
@ -282,6 +291,40 @@ export default function EditConnectorPage() {
placeholder="Your Elasticsearch API Key"
/>
)}
{/* == Webcrawler == */}
{connector.connector_type === "WEBCRAWLER_CONNECTOR" && (
<div className="space-y-4">
<EditSimpleTokenForm
control={editForm.control}
fieldName="FIRECRAWL_API_KEY"
fieldLabel="Firecrawl API Key (Optional)"
fieldDescription="Add a Firecrawl API key for enhanced crawling capabilities. If not provided, will use AsyncChromiumLoader as fallback."
placeholder="fc-xxxxxxxxxxxxx"
/>
<FormField
control={editForm.control}
name="INITIAL_URLS"
render={({ field }) => (
<FormItem>
<FormLabel>URLs to Crawl</FormLabel>
<FormControl>
<Textarea
placeholder="https://example.com&#10;https://docs.example.com&#10;https://blog.example.com"
className="min-h-[150px] font-mono text-sm"
{...field}
/>
</FormControl>
<FormDescription>
Enter URLs to crawl (one per line). They will be crawled and indexed the
next time indexing runs.
</FormDescription>
<FormMessage />
</FormItem>
)}
/>
</div>
)}
</CardContent>
<CardFooter className="border-t pt-6">
<Button type="submit" disabled={isSaving} className="w-full sm:w-auto">


@ -55,6 +55,7 @@ const getConnectorTypeDisplay = (type: string): string => {
AIRTABLE_CONNECTOR: "Airtable Connector",
LUMA_CONNECTOR: "Luma Connector",
ELASTICSEARCH_CONNECTOR: "Elasticsearch Connector",
WEBCRAWLER_CONNECTOR: "Web Page Connector",
// Add other connector types here as needed
};
return typeMap[type] || type;
@ -75,6 +76,7 @@ const getApiKeyFieldName = (connectorType: string): string => {
LINKUP_API: "LINKUP_API_KEY",
LUMA_CONNECTOR: "LUMA_API_KEY",
ELASTICSEARCH_CONNECTOR: "ELASTICSEARCH_API_KEY",
WEBCRAWLER_CONNECTOR: "FIRECRAWL_API_KEY",
};
return fieldMap[connectorType] || "";
};


@ -0,0 +1,331 @@
"use client";
import { zodResolver } from "@hookform/resolvers/zod";
import { ArrowLeft, Check, Globe, Loader2 } from "lucide-react";
import { motion } from "motion/react";
import Link from "next/link";
import { useParams, useRouter } from "next/navigation";
import { useEffect, useState } from "react";
import { useForm } from "react-hook-form";
import { toast } from "sonner";
import * as z from "zod";
import { Button } from "@/components/ui/button";
import {
Card,
CardContent,
CardDescription,
CardFooter,
CardHeader,
CardTitle,
} from "@/components/ui/card";
import {
Form,
FormControl,
FormDescription,
FormField,
FormItem,
FormLabel,
FormMessage,
} from "@/components/ui/form";
import { Input } from "@/components/ui/input";
import { Textarea } from "@/components/ui/textarea";
import { EnumConnectorName } from "@/contracts/enums/connector";
import { getConnectorIcon } from "@/contracts/enums/connectorIcons";
import {
type SearchSourceConnector,
useSearchSourceConnectors,
} from "@/hooks/use-search-source-connectors";
// Define the form schema with Zod
const webcrawlerConnectorFormSchema = z.object({
name: z.string().min(3, {
message: "Connector name must be at least 3 characters.",
}),
api_key: z.string().optional(),
initial_urls: z.string().optional(),
});
// Define the type for the form values
type WebcrawlerConnectorFormValues = z.infer<typeof webcrawlerConnectorFormSchema>;
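// Note: onSubmit below maps these optional fields onto the connector config
// keys FIRECRAWL_API_KEY and INITIAL_URLS; blank values are simply omitted.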
export default function WebcrawlerConnectorPage() {
const router = useRouter();
const params = useParams();
const searchSpaceId = params.search_space_id as string;
const [isSubmitting, setIsSubmitting] = useState(false);
const [doesConnectorExist, setDoesConnectorExist] = useState(false);
const { fetchConnectors, createConnector } = useSearchSourceConnectors(
true,
parseInt(searchSpaceId)
);
// Initialize the form
const form = useForm<WebcrawlerConnectorFormValues>({
resolver: zodResolver(webcrawlerConnectorFormSchema),
defaultValues: {
name: "Web Pages",
api_key: "",
initial_urls: "",
},
});
useEffect(() => {
fetchConnectors(parseInt(searchSpaceId))
.then((data) => {
if (data && Array.isArray(data)) {
const connector = data.find(
(c: SearchSourceConnector) =>
c.connector_type === EnumConnectorName.WEBCRAWLER_CONNECTOR
);
if (connector) {
setDoesConnectorExist(true);
}
}
})
.catch((error) => {
console.error("Error fetching connectors:", error);
});
}, [fetchConnectors, searchSpaceId]);
// Handle form submission
const onSubmit = async (values: WebcrawlerConnectorFormValues) => {
setIsSubmitting(true);
try {
const config: Record<string, string> = {};
// Only add API key to config if provided
if (values.api_key && values.api_key.trim()) {
config.FIRECRAWL_API_KEY = values.api_key;
}
// Parse initial URLs if provided
if (values.initial_urls && values.initial_urls.trim()) {
config.INITIAL_URLS = values.initial_urls;
}
await createConnector(
{
name: values.name,
connector_type: EnumConnectorName.WEBCRAWLER_CONNECTOR,
config: config,
is_indexable: true,
last_indexed_at: null,
periodic_indexing_enabled: false,
indexing_frequency_minutes: null,
next_scheduled_at: null,
},
parseInt(searchSpaceId)
);
toast.success("Webcrawler connector created successfully!");
// Navigate back to connectors page
router.push(`/dashboard/${searchSpaceId}/connectors`);
} catch (error) {
console.error("Error creating connector:", error);
toast.error(error instanceof Error ? error.message : "Failed to create connector");
} finally {
setIsSubmitting(false);
}
};
return (
<div className="container mx-auto py-8 max-w-2xl">
<motion.div
initial={{ opacity: 0, y: 20 }}
animate={{ opacity: 1, y: 0 }}
transition={{ duration: 0.5 }}
>
{/* Header */}
<div className="mb-8">
<Link
href={`/dashboard/${searchSpaceId}/connectors/add`}
className="inline-flex items-center text-sm text-muted-foreground hover:text-foreground mb-4"
>
<ArrowLeft className="mr-2 h-4 w-4" />
Back to connectors
</Link>
<div className="flex items-center gap-4">
<div className="flex h-12 w-12 items-center justify-center rounded-lg">
{getConnectorIcon(EnumConnectorName.WEBCRAWLER_CONNECTOR, "h-6 w-6")}
</div>
<div>
<h1 className="text-3xl font-bold tracking-tight">Connect Web Pages</h1>
<p className="text-muted-foreground">Crawl and index web pages for search.</p>
</div>
</div>
</div>
{/* Connection Card */}
{!doesConnectorExist ? (
<Card>
<CardHeader>
<CardTitle>Set Up Web Page Crawler</CardTitle>
<CardDescription>
Configure your web page crawler to index web pages. Optionally add a Firecrawl API
key for enhanced crawling capabilities.
</CardDescription>
</CardHeader>
<Form {...form}>
<form onSubmit={form.handleSubmit(onSubmit)}>
<CardContent className="space-y-4">
<FormField
control={form.control}
name="name"
render={({ field }) => (
<FormItem>
<FormLabel>Connector Name</FormLabel>
<FormControl>
<Input placeholder="My Web Crawler" {...field} />
</FormControl>
<FormDescription>
A friendly name to identify this connector.
</FormDescription>
<FormMessage />
</FormItem>
)}
/>
<FormField
control={form.control}
name="api_key"
render={({ field }) => (
<FormItem>
<FormLabel>Firecrawl API Key (Optional)</FormLabel>
<FormControl>
<Input type="password" placeholder="fc-xxxxxxxxxxxxx" {...field} />
</FormControl>
<FormDescription>
Add a Firecrawl API key for enhanced crawling. If not provided, the connector
falls back to AsyncChromiumLoader.
</FormDescription>
<FormMessage />
</FormItem>
)}
/>
<FormField
control={form.control}
name="initial_urls"
render={({ field }) => (
<FormItem>
<FormLabel>Initial URLs (Optional)</FormLabel>
<FormControl>
<Textarea
placeholder="https://example.com&#10;https://docs.example.com&#10;https://blog.example.com"
className="min-h-[100px] font-mono text-sm"
{...field}
/>
</FormControl>
<FormDescription>
Enter URLs to crawl (one per line). You can add more URLs later.
</FormDescription>
<FormMessage />
</FormItem>
)}
/>
<div className="space-y-2 pt-2">
<div className="flex items-center space-x-2 text-sm text-muted-foreground">
<Check className="h-4 w-4 text-green-500" />
<span>Crawl any public web page</span>
</div>
<div className="flex items-center space-x-2 text-sm text-muted-foreground">
<Check className="h-4 w-4 text-green-500" />
<span>Extract markdown content automatically</span>
</div>
<div className="flex items-center space-x-2 text-sm text-muted-foreground">
<Check className="h-4 w-4 text-green-500" />
<span>Detect content changes and update documents</span>
</div>
<div className="flex items-center space-x-2 text-sm text-muted-foreground">
<Check className="h-4 w-4 text-green-500" />
<span>Works with or without Firecrawl API key</span>
</div>
</div>
</CardContent>
<CardFooter className="flex justify-between">
<Button
type="button"
variant="outline"
onClick={() => router.push(`/dashboard/${searchSpaceId}/connectors/add`)}
>
Cancel
</Button>
<Button type="submit" disabled={isSubmitting}>
{isSubmitting ? (
<>
<Loader2 className="mr-2 h-4 w-4 animate-spin" />
Setting up...
</>
) : (
<>
<Globe className="mr-2 h-4 w-4" />
Create Crawler
</>
)}
</Button>
</CardFooter>
</form>
</Form>
</Card>
) : (
/* Success Card */
<Card>
<CardHeader>
<CardTitle>Your web page crawler is successfully set up!</CardTitle>
<CardDescription>
You can now add URLs to crawl from the connector management page.
</CardDescription>
</CardHeader>
</Card>
)}
{/* Help Section */}
{!doesConnectorExist && (
<Card className="mt-6">
<CardHeader>
<CardTitle className="text-lg">How It Works</CardTitle>
</CardHeader>
<CardContent className="space-y-4">
<div>
<h4 className="font-medium mb-2">1. Choose Your Crawler Method</h4>
<p className="text-sm text-muted-foreground">
<strong>With Firecrawl (Recommended):</strong> Get your API key from{" "}
<a
href="https://firecrawl.dev"
target="_blank"
rel="noopener noreferrer"
className="text-primary hover:underline"
>
firecrawl.dev
</a>{" "}
for faster, more reliable crawling with better content extraction.
</p>
<p className="text-sm text-muted-foreground mt-2">
<strong>Without Firecrawl:</strong> The crawler will use AsyncChromiumLoader as a
free fallback option. This works well for most websites but may be slower.
</p>
</div>
<div>
<h4 className="font-medium mb-2">2. Add URLs to Crawl (Optional)</h4>
<p className="text-sm text-muted-foreground">
You can add initial URLs now or add them later from the connector management page.
Enter one URL per line.
</p>
</div>
<div>
<h4 className="font-medium mb-2">3. Manage Your Crawler</h4>
<p className="text-sm text-muted-foreground">
After setup, you can add more URLs, trigger manual crawls, or set up periodic
indexing to keep your content up-to-date.
</p>
</div>
</CardContent>
</Card>
)}
</motion.div>
</div>
);
}


@ -1,201 +0,0 @@
"use client";
import { type Tag, TagInput } from "emblor";
import { Globe, Loader2 } from "lucide-react";
import { useParams, useRouter } from "next/navigation";
import { useTranslations } from "next-intl";
import { useState } from "react";
import { toast } from "sonner";
import { Button } from "@/components/ui/button";
import {
Card,
CardContent,
CardDescription,
CardFooter,
CardHeader,
CardTitle,
} from "@/components/ui/card";
import { Label } from "@/components/ui/label";
// URL validation regex
const urlRegex = /^(https?:\/\/)?([\da-z.-]+)\.([a-z.]{2,6})([/\w .-]*)*\/?$/;
export default function WebpageCrawler() {
const t = useTranslations("add_webpage");
const params = useParams();
const router = useRouter();
const search_space_id = params.search_space_id as string;
const [urlTags, setUrlTags] = useState<Tag[]>([]);
const [activeTagIndex, setActiveTagIndex] = useState<number | null>(null);
const [isSubmitting, setIsSubmitting] = useState(false);
const [error, setError] = useState<string | null>(null);
// Function to validate a URL
const isValidUrl = (url: string): boolean => {
return urlRegex.test(url);
};
// Function to handle URL submission
const handleSubmit = async () => {
// Validate that we have at least one URL
if (urlTags.length === 0) {
setError(t("error_no_url"));
return;
}
// Validate all URLs
const invalidUrls = urlTags.filter((tag) => !isValidUrl(tag.text));
if (invalidUrls.length > 0) {
setError(t("error_invalid_urls", { urls: invalidUrls.map((tag) => tag.text).join(", ") }));
return;
}
setError(null);
setIsSubmitting(true);
try {
toast(t("crawling_toast"), {
description: t("crawling_toast_desc"),
});
// Extract URLs from tags
const urls = urlTags.map((tag) => tag.text);
// Make API call to backend
const response = await fetch(
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/documents`,
{
method: "POST",
headers: {
"Content-Type": "application/json",
Authorization: `Bearer ${localStorage.getItem("surfsense_bearer_token")}`,
},
body: JSON.stringify({
document_type: "CRAWLED_URL",
content: urls,
search_space_id: parseInt(search_space_id),
}),
}
);
if (!response.ok) {
throw new Error("Failed to crawl URLs");
}
await response.json();
toast(t("success_toast"), {
description: t("success_toast_desc"),
});
// Redirect to documents page
router.push(`/dashboard/${search_space_id}/documents`);
} catch (error: any) {
setError(error.message || t("error_generic"));
toast(t("error_toast"), {
description: `${t("error_toast_desc")}: ${error.message}`,
});
} finally {
setIsSubmitting(false);
}
};
// Function to add a new URL tag
const handleAddTag = (text: string) => {
// Basic URL validation
if (!isValidUrl(text)) {
toast(t("invalid_url_toast"), {
description: t("invalid_url_toast_desc"),
});
return;
}
// Check for duplicates
if (urlTags.some((tag) => tag.text === text)) {
toast(t("duplicate_url_toast"), {
description: t("duplicate_url_toast_desc"),
});
return;
}
// Add the new tag
const newTag: Tag = {
id: Date.now().toString(),
text: text,
};
setUrlTags([...urlTags, newTag]);
};
return (
<div className="container mx-auto py-8">
<Card className="max-w-2xl mx-auto">
<CardHeader>
<CardTitle className="flex items-center gap-2">
<Globe className="h-5 w-5" />
{t("title")}
</CardTitle>
<CardDescription>{t("subtitle")}</CardDescription>
</CardHeader>
<CardContent>
<div className="space-y-4">
<div className="space-y-2">
<Label htmlFor="url-input">{t("label")}</Label>
<TagInput
id="url-input"
tags={urlTags}
setTags={setUrlTags}
placeholder={t("placeholder")}
onAddTag={handleAddTag}
styleClasses={{
inlineTagsContainer:
"border-input rounded-lg bg-background shadow-sm shadow-black/5 transition-shadow focus-within:border-ring focus-within:outline-none focus-within:ring-[3px] focus-within:ring-ring/20 p-1 gap-1",
input: "w-full min-w-[80px] focus-visible:outline-none shadow-none px-2 h-7",
tag: {
body: "h-7 relative bg-background border border-input hover:bg-background rounded-md font-medium text-xs ps-2 pe-7 flex",
closeButton:
"absolute -inset-y-px -end-px p-0 rounded-e-lg flex size-7 transition-colors outline-0 focus-visible:outline focus-visible:outline-2 focus-visible:outline-ring/70 text-muted-foreground/80 hover:text-foreground",
},
}}
activeTagIndex={activeTagIndex}
setActiveTagIndex={setActiveTagIndex}
/>
<p className="text-xs text-muted-foreground mt-1">{t("hint")}</p>
</div>
{error && <div className="text-sm text-red-500 mt-2">{error}</div>}
<div className="bg-muted/50 rounded-lg p-4 text-sm">
<h4 className="font-medium mb-2">{t("tips_title")}</h4>
<ul className="list-disc pl-5 space-y-1 text-muted-foreground">
<li>{t("tip_1")}</li>
<li>{t("tip_2")}</li>
<li>{t("tip_3")}</li>
<li>{t("tip_4")}</li>
</ul>
</div>
</div>
</CardContent>
<CardFooter className="flex justify-between">
<Button
variant="outline"
onClick={() => router.push(`/dashboard/${search_space_id}/documents`)}
>
{t("cancel")}
</Button>
<Button onClick={handleSubmit} disabled={isSubmitting || urlTags.length === 0}>
{isSubmitting ? (
<>
<Loader2 className="mr-2 h-4 w-4 animate-spin" />
{t("submitting")}
</>
) : (
t("submit")
)}
</Button>
</CardFooter>
</Card>
</div>
);
}


@ -52,6 +52,12 @@ export default function DashboardLayout({
},
],
},
{
title: "Team",
url: `/dashboard/${search_space_id}/team`,
icon: "Users",
items: [],
},
{
title: "Settings",
url: `/dashboard/${search_space_id}/settings`,


@ -1126,7 +1126,7 @@ function LogRowActions({ row, t }: { row: Row<Log>; t: (key: string) => string }
setIsDeleting(true);
try {
await deleteLog(log.id);
toast.success(t("log_deleted_success"));
// toast.success(t("log_deleted_success"));
await refreshLogs();
} catch (error) {
console.error("Error deleting log:", error);


@ -1,9 +1,9 @@
"use client";
import { IconBrandYoutube } from "@tabler/icons-react";
import { Cable, Database, Upload } from "lucide-react";
import { Cable, Database, Globe, Upload } from "lucide-react";
import { motion } from "motion/react";
import { useParams, useSearchParams } from "next/navigation";
import { useParams, useRouter, useSearchParams } from "next/navigation";
import { useEffect, useState } from "react";
import { ConnectorsTab } from "@/components/sources/ConnectorsTab";
import { DocumentUploadTab } from "@/components/sources/DocumentUploadTab";
@ -12,6 +12,7 @@ import { Tabs, TabsContent, TabsList, TabsTrigger } from "@/components/ui/tabs";
export default function AddSourcesPage() {
const params = useParams();
const router = useRouter();
const searchParams = useSearchParams();
const search_space_id = params.search_space_id as string;
const [activeTab, setActiveTab] = useState("documents");
@ -24,6 +25,14 @@ export default function AddSourcesPage() {
}
}, [searchParams]);
const handleTabChange = (value: string) => {
if (value === "webpages") {
router.push(`/dashboard/${search_space_id}/connectors/add/webcrawler-connector`);
} else {
setActiveTab(value);
}
};
return (
<div className="container mx-auto py-8 px-4">
<motion.div
@ -42,19 +51,26 @@ export default function AddSourcesPage() {
</div>
{/* Tabs */}
<Tabs value={activeTab} onValueChange={setActiveTab} className="w-full">
<TabsList className="grid w-full max-w-2xl mx-auto grid-cols-3 h-12">
<Tabs value={activeTab} onValueChange={handleTabChange} className="w-full">
<TabsList className="grid w-full max-w-3xl mx-auto grid-cols-4 h-12">
<TabsTrigger value="documents" className="flex items-center gap-2">
<Upload className="h-4 w-4" />
Documents
<span className="hidden sm:inline">Documents</span>
<span className="sm:hidden">Docs</span>
</TabsTrigger>
<TabsTrigger value="youtube" className="flex items-center gap-2">
<IconBrandYoutube className="h-4 w-4" />
YouTube
</TabsTrigger>
<TabsTrigger value="webpages" className="flex items-center gap-2">
<Globe className="h-4 w-4" />
<span className="hidden sm:inline">Web Pages</span>
<span className="sm:hidden">Web</span>
</TabsTrigger>
<TabsTrigger value="connectors" className="flex items-center gap-2">
<Cable className="h-4 w-4" />
Connectors
<span className="hidden sm:inline">Connectors</span>
<span className="sm:hidden">More</span>
</TabsTrigger>
</TabsList>

File diff suppressed because it is too large

@ -1,6 +1,6 @@
"use client";
import { AlertCircle, Loader2, Plus, Search, Trash2 } from "lucide-react";
import { AlertCircle, Loader2, Plus, Search, Trash2, UserCheck, Users } from "lucide-react";
import { motion, type Variants } from "motion/react";
import Image from "next/image";
import Link from "next/link";
@ -22,6 +22,7 @@ import {
AlertDialogTitle,
AlertDialogTrigger,
} from "@/components/ui/alert-dialog";
import { Badge } from "@/components/ui/badge";
import { Button } from "@/components/ui/button";
import {
Card,
@ -308,16 +309,30 @@ const DashboardPage = () => {
>
<div className="flex flex-1 flex-col justify-between p-1">
<div>
<h3 className="font-medium text-lg">{space.name}</h3>
<div className="flex items-center gap-2">
<h3 className="font-medium text-lg">{space.name}</h3>
{!space.is_owner && (
<Badge variant="secondary" className="text-xs font-normal">
{t("shared")}
</Badge>
)}
</div>
<p className="mt-1 text-sm text-muted-foreground">
{space.description}
</p>
</div>
<div className="mt-4 text-xs text-muted-foreground">
{/* <span>{space.title}</span> */}
<div className="mt-4 flex items-center justify-between text-xs text-muted-foreground">
<span>
{t("created")} {formatDate(space.created_at)}
</span>
<div className="flex items-center gap-1">
{space.is_owner ? (
<UserCheck className="h-3.5 w-3.5" />
) : (
<Users className="h-3.5 w-3.5" />
)}
<span>{space.member_count}</span>
</div>
</div>
</div>
</Link>


@ -0,0 +1,336 @@
"use client";
import {
AlertCircle,
ArrowRight,
CheckCircle2,
Clock,
Loader2,
LogIn,
Shield,
Sparkles,
Users,
XCircle,
} from "lucide-react";
import { motion } from "motion/react";
import Image from "next/image";
import Link from "next/link";
import { useParams, useRouter } from "next/navigation";
import { use, useEffect, useState } from "react";
import { Button } from "@/components/ui/button";
import {
Card,
CardContent,
CardDescription,
CardFooter,
CardHeader,
CardTitle,
} from "@/components/ui/card";
import { useInviteInfo } from "@/hooks/use-rbac";
export default function InviteAcceptPage() {
const params = useParams();
const router = useRouter();
const inviteCode = params.invite_code as string;
const { inviteInfo, loading, acceptInvite } = useInviteInfo(inviteCode);
const [accepting, setAccepting] = useState(false);
const [accepted, setAccepted] = useState(false);
const [acceptedData, setAcceptedData] = useState<{
search_space_id: number;
search_space_name: string;
role_name: string;
} | null>(null);
const [error, setError] = useState<string | null>(null);
const [isLoggedIn, setIsLoggedIn] = useState<boolean | null>(null);
// Check if user is logged in
useEffect(() => {
if (typeof window !== "undefined") {
const token = localStorage.getItem("surfsense_bearer_token");
setIsLoggedIn(!!token);
}
}, []);
const handleAccept = async () => {
setAccepting(true);
setError(null);
try {
const result = await acceptInvite();
if (result) {
setAccepted(true);
setAcceptedData(result);
}
} catch (err: any) {
setError(err.message || "Failed to accept invite");
} finally {
setAccepting(false);
}
};
const handleLoginRedirect = () => {
// Store the invite code to redirect back after login
localStorage.setItem("pending_invite_code", inviteCode);
router.push("/auth");
};
// Check for pending invite after login
useEffect(() => {
if (isLoggedIn && typeof window !== "undefined") {
const pendingInvite = localStorage.getItem("pending_invite_code");
if (pendingInvite === inviteCode) {
localStorage.removeItem("pending_invite_code");
// Auto-accept the invite after redirect
handleAccept();
}
}
}, [isLoggedIn, inviteCode]);
return (
<div className="min-h-screen flex items-center justify-center p-4 bg-gradient-to-br from-background via-background to-primary/5">
{/* Background decoration */}
<div className="absolute inset-0 overflow-hidden pointer-events-none">
<div className="absolute -top-1/2 -right-1/2 w-full h-full bg-gradient-to-bl from-primary/10 via-transparent to-transparent rounded-full blur-3xl" />
<div className="absolute -bottom-1/2 -left-1/2 w-full h-full bg-gradient-to-tr from-violet-500/10 via-transparent to-transparent rounded-full blur-3xl" />
</div>
<motion.div
initial={{ opacity: 0, y: 20, scale: 0.95 }}
animate={{ opacity: 1, y: 0, scale: 1 }}
transition={{ duration: 0.5, ease: "easeOut" }}
className="w-full max-w-md relative z-10"
>
<Card className="border-none shadow-2xl bg-card/80 backdrop-blur-xl">
{loading || isLoggedIn === null ? (
<CardContent className="flex flex-col items-center justify-center py-16">
<motion.div
animate={{ rotate: 360 }}
transition={{ duration: 1, repeat: Infinity, ease: "linear" }}
>
<Loader2 className="h-12 w-12 text-primary" />
</motion.div>
<p className="mt-4 text-muted-foreground">Loading invite details...</p>
</CardContent>
) : accepted && acceptedData ? (
<>
<CardHeader className="text-center pb-4">
<motion.div
initial={{ scale: 0 }}
animate={{ scale: 1 }}
transition={{ type: "spring", stiffness: 200, damping: 15 }}
className="mx-auto mb-4 h-20 w-20 rounded-full bg-gradient-to-br from-emerald-500/20 to-emerald-500/5 flex items-center justify-center ring-4 ring-emerald-500/20"
>
<CheckCircle2 className="h-10 w-10 text-emerald-500" />
</motion.div>
<CardTitle className="text-2xl">Welcome to the team!</CardTitle>
<CardDescription>
You've successfully joined {acceptedData.search_space_name}
</CardDescription>
</CardHeader>
<CardContent className="space-y-4">
<div className="bg-muted/50 rounded-lg p-4 space-y-3">
<div className="flex items-center gap-3">
<div className="h-10 w-10 rounded-lg bg-primary/10 flex items-center justify-center">
<Users className="h-5 w-5 text-primary" />
</div>
<div>
<p className="font-medium">{acceptedData.search_space_name}</p>
<p className="text-sm text-muted-foreground">Search Space</p>
</div>
</div>
<div className="flex items-center gap-3">
<div className="h-10 w-10 rounded-lg bg-violet-500/10 flex items-center justify-center">
<Shield className="h-5 w-5 text-violet-500" />
</div>
<div>
<p className="font-medium">{acceptedData.role_name}</p>
<p className="text-sm text-muted-foreground">Your Role</p>
</div>
</div>
</div>
</CardContent>
<CardFooter>
<Button
className="w-full gap-2"
onClick={() => router.push(`/dashboard/${acceptedData.search_space_id}`)}
>
Go to Search Space
<ArrowRight className="h-4 w-4" />
</Button>
</CardFooter>
</>
) : !inviteInfo?.is_valid ? (
<>
<CardHeader className="text-center pb-4">
<motion.div
initial={{ scale: 0 }}
animate={{ scale: 1 }}
transition={{ type: "spring", stiffness: 200, damping: 15 }}
className="mx-auto mb-4 h-20 w-20 rounded-full bg-gradient-to-br from-destructive/20 to-destructive/5 flex items-center justify-center ring-4 ring-destructive/20"
>
<XCircle className="h-10 w-10 text-destructive" />
</motion.div>
<CardTitle className="text-2xl">Invalid Invite</CardTitle>
<CardDescription>
{inviteInfo?.message || "This invite link is no longer valid"}
</CardDescription>
</CardHeader>
<CardContent className="text-center">
<p className="text-sm text-muted-foreground">
The invite may have expired, reached its maximum uses, or been revoked by the
owner.
</p>
</CardContent>
<CardFooter>
<Button
variant="outline"
className="w-full"
onClick={() => router.push("/dashboard")}
>
Go to Dashboard
</Button>
</CardFooter>
</>
) : !isLoggedIn ? (
<>
<CardHeader className="text-center pb-4">
<motion.div
initial={{ scale: 0 }}
animate={{ scale: 1 }}
transition={{ type: "spring", stiffness: 200, damping: 15 }}
className="mx-auto mb-4 h-20 w-20 rounded-full bg-gradient-to-br from-primary/20 to-primary/5 flex items-center justify-center ring-4 ring-primary/20"
>
<Sparkles className="h-10 w-10 text-primary" />
</motion.div>
<CardTitle className="text-2xl">You're Invited!</CardTitle>
<CardDescription>
Sign in to join {inviteInfo?.search_space_name || "this search space"}
</CardDescription>
</CardHeader>
<CardContent className="space-y-4">
<div className="bg-muted/50 rounded-lg p-4 space-y-3">
<div className="flex items-center gap-3">
<div className="h-10 w-10 rounded-lg bg-primary/10 flex items-center justify-center">
<Users className="h-5 w-5 text-primary" />
</div>
<div>
<p className="font-medium">{inviteInfo?.search_space_name}</p>
<p className="text-sm text-muted-foreground">Search Space</p>
</div>
</div>
{inviteInfo?.role_name && (
<div className="flex items-center gap-3">
<div className="h-10 w-10 rounded-lg bg-violet-500/10 flex items-center justify-center">
<Shield className="h-5 w-5 text-violet-500" />
</div>
<div>
<p className="font-medium">{inviteInfo.role_name}</p>
<p className="text-sm text-muted-foreground">Role you'll receive</p>
</div>
</div>
)}
</div>
</CardContent>
<CardFooter>
<Button className="w-full gap-2" onClick={handleLoginRedirect}>
<LogIn className="h-4 w-4" />
Sign in to Accept
</Button>
</CardFooter>
</>
) : (
<>
<CardHeader className="text-center pb-4">
<motion.div
initial={{ scale: 0 }}
animate={{ scale: 1 }}
transition={{ type: "spring", stiffness: 200, damping: 15 }}
className="mx-auto mb-4 h-20 w-20 rounded-full bg-gradient-to-br from-primary/20 to-primary/5 flex items-center justify-center ring-4 ring-primary/20"
>
<Sparkles className="h-10 w-10 text-primary" />
</motion.div>
<CardTitle className="text-2xl">You're Invited!</CardTitle>
<CardDescription>
Accept this invite to join {inviteInfo?.search_space_name || "this search space"}
</CardDescription>
</CardHeader>
<CardContent className="space-y-4">
<div className="bg-muted/50 rounded-lg p-4 space-y-3">
<div className="flex items-center gap-3">
<div className="h-10 w-10 rounded-lg bg-primary/10 flex items-center justify-center">
<Users className="h-5 w-5 text-primary" />
</div>
<div>
<p className="font-medium">{inviteInfo?.search_space_name}</p>
<p className="text-sm text-muted-foreground">Search Space</p>
</div>
</div>
{inviteInfo?.role_name && (
<div className="flex items-center gap-3">
<div className="h-10 w-10 rounded-lg bg-violet-500/10 flex items-center justify-center">
<Shield className="h-5 w-5 text-violet-500" />
</div>
<div>
<p className="font-medium">{inviteInfo.role_name}</p>
<p className="text-sm text-muted-foreground">Role you'll receive</p>
</div>
</div>
)}
</div>
{error && (
<motion.div
initial={{ opacity: 0, y: -10 }}
animate={{ opacity: 1, y: 0 }}
className="flex items-center gap-2 p-3 bg-destructive/10 text-destructive rounded-lg text-sm"
>
<AlertCircle className="h-4 w-4 shrink-0" />
{error}
</motion.div>
)}
</CardContent>
<CardFooter className="flex gap-2">
<Button
variant="outline"
className="flex-1"
onClick={() => router.push("/dashboard")}
>
Cancel
</Button>
<Button className="flex-1 gap-2" onClick={handleAccept} disabled={accepting}>
{accepting ? (
<>
<Loader2 className="h-4 w-4 animate-spin" />
Accepting...
</>
) : (
<>
<CheckCircle2 className="h-4 w-4" />
Accept Invite
</>
)}
</Button>
</CardFooter>
</>
)}
</Card>
{/* Branding */}
<motion.div
initial={{ opacity: 0 }}
animate={{ opacity: 1 }}
transition={{ delay: 0.3 }}
className="mt-6 text-center"
>
<Link
href="/"
className="inline-flex items-center gap-2 text-muted-foreground hover:text-foreground transition-colors"
>
<Image src="/icon-128.png" alt="SurfSense" width={24} height={24} className="rounded" />
<span className="text-sm font-medium">SurfSense</span>
</Link>
</motion.div>
</motion.div>
</div>
);
}
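
The `pending_invite_code` handshake above only completes if the auth flow returns the user to this page. A minimal sketch of the counterpart on the login success path (hypothetical; the actual auth page is not part of this diff):

```tsx
// Hypothetical post-login step, assuming the auth page has just stored the
// bearer token and now checks for a pending invite saved by handleLoginRedirect.
// The /invite/<code> path is inferred from this page's params.invite_code.
function redirectAfterLogin(router: { push: (path: string) => void }) {
	const pending = localStorage.getItem("pending_invite_code");
	router.push(pending ? `/invite/${pending}` : "/dashboard");
}
```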


@ -1,8 +1,8 @@
"use client";
import { ChatInput } from "@llamaindex/chat-ui";
import { Brain, Check, FolderOpen, Minus, Plus, Zap } from "lucide-react";
import { useParams } from "next/navigation";
import { Brain, Check, FolderOpen, Minus, Plus, PlusCircle, Zap } from "lucide-react";
import { useParams, useRouter } from "next/navigation";
import React, { Suspense, useCallback, useState } from "react";
import { DocumentsDataTable } from "@/components/chat/DocumentsDataTable";
import { Badge } from "@/components/ui/badge";
@ -115,6 +115,7 @@ const ConnectorSelector = React.memo(
selectedConnectors?: string[];
}) => {
const { search_space_id } = useParams();
const router = useRouter();
const [isOpen, setIsOpen] = useState(false);
// Fetch immediately (not lazy) so the button can show the correct count
@ -247,9 +248,19 @@ const ConnectorSelector = React.memo(
<Brain className="h-8 w-8 text-muted-foreground" />
</div>
<h4 className="text-sm font-medium mb-1">No sources found</h4>
<p className="text-xs text-muted-foreground max-w-xs">
<p className="text-xs text-muted-foreground max-w-xs mb-4">
Add documents or configure search connectors for this search space
</p>
<Button
onClick={() => {
setIsOpen(false);
router.push(`/dashboard/${search_space_id}/sources/add`);
}}
className="gap-2"
>
<PlusCircle className="h-4 w-4" />
Add Sources
</Button>
</div>
) : (
<>


@ -188,6 +188,7 @@ export function DashboardBreadcrumb() {
"linkup-api": "LinkUp API",
"luma-connector": "Luma",
"elasticsearch-connector": "Elasticsearch",
"webcrawler-connector": "Web Pages",
};
const connectorLabel = connectorLabels[connectorType] || connectorType;


@ -52,5 +52,7 @@ export const editConnectorSchema = z.object({
GOOGLE_CALENDAR_CALENDAR_IDS: z.string().optional(),
LUMA_API_KEY: z.string().optional(),
ELASTICSEARCH_API_KEY: z.string().optional(),
FIRECRAWL_API_KEY: z.string().optional(),
INITIAL_URLS: z.string().optional(),
});
export type EditConnectorFormValues = z.infer<typeof editConnectorSchema>;


@ -29,6 +29,7 @@ const INTEGRATIONS: Integration[] = [
// Documentation & Knowledge
{ name: "Confluence", icon: "https://cdn.simpleicons.org/confluence/172B4D" },
{ name: "Notion", icon: "https://cdn.simpleicons.org/notion/000000/ffffff" },
{ name: "Web Pages", icon: "https://cdn.jsdelivr.net/npm/lucide-static@0.294.0/icons/globe.svg" },
// Cloud Storage
{ name: "Google Drive", icon: "https://cdn.simpleicons.org/googledrive/4285F4" },


@ -8,6 +8,8 @@ import {
FileText,
MessageSquare,
Sparkles,
UserPlus,
Users,
Zap,
} from "lucide-react";
import { motion } from "motion/react";
@ -50,16 +52,60 @@ export function CompletionStep({ searchSpaceId }: CompletionStepProps) {
<p className="text-muted-foreground">Choose an option to continue</p>
</div>
<div className="grid grid-cols-1 md:grid-cols-2 gap-6">
<div className="grid grid-cols-1 md:grid-cols-3 gap-6">
{/* Manage Team Card */}
<motion.div
initial={{ opacity: 0, y: 20 }}
animate={{ opacity: 1, y: 0 }}
transition={{ delay: 0.9, type: "spring", stiffness: 300, damping: 25 }}
>
<Card className="h-full border-2 hover:border-emerald-500/50 transition-all duration-300 hover:shadow-xl hover:shadow-emerald-500/10 cursor-pointer group relative overflow-hidden">
<div className="absolute top-0 right-0 w-32 h-32 bg-gradient-to-br from-emerald-500/10 to-transparent rounded-full blur-2xl -mr-16 -mt-16 group-hover:scale-150 transition-transform duration-500" />
<CardHeader className="relative">
<div className="w-12 h-12 bg-gradient-to-br from-emerald-500/20 to-emerald-600/10 rounded-xl flex items-center justify-center mb-3 group-hover:scale-110 group-hover:rotate-3 transition-all duration-300 ring-1 ring-emerald-500/20">
<Users className="w-6 h-6 text-emerald-600 dark:text-emerald-400" />
</div>
<CardTitle className="text-lg">Manage Team</CardTitle>
<CardDescription>
Invite team members and collaborate on your search space
</CardDescription>
</CardHeader>
<CardContent className="space-y-4 relative">
<div className="space-y-2 text-sm text-muted-foreground">
<div className="flex items-center gap-2">
<UserPlus className="w-4 h-4 text-emerald-500" />
<span>Invite team members</span>
</div>
<div className="flex items-center gap-2">
<CheckCircle className="w-4 h-4 text-emerald-500" />
<span>Assign roles & permissions</span>
</div>
<div className="flex items-center gap-2">
<CheckCircle className="w-4 h-4 text-emerald-500" />
<span>Collaborate together</span>
</div>
</div>
<Button
className="w-full bg-emerald-600 hover:bg-emerald-700 text-white group-hover:shadow-lg group-hover:shadow-emerald-500/25 transition-all duration-300"
onClick={() => router.push(`/dashboard/${searchSpaceId}/team`)}
>
Manage Team
<ArrowRight className="w-4 h-4 ml-2 group-hover:translate-x-1 transition-transform" />
</Button>
</CardContent>
</Card>
</motion.div>
{/* Add Sources Card */}
<motion.div
initial={{ opacity: 0, x: -20 }}
animate={{ opacity: 1, x: 0 }}
transition={{ delay: 0.7 }}
initial={{ opacity: 0, y: 20 }}
animate={{ opacity: 1, y: 0 }}
transition={{ delay: 0.7, type: "spring", stiffness: 300, damping: 25 }}
>
<Card className="h-full border-2 hover:border-primary/50 transition-all hover:shadow-lg cursor-pointer group">
<CardHeader>
<div className="w-12 h-12 bg-blue-100 dark:bg-blue-950 rounded-lg flex items-center justify-center mb-3 group-hover:scale-110 transition-transform">
<Card className="h-full border-2 hover:border-blue-500/50 transition-all duration-300 hover:shadow-xl hover:shadow-blue-500/10 cursor-pointer group relative overflow-hidden">
<div className="absolute top-0 right-0 w-32 h-32 bg-gradient-to-br from-blue-500/10 to-transparent rounded-full blur-2xl -mr-16 -mt-16 group-hover:scale-150 transition-transform duration-500" />
<CardHeader className="relative">
<div className="w-12 h-12 bg-gradient-to-br from-blue-500/20 to-blue-600/10 rounded-xl flex items-center justify-center mb-3 group-hover:scale-110 group-hover:rotate-3 transition-all duration-300 ring-1 ring-blue-500/20">
<FileText className="w-6 h-6 text-blue-600 dark:text-blue-400" />
</div>
<CardTitle className="text-lg">Add Sources</CardTitle>
@ -67,27 +113,27 @@ export function CompletionStep({ searchSpaceId }: CompletionStepProps) {
Connect your data sources to start building your knowledge base
</CardDescription>
</CardHeader>
<CardContent className="space-y-4">
<CardContent className="space-y-4 relative">
<div className="space-y-2 text-sm text-muted-foreground">
<div className="flex items-center gap-2">
<CheckCircle className="w-4 h-4 text-green-600" />
<CheckCircle className="w-4 h-4 text-emerald-500" />
<span>Connect documents and files</span>
</div>
<div className="flex items-center gap-2">
<CheckCircle className="w-4 h-4 text-green-600" />
<CheckCircle className="w-4 h-4 text-emerald-500" />
<span>Import from various sources</span>
</div>
<div className="flex items-center gap-2">
<CheckCircle className="w-4 h-4 text-green-600" />
<CheckCircle className="w-4 h-4 text-emerald-500" />
<span>Build your knowledge base</span>
</div>
</div>
<Button
className="w-full group-hover:bg-primary/90"
className="w-full bg-blue-600 hover:bg-blue-700 text-white group-hover:shadow-lg group-hover:shadow-blue-500/25 transition-all duration-300"
onClick={() => router.push(`/dashboard/${searchSpaceId}/sources/add`)}
>
Add Sources
<ArrowRight className="w-4 h-4 ml-2" />
<ArrowRight className="w-4 h-4 ml-2 group-hover:translate-x-1 transition-transform" />
</Button>
</CardContent>
</Card>
@ -95,13 +141,14 @@ export function CompletionStep({ searchSpaceId }: CompletionStepProps) {
{/* Start Chatting Card */}
<motion.div
initial={{ opacity: 0, x: 20 }}
animate={{ opacity: 1, x: 0 }}
transition={{ delay: 0.8 }}
initial={{ opacity: 0, y: 20 }}
animate={{ opacity: 1, y: 0 }}
transition={{ delay: 0.8, type: "spring", stiffness: 300, damping: 25 }}
>
<Card className="h-full border-2 hover:border-primary/50 transition-all hover:shadow-lg cursor-pointer group">
<CardHeader>
<div className="w-12 h-12 bg-purple-100 dark:bg-purple-950 rounded-lg flex items-center justify-center mb-3 group-hover:scale-110 transition-transform">
<Card className="h-full border-2 hover:border-purple-500/50 transition-all duration-300 hover:shadow-xl hover:shadow-purple-500/10 cursor-pointer group relative overflow-hidden">
<div className="absolute top-0 right-0 w-32 h-32 bg-gradient-to-br from-purple-500/10 to-transparent rounded-full blur-2xl -mr-16 -mt-16 group-hover:scale-150 transition-transform duration-500" />
<CardHeader className="relative">
<div className="w-12 h-12 bg-gradient-to-br from-purple-500/20 to-purple-600/10 rounded-xl flex items-center justify-center mb-3 group-hover:scale-110 group-hover:rotate-3 transition-all duration-300 ring-1 ring-purple-500/20">
<MessageSquare className="w-6 h-6 text-purple-600 dark:text-purple-400" />
</div>
<CardTitle className="text-lg">Start Chatting</CardTitle>
@ -109,27 +156,27 @@ export function CompletionStep({ searchSpaceId }: CompletionStepProps) {
Jump right into the AI researcher and start asking questions
</CardDescription>
</CardHeader>
<CardContent className="space-y-4">
<CardContent className="space-y-4 relative">
<div className="space-y-2 text-sm text-muted-foreground">
<div className="flex items-center gap-2">
<CheckCircle className="w-4 h-4 text-green-600" />
<CheckCircle className="w-4 h-4 text-emerald-500" />
<span>AI-powered conversations</span>
</div>
<div className="flex items-center gap-2">
<CheckCircle className="w-4 h-4 text-green-600" />
<CheckCircle className="w-4 h-4 text-emerald-500" />
<span>Research and explore topics</span>
</div>
<div className="flex items-center gap-2">
<CheckCircle className="w-4 h-4 text-green-600" />
<CheckCircle className="w-4 h-4 text-emerald-500" />
<span>Get instant insights</span>
</div>
</div>
<Button
className="w-full group-hover:bg-primary/90"
className="w-full bg-purple-600 hover:bg-purple-700 text-white group-hover:shadow-lg group-hover:shadow-purple-500/25 transition-all duration-300"
onClick={() => router.push(`/dashboard/${searchSpaceId}/researcher`)}
>
Start Chatting
<ArrowRight className="w-4 h-4 ml-2" />
<ArrowRight className="w-4 h-4 ml-2 group-hover:translate-x-1 transition-transform" />
</Button>
</CardContent>
</Card>


@ -16,7 +16,8 @@ const demoPlans = [
"Podcasts support with local TTS providers.",
"Connects with 15+ external sources.",
"Cross-Browser Extension for dynamic webpages including authenticated content",
"Upcoming: Mergeable MindMaps",
"Role-based access permissions",
"Collaboration and multiplayer features",
"Upcoming: Note Management",
],
description: "Open source version with powerful features",
@ -32,9 +33,10 @@ const demoPlans = [
features: [
"Everything in Community",
"Priority Support",
"Role-based access permissions",
"Collaboration and multiplayer features",
"Advanced security features",
"Audit logs and compliance",
"SSO, OIDC & SAML",
"SLA guarantee",
],
description: "For large organizations with specific needs",
buttonText: "Contact Sales",


@ -413,19 +413,6 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) {
</div>
</CardHeader>
<CardContent className="space-y-4">
<div className="space-y-2">
<div className="text-sm text-muted-foreground">
<strong>Use cases:</strong> {role.examples}
</div>
<div className="flex flex-wrap gap-1">
{role.characteristics.map((char, idx) => (
<Badge key={idx} variant="outline" className="text-xs">
{char}
</Badge>
))}
</div>
</div>
<div className="space-y-2">
<Label className="text-sm font-medium">Assign LLM Configuration:</Label>
<Select


@ -17,6 +17,7 @@ import {
SquareTerminal,
Trash2,
Undo2,
Users,
} from "lucide-react";
import Image from "next/image";
import Link from "next/link";
@ -54,6 +55,7 @@ export const iconMap: Record<string, LucideIcon> = {
Trash2,
Podcast,
FileText,
Users,
};
const defaultData = {


@ -43,6 +43,7 @@ export function NavMain({ items }: { items: NavItem[] }) {
Podcasts: "podcasts",
Logs: "logs",
Platform: "platform",
Team: "team",
};
const key = titleMap[title];


@ -19,11 +19,14 @@ interface ConnectorsTabProps {
export function ConnectorsTab({ searchSpaceId }: ConnectorsTabProps) {
const t = useTranslations("add_connector");
const [expandedCategories, setExpandedCategories] = useState<string[]>([
"search-engines",
"knowledge-bases",
"web-search",
"messaging",
"project-management",
"team-chats",
"communication",
"documentation",
"development",
"databases",
"productivity",
"web-crawling",
]);
const toggleCategory = (categoryId: string) => {


@ -5,8 +5,21 @@ import type { ConnectorCategory } from "./types";
export const connectorCategories: ConnectorCategory[] = [
{
id: "search-engines",
title: "search_engines",
id: "web-crawling",
title: "web_crawling",
connectors: [
{
id: "webcrawler-connector",
title: "Web Pages",
description: "webcrawler_desc",
icon: getConnectorIcon(EnumConnectorName.WEBCRAWLER_CONNECTOR, "h-6 w-6"),
status: "available",
},
],
},
{
id: "web-search",
title: "web_search",
connectors: [
{
id: "tavily-api",
@ -29,13 +42,6 @@ export const connectorCategories: ConnectorCategory[] = [
icon: getConnectorIcon(EnumConnectorName.LINKUP_API, "h-6 w-6"),
status: "available",
},
{
id: "elasticsearch-connector",
title: "Elasticsearch",
description: "elasticsearch_desc",
icon: getConnectorIcon(EnumConnectorName.ELASTICSEARCH_CONNECTOR, "h-6 w-6"),
status: "available",
},
{
id: "baidu-search-api",
title: "Baidu Search",
@ -46,8 +52,8 @@ export const connectorCategories: ConnectorCategory[] = [
],
},
{
id: "team-chats",
title: "team_chats",
id: "messaging",
title: "messaging",
connectors: [
{
id: "slack-connector",
@ -56,13 +62,6 @@ export const connectorCategories: ConnectorCategory[] = [
icon: getConnectorIcon(EnumConnectorName.SLACK_CONNECTOR, "h-6 w-6"),
status: "available",
},
{
id: "ms-teams",
title: "Microsoft Teams",
description: "teams_desc",
icon: <IconBrandWindows className="h-6 w-6" />,
status: "coming-soon",
},
{
id: "discord-connector",
title: "Discord",
@ -70,6 +69,13 @@ export const connectorCategories: ConnectorCategory[] = [
icon: getConnectorIcon(EnumConnectorName.DISCORD_CONNECTOR, "h-6 w-6"),
status: "available",
},
{
id: "ms-teams",
title: "Microsoft Teams",
description: "teams_desc",
icon: <IconBrandWindows className="h-6 w-6" />,
status: "coming-soon",
},
],
},
{
@ -100,8 +106,8 @@ export const connectorCategories: ConnectorCategory[] = [
],
},
{
id: "knowledge-bases",
title: "knowledge_bases",
id: "documentation",
title: "documentation",
connectors: [
{
id: "notion-connector",
@ -110,6 +116,19 @@ export const connectorCategories: ConnectorCategory[] = [
icon: getConnectorIcon(EnumConnectorName.NOTION_CONNECTOR, "h-6 w-6"),
status: "available",
},
{
id: "confluence-connector",
title: "Confluence",
description: "confluence_desc",
icon: getConnectorIcon(EnumConnectorName.CONFLUENCE_CONNECTOR, "h-6 w-6"),
status: "available",
},
],
},
{
id: "development",
title: "development",
connectors: [
{
id: "github-connector",
title: "GitHub",
@ -117,11 +136,17 @@ export const connectorCategories: ConnectorCategory[] = [
icon: getConnectorIcon(EnumConnectorName.GITHUB_CONNECTOR, "h-6 w-6"),
status: "available",
},
],
},
{
id: "databases",
title: "databases",
connectors: [
{
id: "confluence-connector",
title: "Confluence",
description: "confluence_desc",
icon: getConnectorIcon(EnumConnectorName.CONFLUENCE_CONNECTOR, "h-6 w-6"),
id: "elasticsearch-connector",
title: "Elasticsearch",
description: "elasticsearch_desc",
icon: getConnectorIcon(EnumConnectorName.ELASTICSEARCH_CONNECTOR, "h-6 w-6"),
status: "available",
},
{
@ -131,18 +156,11 @@ export const connectorCategories: ConnectorCategory[] = [
icon: getConnectorIcon(EnumConnectorName.AIRTABLE_CONNECTOR, "h-6 w-6"),
status: "available",
},
{
id: "luma-connector",
title: "Luma",
description: "luma_desc",
icon: getConnectorIcon(EnumConnectorName.LUMA_CONNECTOR, "h-6 w-6"),
status: "available",
},
],
},
{
id: "communication",
title: "communication",
id: "productivity",
title: "productivity",
connectors: [
{
id: "google-calendar-connector",
@ -158,6 +176,13 @@ export const connectorCategories: ConnectorCategory[] = [
icon: getConnectorIcon(EnumConnectorName.GOOGLE_GMAIL_CONNECTOR, "h-6 w-6"),
status: "available",
},
{
id: "luma-connector",
title: "Luma",
description: "luma_desc",
icon: getConnectorIcon(EnumConnectorName.LUMA_CONNECTOR, "h-6 w-6"),
status: "available",
},
{
id: "zoom",
title: "Zoom",


@ -97,7 +97,7 @@ Before you begin, ensure you have:
| STT_SERVICE | Speech-to-Text API provider for Audio Files (e.g., `local/base`, `openai/whisper-1`). See [supported providers](https://docs.litellm.ai/docs/audio_transcription#supported-providers) |
| STT_SERVICE_API_KEY | (Optional if local) API key for the Speech-to-Text service |
| STT_SERVICE_API_BASE | (Optional) Custom API base URL for the Speech-to-Text service |
| FIRECRAWL_API_KEY | API key for Firecrawl service for web crawling |
| ETL_SERVICE | Document parsing service: `UNSTRUCTURED` (supports 34+ formats), `LLAMACLOUD` (supports 50+ formats including legacy document types), or `DOCLING` (local processing, supports PDF, Office docs, images, HTML, CSV) |
| UNSTRUCTURED_API_KEY | API key for Unstructured.io service for document parsing (required if ETL_SERVICE=UNSTRUCTURED) |
| LLAMA_CLOUD_API_KEY | API key for LlamaCloud service for document parsing (required if ETL_SERVICE=LLAMACLOUD) |


@ -17,4 +17,5 @@ export enum EnumConnectorName {
AIRTABLE_CONNECTOR = "AIRTABLE_CONNECTOR",
LUMA_CONNECTOR = "LUMA_CONNECTOR",
ELASTICSEARCH_CONNECTOR = "ELASTICSEARCH_CONNECTOR",
WEBCRAWLER_CONNECTOR = "WEBCRAWLER_CONNECTOR",
}


@ -59,11 +59,13 @@ export const getConnectorIcon = (connectorType: EnumConnectorName | string, clas
return <IconSparkles {...iconProps} />;
case EnumConnectorName.ELASTICSEARCH_CONNECTOR:
return <IconBrandElastic {...iconProps} />;
case EnumConnectorName.WEBCRAWLER_CONNECTOR:
return <Globe {...iconProps} />;
// Additional cases for non-enum connector types
case "YOUTUBE_VIDEO":
return <IconBrandYoutube {...iconProps} />;
case "CRAWLED_URL":
return <Globe {...iconProps} />;
case "YOUTUBE_VIDEO":
return <IconBrandYoutube {...iconProps} />;
case "FILE":
return <File {...iconProps} />;
case "EXTENSION":


@ -1,5 +1,6 @@
export * from "./use-document-by-chunk";
export * from "./use-logs";
export * from "./use-rbac";
export * from "./use-search-source-connectors";
export * from "./use-search-space";
export * from "./use-user";


@ -97,6 +97,8 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: string)
JIRA_API_TOKEN: "",
LUMA_API_KEY: "",
ELASTICSEARCH_API_KEY: "",
FIRECRAWL_API_KEY: "",
INITIAL_URLS: "",
},
});
@ -142,6 +144,8 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: string)
JIRA_API_TOKEN: config.JIRA_API_TOKEN || "",
LUMA_API_KEY: config.LUMA_API_KEY || "",
ELASTICSEARCH_API_KEY: config.ELASTICSEARCH_API_KEY || "",
FIRECRAWL_API_KEY: config.FIRECRAWL_API_KEY || "",
INITIAL_URLS: config.INITIAL_URLS || "",
});
if (currentConnector.connector_type === "GITHUB_CONNECTOR") {
const savedRepos = config.repo_full_names || [];
@ -469,6 +473,35 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: string)
newConfig = { ELASTICSEARCH_API_KEY: formData.ELASTICSEARCH_API_KEY };
}
break;
case "WEBCRAWLER_CONNECTOR":
if (
formData.FIRECRAWL_API_KEY !== originalConfig.FIRECRAWL_API_KEY ||
formData.INITIAL_URLS !== originalConfig.INITIAL_URLS
) {
newConfig = {};
if (formData.FIRECRAWL_API_KEY && formData.FIRECRAWL_API_KEY.trim()) {
if (!formData.FIRECRAWL_API_KEY.startsWith("fc-")) {
toast.warning(
"Firecrawl API keys typically start with 'fc-'. Please verify your key."
);
}
newConfig.FIRECRAWL_API_KEY = formData.FIRECRAWL_API_KEY.trim();
} else if (originalConfig.FIRECRAWL_API_KEY) {
toast.info(
"Firecrawl API key removed. Web crawler will use AsyncChromiumLoader as fallback."
);
}
if (formData.INITIAL_URLS !== undefined) {
if (formData.INITIAL_URLS && formData.INITIAL_URLS.trim()) {
newConfig.INITIAL_URLS = formData.INITIAL_URLS.trim();
} else if (originalConfig.INITIAL_URLS) {
toast.info("URLs removed from crawler configuration.");
}
}
}
break;
}
if (newConfig !== null) {
@ -562,6 +595,9 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: string)
"ELASTICSEARCH_API_KEY",
newlySavedConfig.ELASTICSEARCH_API_KEY || ""
);
} else if (connector.connector_type === "WEBCRAWLER_CONNECTOR") {
editForm.setValue("FIRECRAWL_API_KEY", newlySavedConfig.FIRECRAWL_API_KEY || "");
editForm.setValue("INITIAL_URLS", newlySavedConfig.INITIAL_URLS || "");
}
}
if (connector.connector_type === "GITHUB_CONNECTOR") {


@ -0,0 +1,773 @@
"use client";
import { useCallback, useEffect, useMemo, useState } from "react";
import { toast } from "sonner";
// ============ Types ============
export interface Role {
id: number;
name: string;
description: string | null;
permissions: string[];
is_default: boolean;
is_system_role: boolean;
search_space_id: number;
created_at: string;
}
export interface Member {
id: number;
user_id: string;
search_space_id: number;
role_id: number | null;
is_owner: boolean;
joined_at: string;
created_at: string;
role: Role | null;
user_email: string | null;
}
export interface Invite {
id: number;
invite_code: string;
search_space_id: number;
role_id: number | null;
created_by_id: string | null;
expires_at: string | null;
max_uses: number | null;
uses_count: number;
is_active: boolean;
name: string | null;
created_at: string;
role: Role | null;
}
export interface InviteCreate {
name?: string;
role_id?: number;
expires_at?: string;
max_uses?: number;
}
export interface InviteUpdate {
name?: string;
role_id?: number;
expires_at?: string;
max_uses?: number;
is_active?: boolean;
}
export interface RoleCreate {
name: string;
description?: string;
permissions: string[];
is_default?: boolean;
}
export interface RoleUpdate {
name?: string;
description?: string;
permissions?: string[];
is_default?: boolean;
}
export interface PermissionInfo {
value: string;
name: string;
category: string;
}
export interface UserAccess {
search_space_id: number;
search_space_name: string;
is_owner: boolean;
role_name: string | null;
permissions: string[];
}
export interface InviteInfo {
search_space_name: string;
role_name: string | null;
is_valid: boolean;
message: string | null;
}
// ============ Members Hook ============
export function useMembers(searchSpaceId: number) {
const [members, setMembers] = useState<Member[]>([]);
const [loading, setLoading] = useState(true);
const [error, setError] = useState<string | null>(null);
const fetchMembers = useCallback(async () => {
if (!searchSpaceId) return;
try {
setLoading(true);
const response = await fetch(
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/searchspaces/${searchSpaceId}/members`,
{
headers: {
Authorization: `Bearer ${localStorage.getItem("surfsense_bearer_token")}`,
},
method: "GET",
}
);
if (response.status === 401) {
localStorage.removeItem("surfsense_bearer_token");
window.location.href = "/";
throw new Error("Unauthorized");
}
if (!response.ok) {
const errorData = await response.json().catch(() => ({}));
throw new Error(errorData.detail || "Failed to fetch members");
}
const data = await response.json();
setMembers(data);
setError(null);
return data;
} catch (err: any) {
setError(err.message || "Failed to fetch members");
console.error("Error fetching members:", err);
} finally {
setLoading(false);
}
}, [searchSpaceId]);
useEffect(() => {
fetchMembers();
}, [fetchMembers]);
const updateMemberRole = useCallback(
async (membershipId: number, roleId: number | null) => {
try {
const response = await fetch(
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/searchspaces/${searchSpaceId}/members/${membershipId}`,
{
headers: {
"Content-Type": "application/json",
Authorization: `Bearer ${localStorage.getItem("surfsense_bearer_token")}`,
},
method: "PUT",
body: JSON.stringify({ role_id: roleId }),
}
);
if (!response.ok) {
const errorData = await response.json().catch(() => ({}));
throw new Error(errorData.detail || "Failed to update member role");
}
const updatedMember = await response.json();
setMembers((prev) => prev.map((m) => (m.id === membershipId ? updatedMember : m)));
toast.success("Member role updated successfully");
return updatedMember;
} catch (err: any) {
toast.error(err.message || "Failed to update member role");
throw err;
}
},
[searchSpaceId]
);
const removeMember = useCallback(
async (membershipId: number) => {
try {
const response = await fetch(
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/searchspaces/${searchSpaceId}/members/${membershipId}`,
{
headers: {
Authorization: `Bearer ${localStorage.getItem("surfsense_bearer_token")}`,
},
method: "DELETE",
}
);
if (!response.ok) {
const errorData = await response.json().catch(() => ({}));
throw new Error(errorData.detail || "Failed to remove member");
}
setMembers((prev) => prev.filter((m) => m.id !== membershipId));
toast.success("Member removed successfully");
return true;
} catch (err: any) {
toast.error(err.message || "Failed to remove member");
return false;
}
},
[searchSpaceId]
);
const leaveSearchSpace = useCallback(async () => {
try {
const response = await fetch(
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/searchspaces/${searchSpaceId}/members/me`,
{
headers: {
Authorization: `Bearer ${localStorage.getItem("surfsense_bearer_token")}`,
},
method: "DELETE",
}
);
if (!response.ok) {
const errorData = await response.json().catch(() => ({}));
throw new Error(errorData.detail || "Failed to leave search space");
}
toast.success("Successfully left the search space");
return true;
} catch (err: any) {
toast.error(err.message || "Failed to leave search space");
return false;
}
}, [searchSpaceId]);
return {
members,
loading,
error,
fetchMembers,
updateMemberRole,
removeMember,
leaveSearchSpace,
};
}
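
A minimal usage sketch for the hook above (the role id `2` is illustrative; real ids come from `useRoles` below):

```tsx
// Sketch: listing members and wiring role updates and removal to the hook.
import { useMembers } from "@/hooks/use-rbac";

function TeamList({ searchSpaceId }: { searchSpaceId: number }) {
	const { members, loading, updateMemberRole, removeMember } = useMembers(searchSpaceId);
	if (loading) return null;
	return (
		<ul>
			{members.map((m) => (
				<li key={m.id}>
					{m.user_email ?? m.user_id}: {m.role?.name ?? (m.is_owner ? "Owner" : "No role")}
					<button type="button" onClick={() => updateMemberRole(m.id, 2)}>Promote</button>
					<button type="button" onClick={() => removeMember(m.id)}>Remove</button>
				</li>
			))}
		</ul>
	);
}
```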
// ============ Roles Hook ============
export function useRoles(searchSpaceId: number) {
const [roles, setRoles] = useState<Role[]>([]);
const [loading, setLoading] = useState(true);
const [error, setError] = useState<string | null>(null);
const fetchRoles = useCallback(async () => {
if (!searchSpaceId) return;
try {
setLoading(true);
const response = await fetch(
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/searchspaces/${searchSpaceId}/roles`,
{
headers: {
Authorization: `Bearer ${localStorage.getItem("surfsense_bearer_token")}`,
},
method: "GET",
}
);
if (response.status === 401) {
localStorage.removeItem("surfsense_bearer_token");
window.location.href = "/";
throw new Error("Unauthorized");
}
if (!response.ok) {
const errorData = await response.json().catch(() => ({}));
throw new Error(errorData.detail || "Failed to fetch roles");
}
const data = await response.json();
setRoles(data);
setError(null);
return data;
} catch (err: any) {
setError(err.message || "Failed to fetch roles");
console.error("Error fetching roles:", err);
} finally {
setLoading(false);
}
}, [searchSpaceId]);
useEffect(() => {
fetchRoles();
}, [fetchRoles]);
const createRole = useCallback(
async (roleData: RoleCreate) => {
try {
const response = await fetch(
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/searchspaces/${searchSpaceId}/roles`,
{
headers: {
"Content-Type": "application/json",
Authorization: `Bearer ${localStorage.getItem("surfsense_bearer_token")}`,
},
method: "POST",
body: JSON.stringify(roleData),
}
);
if (!response.ok) {
const errorData = await response.json().catch(() => ({}));
throw new Error(errorData.detail || "Failed to create role");
}
const newRole = await response.json();
setRoles((prev) => [...prev, newRole]);
toast.success("Role created successfully");
return newRole;
} catch (err: any) {
toast.error(err.message || "Failed to create role");
throw err;
}
},
[searchSpaceId]
);
const updateRole = useCallback(
async (roleId: number, roleData: RoleUpdate) => {
try {
const response = await fetch(
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/searchspaces/${searchSpaceId}/roles/${roleId}`,
{
headers: {
"Content-Type": "application/json",
Authorization: `Bearer ${localStorage.getItem("surfsense_bearer_token")}`,
},
method: "PUT",
body: JSON.stringify(roleData),
}
);
if (!response.ok) {
const errorData = await response.json().catch(() => ({}));
throw new Error(errorData.detail || "Failed to update role");
}
const updatedRole = await response.json();
setRoles((prev) => prev.map((r) => (r.id === roleId ? updatedRole : r)));
toast.success("Role updated successfully");
return updatedRole;
} catch (err: any) {
toast.error(err.message || "Failed to update role");
throw err;
}
},
[searchSpaceId]
);
const deleteRole = useCallback(
async (roleId: number) => {
try {
const response = await fetch(
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/searchspaces/${searchSpaceId}/roles/${roleId}`,
{
headers: {
Authorization: `Bearer ${localStorage.getItem("surfsense_bearer_token")}`,
},
method: "DELETE",
}
);
if (!response.ok) {
const errorData = await response.json().catch(() => ({}));
throw new Error(errorData.detail || "Failed to delete role");
}
setRoles((prev) => prev.filter((r) => r.id !== roleId));
toast.success("Role deleted successfully");
return true;
} catch (err: any) {
toast.error(err.message || "Failed to delete role");
return false;
}
},
[searchSpaceId]
);
return {
roles,
loading,
error,
fetchRoles,
createRole,
updateRole,
deleteRole,
};
}
// ============ Invites Hook ============
export function useInvites(searchSpaceId: number) {
const [invites, setInvites] = useState<Invite[]>([]);
const [loading, setLoading] = useState(true);
const [error, setError] = useState<string | null>(null);
const fetchInvites = useCallback(async () => {
if (!searchSpaceId) return;
try {
setLoading(true);
const response = await fetch(
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/searchspaces/${searchSpaceId}/invites`,
{
headers: {
Authorization: `Bearer ${localStorage.getItem("surfsense_bearer_token")}`,
},
method: "GET",
}
);
if (response.status === 401) {
localStorage.removeItem("surfsense_bearer_token");
window.location.href = "/";
throw new Error("Unauthorized");
}
if (!response.ok) {
const errorData = await response.json().catch(() => ({}));
throw new Error(errorData.detail || "Failed to fetch invites");
}
const data = await response.json();
setInvites(data);
setError(null);
return data;
} catch (err: any) {
setError(err.message || "Failed to fetch invites");
console.error("Error fetching invites:", err);
} finally {
setLoading(false);
}
}, [searchSpaceId]);
useEffect(() => {
fetchInvites();
}, [fetchInvites]);
const createInvite = useCallback(
async (inviteData: InviteCreate) => {
try {
const response = await fetch(
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/searchspaces/${searchSpaceId}/invites`,
{
headers: {
"Content-Type": "application/json",
Authorization: `Bearer ${localStorage.getItem("surfsense_bearer_token")}`,
},
method: "POST",
body: JSON.stringify(inviteData),
}
);
if (!response.ok) {
const errorData = await response.json().catch(() => ({}));
throw new Error(errorData.detail || "Failed to create invite");
}
const newInvite = await response.json();
setInvites((prev) => [...prev, newInvite]);
toast.success("Invite created successfully");
return newInvite;
} catch (err: any) {
toast.error(err.message || "Failed to create invite");
throw err;
}
},
[searchSpaceId]
);
const updateInvite = useCallback(
async (inviteId: number, inviteData: InviteUpdate) => {
try {
const response = await fetch(
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/searchspaces/${searchSpaceId}/invites/${inviteId}`,
{
headers: {
"Content-Type": "application/json",
Authorization: `Bearer ${localStorage.getItem("surfsense_bearer_token")}`,
},
method: "PUT",
body: JSON.stringify(inviteData),
}
);
if (!response.ok) {
const errorData = await response.json().catch(() => ({}));
throw new Error(errorData.detail || "Failed to update invite");
}
const updatedInvite = await response.json();
setInvites((prev) => prev.map((i) => (i.id === inviteId ? updatedInvite : i)));
toast.success("Invite updated successfully");
return updatedInvite;
} catch (err: any) {
toast.error(err.message || "Failed to update invite");
throw err;
}
},
[searchSpaceId]
);
const revokeInvite = useCallback(
async (inviteId: number) => {
try {
const response = await fetch(
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/searchspaces/${searchSpaceId}/invites/${inviteId}`,
{
headers: {
Authorization: `Bearer ${localStorage.getItem("surfsense_bearer_token")}`,
},
method: "DELETE",
}
);
if (!response.ok) {
const errorData = await response.json().catch(() => ({}));
throw new Error(errorData.detail || "Failed to revoke invite");
}
setInvites((prev) => prev.filter((i) => i.id !== inviteId));
toast.success("Invite revoked successfully");
return true;
} catch (err: any) {
toast.error(err.message || "Failed to revoke invite");
return false;
}
},
[searchSpaceId]
);
return {
invites,
loading,
error,
fetchInvites,
createInvite,
updateInvite,
revokeInvite,
};
}
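
A sketch of turning a freshly created invite into a shareable link. The `/invite/<code>` route is inferred from the accept page earlier in this diff, and the payload values are illustrative:

```tsx
import { useInvites } from "@/hooks/use-rbac";

function ShareInviteButton({ searchSpaceId }: { searchSpaceId: number }) {
	const { createInvite } = useInvites(searchSpaceId);
	const handleShare = async () => {
		// role_id, expires_at, and max_uses are all optional per InviteCreate.
		const invite = await createInvite({ name: "Team invite", max_uses: 10 });
		const url = `${window.location.origin}/invite/${invite.invite_code}`;
		await navigator.clipboard.writeText(url);
	};
	return <button type="button" onClick={handleShare}>Copy invite link</button>;
}
```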
// ============ Permissions Hook ============
export function usePermissions() {
const [permissions, setPermissions] = useState<PermissionInfo[]>([]);
const [loading, setLoading] = useState(true);
const [error, setError] = useState<string | null>(null);
const fetchPermissions = useCallback(async () => {
try {
setLoading(true);
const response = await fetch(
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/permissions`,
{
headers: {
Authorization: `Bearer ${localStorage.getItem("surfsense_bearer_token")}`,
},
method: "GET",
}
);
if (!response.ok) {
const errorData = await response.json().catch(() => ({}));
throw new Error(errorData.detail || "Failed to fetch permissions");
}
const data = await response.json();
setPermissions(data.permissions);
setError(null);
return data.permissions;
} catch (err: any) {
setError(err.message || "Failed to fetch permissions");
console.error("Error fetching permissions:", err);
} finally {
setLoading(false);
}
}, []);
useEffect(() => {
fetchPermissions();
}, [fetchPermissions]);
// Group permissions by category
const groupedPermissions = useMemo(() => {
const groups: Record<string, PermissionInfo[]> = {};
for (const perm of permissions) {
if (!groups[perm.category]) {
groups[perm.category] = [];
}
groups[perm.category].push(perm);
}
return groups;
}, [permissions]);
return {
permissions,
groupedPermissions,
loading,
error,
fetchPermissions,
};
}
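
`groupedPermissions` lends itself to a category-sectioned checklist; a minimal sketch:

```tsx
import { usePermissions } from "@/hooks/use-rbac";

function PermissionPicker({ onToggle }: { onToggle: (value: string) => void }) {
	const { groupedPermissions, loading } = usePermissions();
	if (loading) return null;
	return (
		<div>
			{Object.entries(groupedPermissions).map(([category, perms]) => (
				<fieldset key={category}>
					<legend>{category}</legend>
					{perms.map((p) => (
						<label key={p.value}>
							<input type="checkbox" value={p.value} onChange={() => onToggle(p.value)} />
							{p.name}
						</label>
					))}
				</fieldset>
			))}
		</div>
	);
}
```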
// ============ User Access Hook ============
export function useUserAccess(searchSpaceId: number) {
const [access, setAccess] = useState<UserAccess | null>(null);
const [loading, setLoading] = useState(true);
const [error, setError] = useState<string | null>(null);
const fetchAccess = useCallback(async () => {
if (!searchSpaceId) return;
try {
setLoading(true);
const response = await fetch(
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/searchspaces/${searchSpaceId}/my-access`,
{
headers: {
Authorization: `Bearer ${localStorage.getItem("surfsense_bearer_token")}`,
},
method: "GET",
}
);
if (response.status === 401) {
localStorage.removeItem("surfsense_bearer_token");
window.location.href = "/";
throw new Error("Unauthorized");
}
if (!response.ok) {
const errorData = await response.json().catch(() => ({}));
throw new Error(errorData.detail || "Failed to fetch access info");
}
const data = await response.json();
setAccess(data);
setError(null);
return data;
} catch (err: any) {
setError(err.message || "Failed to fetch access info");
console.error("Error fetching access:", err);
} finally {
setLoading(false);
}
}, [searchSpaceId]);
useEffect(() => {
fetchAccess();
}, [fetchAccess]);
// Helper function to check if user has a specific permission
const hasPermission = useCallback(
(permission: string) => {
if (!access) return false;
// Owner/full access check
if (access.permissions.includes("*")) return true;
return access.permissions.includes(permission);
},
[access]
);
// Helper function to check if user has any of the given permissions
const hasAnyPermission = useCallback(
(permissions: string[]) => {
if (!access) return false;
if (access.permissions.includes("*")) return true;
return permissions.some((p) => access.permissions.includes(p));
},
[access]
);
return {
access,
loading,
error,
fetchAccess,
hasPermission,
hasAnyPermission,
};
}
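
A sketch of gating UI on a single permission (the `"documents:delete"` string is hypothetical; actual values come from the permissions endpoint):

```tsx
import { useUserAccess } from "@/hooks/use-rbac";

function DeleteDocumentButton({
	searchSpaceId,
	onDelete,
}: {
	searchSpaceId: number;
	onDelete: () => void;
}) {
	const { hasPermission, loading } = useUserAccess(searchSpaceId);
	// Owners pass automatically because their permission list contains "*".
	if (loading || !hasPermission("documents:delete")) return null;
	return <button type="button" onClick={onDelete}>Delete</button>;
}
```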
// ============ Invite Info Hook (Public) ============
export function useInviteInfo(inviteCode: string | null) {
const [inviteInfo, setInviteInfo] = useState<InviteInfo | null>(null);
const [loading, setLoading] = useState(true);
const [error, setError] = useState<string | null>(null);
const fetchInviteInfo = useCallback(async () => {
if (!inviteCode) {
setLoading(false);
return;
}
try {
setLoading(true);
const response = await fetch(
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/invites/${inviteCode}/info`,
{
method: "GET",
}
);
if (!response.ok) {
const errorData = await response.json().catch(() => ({}));
throw new Error(errorData.detail || "Failed to fetch invite info");
}
const data = await response.json();
setInviteInfo(data);
setError(null);
return data;
} catch (err: any) {
setError(err.message || "Failed to fetch invite info");
console.error("Error fetching invite info:", err);
} finally {
setLoading(false);
}
}, [inviteCode]);
useEffect(() => {
fetchInviteInfo();
}, [fetchInviteInfo]);
const acceptInvite = useCallback(async () => {
if (!inviteCode) {
toast.error("No invite code provided");
return null;
}
try {
const response = await fetch(
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/invites/accept`,
{
headers: {
"Content-Type": "application/json",
Authorization: `Bearer ${localStorage.getItem("surfsense_bearer_token")}`,
},
method: "POST",
body: JSON.stringify({ invite_code: inviteCode }),
}
);
if (!response.ok) {
const errorData = await response.json().catch(() => ({}));
throw new Error(errorData.detail || "Failed to accept invite");
}
const data = await response.json();
toast.success(data.message || "Successfully joined the search space");
return data;
} catch (err: any) {
toast.error(err.message || "Failed to accept invite");
throw err;
}
}, [inviteCode]);
return {
inviteInfo,
loading,
error,
fetchInviteInfo,
acceptInvite,
};
}


@ -10,6 +10,8 @@ interface SearchSpace {
created_at: string;
citations_enabled: boolean;
qna_custom_instructions: string | null;
member_count: number;
is_owner: boolean;
}
export function useSearchSpaces() {


@ -18,6 +18,7 @@ export const getConnectorTypeDisplay = (type: string): string => {
AIRTABLE_CONNECTOR: "Airtable",
LUMA_CONNECTOR: "Luma",
ELASTICSEARCH_CONNECTOR: "Elasticsearch",
WEBCRAWLER_CONNECTOR: "Web Pages",
};
return typeMap[type] || type;
};


@ -103,6 +103,7 @@
"surfsense_dashboard": "SurfSense Dashboard",
"welcome_message": "Welcome to your SurfSense dashboard.",
"your_search_spaces": "Your Search Spaces",
"shared": "Shared",
"create_search_space": "Create Search Space",
"add_new_search_space": "Add New Search Space",
"loading": "Loading",
@ -149,7 +150,8 @@
"podcasts": "Podcasts",
"logs": "Logs",
"all_search_spaces": "All Search Spaces",
"chat": "Chat"
"chat": "Chat",
"team": "Team"
},
"pricing": {
"title": "SurfSense Pricing",
@ -304,11 +306,14 @@
"add_connector": {
"title": "Connect Your Tools",
"subtitle": "Integrate with your favorite services to enhance your research capabilities.",
"search_engines": "Search Engines",
"team_chats": "Team Chats",
"web_search": "Web Search",
"messaging": "Messaging",
"project_management": "Project Management",
"knowledge_bases": "Knowledge Bases",
"communication": "Communication",
"documentation": "Documentation",
"development": "Development",
"databases": "Databases",
"productivity": "Productivity",
"web_crawling": "Web Crawling",
"connect": "Connect",
"coming_soon": "Coming Soon",
"connected": "Connected",
@ -328,10 +333,11 @@
"github_desc": "Connect a GitHub PAT to index code and docs from accessible repositories.",
"confluence_desc": "Connect to Confluence to search pages, comments and documentation.",
"airtable_desc": "Connect to Airtable to search records, tables and database content.",
"luma_desc": "Connect to Luma to search events",
"luma_desc": "Connect to Luma to search events, meetups and gatherings.",
"calendar_desc": "Connect to Google Calendar to search events, meetings and schedules.",
"gmail_desc": "Connect to your Gmail account to search through your emails.",
"zoom_desc": "Connect to Zoom to access meeting recordings and transcripts."
"zoom_desc": "Connect to Zoom to access meeting recordings and transcripts.",
"webcrawler_desc": "Crawl and index content from any public web pages."
},
"upload_documents": {
"title": "Upload Documents",


@ -103,6 +103,7 @@
"surfsense_dashboard": "SurfSense 仪表盘",
"welcome_message": "欢迎来到您的 SurfSense 仪表盘。",
"your_search_spaces": "您的搜索空间",
"shared": "共享",
"create_search_space": "创建搜索空间",
"add_new_search_space": "添加新的搜索空间",
"loading": "加载中",
@ -149,7 +150,8 @@
"podcasts": "播客",
"logs": "日志",
"all_search_spaces": "所有搜索空间",
"chat": "聊天"
"chat": "聊天",
"team": "团队"
},
"pricing": {
"title": "SurfSense 定价",
@ -304,11 +306,14 @@
"add_connector": {
"title": "连接您的工具",
"subtitle": "集成您喜欢的服务以增强研究能力。",
"search_engines": "搜索引擎",
"team_chats": "团队聊天",
"web_search": "网络搜索",
"messaging": "即时通讯",
"project_management": "项目管理",
"knowledge_bases": "知识库",
"communication": "通讯",
"documentation": "文档协作",
"development": "开发工具",
"databases": "数据库",
"productivity": "效率工具",
"web_crawling": "网页爬取",
"connect": "连接",
"coming_soon": "即将推出",
"connected": "已连接",
@ -328,10 +333,11 @@
"github_desc": "连接 GitHub PAT 以索引可访问存储库的代码和文档。",
"confluence_desc": "连接到 Confluence 以搜索页面、评论和文档。",
"airtable_desc": "连接到 Airtable 以搜索记录、表格和数据库内容。",
"luma_desc": "连接到 Luma 以搜索活动",
"luma_desc": "连接到 Luma 以搜索活动、聚会和集会。",
"calendar_desc": "连接到 Google 日历以搜索活动、会议和日程。",
"gmail_desc": "连接到您的 Gmail 账户以搜索您的电子邮件。",
"zoom_desc": "连接到 Zoom 以访问会议录制和转录。"
"zoom_desc": "连接到 Zoom 以访问会议录制和转录。",
"webcrawler_desc": "爬取和索引任何公开网页的内容。"
},
"upload_documents": {
"title": "上传文档",