Merge pull request #707 from AnishSarkar22/fix/mentions

feat: Revamped search in document mentions
2026-07-04 22:02:16 +02:00 · 2026-01-18 22:15:28 -08:00 · 2026-01-18 22:15:28 -08:00 · 26ef83fbaa
commit 26ef83fbaa
parent 87a174a1fd b158ddd083
11 changed files with 560 additions and 168 deletions
--- a/surfsense_backend/alembic/versions/67_add_pg_trgm_index_for_document_title_search.py
+++ b/surfsense_backend/alembic/versions/67_add_pg_trgm_index_for_document_title_search.py
@ -0,0 +1,76 @@
+"""Add pg_trgm indexes for efficient document title search
+
+Revision ID: 67
+Revises: 66
+
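+Adds the pg_trgm extension and GIN trigram indexes on the title columns of
+documents and surfsense_docs_documents to enable efficient ILIKE searches with
+leading wildcards (e.g., '%search_term%'), plus B-tree indexes on documents
+that support filtering by search space and listing recently updated documents.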
+
+Indexes added:
+1. idx_documents_title_trgm - GIN trigram on title for ILIKE '%term%'
+2. idx_documents_search_space_id - B-tree on search_space_id for filtering
+3. idx_documents_search_space_updated - Composite for recent docs query (covering index)
+4. idx_surfsense_docs_title_trgm - GIN trigram on surfsense docs title
+
+"""
+
+from collections.abc import Sequence
+
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision: str = "67"
+down_revision: str | None = "66"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+def upgrade() -> None:
+    """Install pg_trgm and create the indexes that back document title search."""
+
+    # Statements run in the order listed. The extension comes first because
+    # the trigram indexes below use its gin_trgm_ops operator class.
+    statements = (
+        # Trigram-based text similarity functions and operators.
+        "CREATE EXTENSION IF NOT EXISTS pg_trgm;",
+        # 1. GIN trigram index on documents.title for ILIKE '%term%' searches.
+        """
+        CREATE INDEX IF NOT EXISTS idx_documents_title_trgm 
+        ON documents USING gin (title gin_trgm_ops);
+        """,
+        # 2. B-tree index on search_space_id; every title query filters on it.
+        """
+        CREATE INDEX IF NOT EXISTS idx_documents_search_space_id 
+        ON documents (search_space_id);
+        """,
+        # 3. Index for the "recent documents" query (no search term). The
+        #    INCLUDE columns are the ones that query selects, which is meant
+        #    to allow an index-only scan.
+        """
+        CREATE INDEX IF NOT EXISTS idx_documents_search_space_updated 
+        ON documents (search_space_id, updated_at DESC NULLS LAST)
+        INCLUDE (id, title, document_type);
+        """,
+        # 4. GIN trigram index on surfsense_docs_documents.title.
+        """
+        CREATE INDEX IF NOT EXISTS idx_surfsense_docs_title_trgm 
+        ON surfsense_docs_documents USING gin (title gin_trgm_ops);
+        """,
+    )
+    for statement in statements:
+        op.execute(statement)
+
+
+def downgrade() -> None:
+    """Drop the search indexes created by upgrade(); pg_trgm stays installed."""
+    # Listed in reverse order of creation.
+    index_names = (
+        "idx_surfsense_docs_title_trgm",
+        "idx_documents_search_space_updated",
+        "idx_documents_search_space_id",
+        "idx_documents_title_trgm",
+    )
+    for index_name in index_names:
+        op.execute(f"DROP INDEX IF EXISTS {index_name};")
--- a/surfsense_backend/app/db.py
+++ b/surfsense_backend/app/db.py
@ -1007,11 +1007,36 @@ async def setup_indexes():
                "CREATE INDEX IF NOT EXISTS chucks_search_index ON chunks USING gin (to_tsvector('english', content))"
            )
        )
+        # pg_trgm indexes for efficient ILIKE '%term%' searches on titles
+        # Critical for document mention picker (@mentions) to scale
+        await conn.execute(
+            text(
+                "CREATE INDEX IF NOT EXISTS idx_documents_title_trgm ON documents USING gin (title gin_trgm_ops)"
+            )
+        )
+        # B-tree index on search_space_id for fast filtering
+        await conn.execute(
+            text(
+                "CREATE INDEX IF NOT EXISTS idx_documents_search_space_id ON documents (search_space_id)"
+            )
+        )
+        # Covering index for "recent documents" query - enables index-only scan
+        await conn.execute(
+            text(
+                "CREATE INDEX IF NOT EXISTS idx_documents_search_space_updated ON documents (search_space_id, updated_at DESC NULLS LAST) INCLUDE (id, title, document_type)"
+            )
+        )
+        await conn.execute(
+            text(
+                "CREATE INDEX IF NOT EXISTS idx_surfsense_docs_title_trgm ON surfsense_docs_documents USING gin (title gin_trgm_ops)"
+            )
+        )


 async def create_db_and_tables():
    async with engine.begin() as conn:
        await conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))
+        await conn.execute(text("CREATE EXTENSION IF NOT EXISTS pg_trgm"))
        await conn.run_sync(Base.metadata.create_all)
    await setup_indexes()

--- a/surfsense_backend/app/routes/documents_routes.py
+++ b/surfsense_backend/app/routes/documents_routes.py
@ -19,6 +19,8 @@ from app.db import (
 from app.schemas import (
    DocumentRead,
    DocumentsCreate,
+    DocumentTitleRead,
+    DocumentTitleSearchResponse,
    DocumentUpdate,
    DocumentWithChunksRead,
    PaginatedResponse,
@ -429,6 +431,112 @@ async def search_documents(
        ) from e


+@router.get("/documents/search/titles", response_model=DocumentTitleSearchResponse)
+async def search_document_titles(
+    search_space_id: int,
+    title: str = "",
+    page: int = 0,
+    page_size: int = 20,
+    session: AsyncSession = Depends(get_async_session),
+    user: User = Depends(current_active_user),
+):
+    """
+    Lightweight document title search optimized for mention picker (@mentions).
+
+    Returns only id, title, and document_type - no content or metadata.
+    Uses pg_trgm fuzzy search with similarity scoring for typo tolerance,
+    combined with a literal case-insensitive substring match.
+    Results are ordered by relevance using trigram similarity scores.
+
+    Args:
+        search_space_id: The search space to search in. Required.
+        title: Search query (case-insensitive). If empty or < 2 chars after
+            trimming whitespace, returns recent documents. The characters
+            '%', '_' and '\\' are matched literally, not as wildcards.
+        page: Zero-based page index. Must be >= 0. Default: 0.
+        page_size: Number of items per page. Must be >= 1; values above 100
+            are reduced to 100. Default: 20.
+        session: Database session (injected).
+        user: Current authenticated user (injected).
+
+    Returns:
+        DocumentTitleSearchResponse: Lightweight list with has_more flag (no total count).
+
+    Raises:
+        HTTPException: 400 for invalid pagination values, whatever
+            check_permission raises when access is denied, and 500 for any
+            other failure while querying.
+    """
+    from sqlalchemy import desc, func, or_
+
+    # Reject pagination values that would otherwise reach the database as a
+    # negative OFFSET/LIMIT and come back as an opaque 500.
+    if page < 0 or page_size < 1:
+        raise HTTPException(
+            status_code=400,
+            detail="page must be >= 0 and page_size must be >= 1",
+        )
+
+    # Bound the page size so one typeahead request cannot pull an arbitrarily
+    # large result set. The clamped value is used for both OFFSET and LIMIT,
+    # so paging stays self-consistent.
+    max_page_size = 100
+    page_size = min(page_size, max_page_size)
+
+    try:
+        # Check permission for the search space
+        await check_permission(
+            session,
+            user,
+            search_space_id,
+            Permission.DOCUMENTS_READ.value,
+            "You don't have permission to read documents in this search space",
+        )
+
+        # Base query - only select lightweight fields
+        query = select(
+            Document.id,
+            Document.title,
+            Document.document_type,
+        ).filter(Document.search_space_id == search_space_id)
+
+        search_term = title.strip()
+
+        if len(search_term) < 2:
+            # Query is too short to search: return recent documents.
+            # Document.id is a unique tiebreaker so OFFSET pagination is
+            # stable when several documents share the same updated_at.
+            query = query.order_by(
+                Document.updated_at.desc().nullslast(),
+                Document.id,
+            )
+        else:
+            # Similarity threshold for fuzzy matching (0.3 = ~30% trigram overlap)
+            # Lower values = more fuzzy, higher values = stricter matching
+            similarity_threshold = 0.3
+            similarity = func.similarity(Document.title, search_term)
+
+            # Escape LIKE metacharacters so user input such as "100%" or
+            # "my_file" is matched literally instead of acting as a wildcard.
+            # The backslash must be escaped first.
+            like_literal = (
+                search_term.replace("\\", "\\\\")
+                .replace("%", "\\%")
+                .replace("_", "\\_")
+            )
+
+            # Match documents that either:
+            # 1. Have high trigram similarity (fuzzy match - handles typos)
+            # 2. Contain the exact substring (ILIKE - handles partial matches)
+            # NOTE(review): per the pg_trgm docs, index support applies to the
+            # `%` operator and LIKE/ILIKE, not to a similarity() comparison -
+            # confirm with EXPLAIN whether the fuzzy arm uses the GIN index.
+            query = query.filter(
+                or_(
+                    similarity > similarity_threshold,
+                    Document.title.ilike(f"%{like_literal}%", escape="\\"),
+                )
+            )
+
+            # Order by similarity score (descending) for best relevance ranking,
+            # then alphabetically, then by id so page boundaries are stable.
+            query = query.order_by(desc(similarity), Document.title, Document.id)
+
+        # Fetch page_size + 1 to determine has_more without COUNT query
+        offset = page * page_size
+        result = await session.execute(query.offset(offset).limit(page_size + 1))
+        rows = result.all()
+
+        # Check if there are more results
+        has_more = len(rows) > page_size
+        items = rows[:page_size]  # Only return requested page_size
+
+        # Convert to response format
+        api_documents = [
+            DocumentTitleRead(
+                id=row.id,
+                title=row.title,
+                document_type=row.document_type,
+            )
+            for row in items
+        ]
+
+        return DocumentTitleSearchResponse(
+            items=api_documents,
+            has_more=has_more,
+        )
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(
+            status_code=500, detail=f"Failed to search document titles: {e!s}"
+        ) from e
+
+
@router.get("/documents/type-counts")
 async def get_document_type_counts(
    search_space_id: int | None = None,
--- a/surfsense_backend/app/schemas/init.py
+++ b/surfsense_backend/app/schemas/init.py
@ -4,6 +4,8 @@ from .documents import (
    DocumentBase,
    DocumentRead,
    DocumentsCreate,
+    DocumentTitleRead,
+    DocumentTitleSearchResponse,
    DocumentUpdate,
    DocumentWithChunksRead,
    ExtensionDocumentContent,
@ -85,6 +87,8 @@ __all__ = [
    # Document schemas
    "DocumentBase",
    "DocumentRead",
+    "DocumentTitleRead",
+    "DocumentTitleSearchResponse",
    "DocumentUpdate",
    "DocumentWithChunksRead",
    "DocumentsCreate",
--- a/surfsense_backend/app/schemas/documents.py
+++ b/surfsense_backend/app/schemas/documents.py
@ -67,3 +67,20 @@ class PaginatedResponse[T](BaseModel):
    page: int
    page_size: int
    has_more: bool
+
+
+class DocumentTitleRead(BaseModel):
+    """Lightweight document response for mention picker - only essential fields."""
+
+    # Only these three fields are exposed; no content or metadata.
+    id: int
+    title: str
+    document_type: DocumentType
+
+    # from_attributes allows validating this model from objects that expose
+    # these names as attributes, not only from dicts.
+    model_config = ConfigDict(from_attributes=True)
+
+
+class DocumentTitleSearchResponse(BaseModel):
+    """Response for document title search - optimized for typeahead."""
+
+    # Matches for the requested page.
+    items: list[DocumentTitleRead]
+    # Whether more results exist beyond this page; no total count is carried.
+    has_more: bool