diff --git a/surfsense_backend/alembic/versions/67_add_pg_trgm_index_for_document_title_search.py b/surfsense_backend/alembic/versions/67_add_pg_trgm_index_for_document_title_search.py
new file mode 100644
index 000000000..85d34c4f2
--- /dev/null
+++ b/surfsense_backend/alembic/versions/67_add_pg_trgm_index_for_document_title_search.py
@@ -0,0 +1,76 @@
+"""Add pg_trgm indexes for efficient document title search
+
+Revision ID: 67
+Revises: 66
+
+Adds the pg_trgm extension and GIN trigram indexes on documents.title
+to enable efficient ILIKE searches with leading wildcards (e.g., '%search_term%').
+
+Indexes added:
+1. idx_documents_title_trgm - GIN trigram on title for ILIKE '%term%'
+2. idx_documents_search_space_id - B-tree on search_space_id for filtering
+3. idx_documents_search_space_updated - Composite for recent docs query (covering index)
+4. idx_surfsense_docs_title_trgm - GIN trigram on surfsense docs title
+
+"""
+
+from collections.abc import Sequence
+
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision: str = "67"
+down_revision: str | None = "66"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+def upgrade() -> None:
+    """Create the pg_trgm extension and the document search indexes.
+
+    Every statement uses IF NOT EXISTS, so objects that already exist are
+    left untouched.
+    """
+    statements = (
+        # The extension provides the gin_trgm_ops operator class used below.
+        "CREATE EXTENSION IF NOT EXISTS pg_trgm;",
+        # 1. GIN trigram index on documents.title for ILIKE '%term%' searches
+        """
+        CREATE INDEX IF NOT EXISTS idx_documents_title_trgm
+        ON documents USING gin (title gin_trgm_ops);
+        """,
+        # 2. B-tree index on search_space_id for fast filtering
+        #    (every query filters by search_space_id first)
+        """
+        CREATE INDEX IF NOT EXISTS idx_documents_search_space_id
+        ON documents (search_space_id);
+        """,
+        # 3. Covering index for the "recent documents" query (no search term);
+        #    INCLUDE carries id, title, document_type for index-only scans
+        """
+        CREATE INDEX IF NOT EXISTS idx_documents_search_space_updated
+        ON documents (search_space_id, updated_at DESC NULLS LAST)
+        INCLUDE (id, title, document_type);
+        """,
+        # 4. GIN trigram index on surfsense_docs_documents.title
+        """
+        CREATE INDEX IF NOT EXISTS idx_surfsense_docs_title_trgm
+        ON surfsense_docs_documents USING gin (title gin_trgm_ops);
+        """,
+    )
+    for statement in statements:
+        op.execute(statement)
+
+
+def downgrade() -> None:
+    """Drop the search indexes in reverse order of creation.
+
+    The pg_trgm extension is left in place.
+    """
+    for index_name in (
+        "idx_surfsense_docs_title_trgm",
+        "idx_documents_search_space_updated",
+        "idx_documents_search_space_id",
+        "idx_documents_title_trgm",
+    ):
+        op.execute(f"DROP INDEX IF EXISTS {index_name};")
diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py
index 9b245ba44..ee0d0724d 100644
--- a/surfsense_backend/app/db.py
+++ b/surfsense_backend/app/db.py
@@ -1007,11 +1007,36 @@ async def setup_indexes():
                 "CREATE INDEX IF NOT EXISTS chucks_search_index ON chunks USING gin (to_tsvector('english', content))"
             )
         )
+        # pg_trgm indexes for efficient ILIKE '%term%' searches on titles
+        # Critical for document mention picker (@mentions) to scale
+        await conn.execute(
+            text(
+                "CREATE INDEX IF NOT EXISTS idx_documents_title_trgm ON documents USING gin (title gin_trgm_ops)"
+            )
+        )
+        # B-tree index on search_space_id for fast filtering
+        await conn.execute(
+            text(
+                "CREATE INDEX IF NOT EXISTS idx_documents_search_space_id ON documents (search_space_id)"
+            )
+        )
+        # Covering index for "recent documents" query - enables index-only scan
+        await conn.execute(
+            text(
+                "CREATE INDEX IF NOT EXISTS idx_documents_search_space_updated ON documents (search_space_id, updated_at DESC NULLS LAST) INCLUDE (id, title, document_type)"
+            )
+        )
+        await conn.execute(
+            text(
+                "CREATE INDEX IF NOT EXISTS idx_surfsense_docs_title_trgm ON surfsense_docs_documents USING gin (title gin_trgm_ops)"
+            )
+        )
 
 
 async def create_db_and_tables():
     async with engine.begin() as conn:
         await conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))
+        await conn.execute(text("CREATE EXTENSION IF NOT EXISTS pg_trgm"))
         await conn.run_sync(Base.metadata.create_all)
     await setup_indexes()
 
diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py
index d06217fc4..be90df459 100644
--- a/surfsense_backend/app/routes/documents_routes.py
+++ b/surfsense_backend/app/routes/documents_routes.py
@@ -19,6 +19,8 @@ from app.db import (
 from app.schemas import (
     DocumentRead,
     DocumentsCreate,
+    DocumentTitleRead,
+    DocumentTitleSearchResponse,
     DocumentUpdate,
     DocumentWithChunksRead,
     PaginatedResponse,
@@ -429,6 +431,136 @@ async def search_documents(
         ) from e
 
 
+@router.get("/documents/search/titles", response_model=DocumentTitleSearchResponse)
+async def search_document_titles(
+    search_space_id: int,
+    title: str = "",
+    page: int = 0,
+    page_size: int = 20,
+    session: AsyncSession = Depends(get_async_session),
+    user: User = Depends(current_active_user),
+):
+    """
+    Lightweight document title search optimized for mention picker (@mentions).
+
+    Returns only id, title, and document_type - no content or metadata.
+    Matches titles by pg_trgm similarity (typo tolerance) or by a literal,
+    case-insensitive substring; results are ordered by similarity score.
+
+    Args:
+        search_space_id: The search space to search in. Required.
+        title: Search query (case-insensitive). If empty or < 2 chars, returns recent documents.
+        page: Zero-based page index. Must be >= 0. Default: 0.
+        page_size: Number of items per page. Must be >= 1; values above 100
+            are capped at 100. Default: 20.
+        session: Database session (injected).
+        user: Current authenticated user (injected).
+
+    Returns:
+        DocumentTitleSearchResponse: Lightweight list with has_more flag (no total count).
+
+    Raises:
+        HTTPException: 422 if page < 0 or page_size < 1; 500 on any unexpected
+            error. HTTPExceptions raised inside the handler pass through as-is.
+    """
+    from sqlalchemy import desc, func, or_
+
+    # Upper bound on page_size so one request cannot fetch unbounded rows
+    max_page_size = 100
+
+    try:
+        # Validate paging before touching the database: negative values would
+        # otherwise become a negative OFFSET/LIMIT and surface as a generic 500
+        if page < 0 or page_size < 1:
+            raise HTTPException(
+                status_code=422,
+                detail="page must be >= 0 and page_size must be >= 1",
+            )
+        page_size = min(page_size, max_page_size)
+
+        # Check permission for the search space
+        await check_permission(
+            session,
+            user,
+            search_space_id,
+            Permission.DOCUMENTS_READ.value,
+            "You don't have permission to read documents in this search space",
+        )
+
+        # Base query - only select lightweight fields
+        query = select(
+            Document.id,
+            Document.title,
+            Document.document_type,
+        ).filter(Document.search_space_id == search_space_id)
+
+        search_term = title.strip()
+
+        # If query is too short, return recent documents ordered by updated_at
+        if len(search_term) < 2:
+            query = query.order_by(Document.updated_at.desc().nullslast())
+        else:
+            # Similarity threshold for fuzzy matching (0.3 = ~30% trigram overlap)
+            # Lower values = more fuzzy, higher values = stricter matching
+            similarity_threshold = 0.3
+            title_similarity = func.similarity(Document.title, search_term)
+
+            # Escape LIKE metacharacters so a "%" or "_" typed by the user is
+            # matched literally instead of acting as a wildcard
+            escaped_term = (
+                search_term.replace("/", "//").replace("%", "/%").replace("_", "/_")
+            )
+
+            # Match documents that either:
+            # 1. Have high trigram similarity (fuzzy match - handles typos)
+            # 2. Contain the exact substring (ILIKE - handles partial matches)
+            query = query.filter(
+                or_(
+                    title_similarity > similarity_threshold,
+                    Document.title.ilike(f"%{escaped_term}%", escape="/"),
+                )
+            )
+
+            # Order by similarity score (descending) for best relevance ranking
+            # Higher similarity = better match = appears first
+            query = query.order_by(
+                desc(title_similarity),
+                Document.title,  # Alphabetical tiebreaker
+                Document.id,  # Keeps OFFSET paging stable for identical titles
+            )
+
+        # Fetch page_size + 1 to determine has_more without COUNT query
+        offset = page * page_size
+        result = await session.execute(query.offset(offset).limit(page_size + 1))
+        rows = result.all()
+
+        # Check if there are more results
+        has_more = len(rows) > page_size
+        items = rows[:page_size]  # Only return requested page_size
+
+        # Convert to response format
+        api_documents = [
+            DocumentTitleRead(
+                id=row.id,
+                title=row.title,
+                document_type=row.document_type,
+            )
+            for row in items
+        ]
+
+        return DocumentTitleSearchResponse(
+            items=api_documents,
+            has_more=has_more,
+        )
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(
+            status_code=500, detail=f"Failed to search document titles: {e!s}"
+        ) from e
+
+
 @router.get("/documents/type-counts")
 async def get_document_type_counts(
     search_space_id: int | None = None,
diff --git a/surfsense_backend/app/schemas/__init__.py b/surfsense_backend/app/schemas/__init__.py
index 017c78577..6c9577c46 100644
--- a/surfsense_backend/app/schemas/__init__.py
+++ b/surfsense_backend/app/schemas/__init__.py
@@ -4,6 +4,8 @@ from .documents import (
     DocumentBase,
     DocumentRead,
     DocumentsCreate,
+    DocumentTitleRead,
+    DocumentTitleSearchResponse,
     DocumentUpdate,
     DocumentWithChunksRead,
     ExtensionDocumentContent,
@@ -85,6 +87,8 @@ __all__ = [
     # Document schemas
     "DocumentBase",
     "DocumentRead",
+    "DocumentTitleRead",
+    "DocumentTitleSearchResponse",
     "DocumentUpdate",
     "DocumentWithChunksRead",
     "DocumentsCreate",
diff --git a/surfsense_backend/app/schemas/documents.py b/surfsense_backend/app/schemas/documents.py
index e1e8b9248..2b4bda0ca 100644
--- a/surfsense_backend/app/schemas/documents.py
+++ b/surfsense_backend/app/schemas/documents.py
@@ -67,3 +67,20 @@ class PaginatedResponse[T](BaseModel):
     page: int
     page_size: int
     has_more: bool
+
+
+class DocumentTitleRead(BaseModel):
+    """Lightweight document response for mention picker - only essential fields."""
+
+    id: int
+    title: str
+    document_type: DocumentType
+
+    model_config = ConfigDict(from_attributes=True)
+
+
+class DocumentTitleSearchResponse(BaseModel):
+    """Response for document title search - optimized for typeahead."""
+
+    items: list[DocumentTitleRead]
+    has_more: bool
diff --git a/surfsense_web/components/assistant-ui/inline-mention-editor.tsx b/surfsense_web/components/assistant-ui/inline-mention-editor.tsx
index f35019216..570440f6a 100644
--- a/surfsense_web/components/assistant-ui/inline-mention-editor.tsx
+++ 
b/surfsense_web/components/assistant-ui/inline-mention-editor.tsx @@ -12,6 +12,7 @@ import { } from "react"; import ReactDOMServer from "react-dom/server"; import type { Document } from "@/contracts/types/document.types"; +import { getConnectorIcon } from "@/contracts/enums/connectorIcons"; import { cn } from "@/lib/utils"; export interface MentionedDocument { @@ -166,12 +167,19 @@ export const InlineMentionEditor = forwardRef { }; const Composer: FC = () => { - // ---- State for document mentions (using atoms to persist across remounts) ---- + // Document mention state (atoms persist across component remounts) const [mentionedDocuments, setMentionedDocuments] = useAtom(mentionedDocumentsAtom); const [showDocumentPopover, setShowDocumentPopover] = useState(false); const [mentionQuery, setMentionQuery] = useState(""); @@ -212,16 +212,12 @@ const Composer: FC = () => { const composerRuntime = useComposerRuntime(); const hasAutoFocusedRef = useRef(false); - // Check if thread is empty (new chat) const isThreadEmpty = useAssistantState(({ thread }) => thread.isEmpty); - - // Check if thread is currently running (streaming response) const isThreadRunning = useAssistantState(({ thread }) => thread.isRunning); - // Auto-focus editor when on new chat page + // Auto-focus editor on new chat page after mount useEffect(() => { if (isThreadEmpty && !hasAutoFocusedRef.current && editorRef.current) { - // Small delay to ensure the editor is fully mounted const timeoutId = setTimeout(() => { editorRef.current?.focus(); hasAutoFocusedRef.current = true; @@ -230,7 +226,7 @@ const Composer: FC = () => { } }, [isThreadEmpty]); - // Sync mentioned document IDs to atom for use in chat request + // Sync mentioned document IDs to atom for inclusion in chat request payload useEffect(() => { setMentionedDocumentIds({ surfsense_doc_ids: mentionedDocuments @@ -242,7 +238,7 @@ const Composer: FC = () => { }); }, [mentionedDocuments, setMentionedDocumentIds]); - // Handle text change from 
inline editor - sync with assistant-ui composer + // Sync editor text with assistant-ui composer runtime const handleEditorChange = useCallback( (text: string) => { composerRuntime.setText(text); @@ -250,13 +246,13 @@ const Composer: FC = () => { [composerRuntime] ); - // Handle @ mention trigger from inline editor + // Open document picker when @ mention is triggered const handleMentionTrigger = useCallback((query: string) => { setShowDocumentPopover(true); setMentionQuery(query); }, []); - // Handle mention close + // Close document picker and reset query const handleMentionClose = useCallback(() => { if (showDocumentPopover) { setShowDocumentPopover(false); @@ -264,7 +260,7 @@ const Composer: FC = () => { } }, [showDocumentPopover]); - // Handle keyboard navigation when popover is open + // Keyboard navigation for document picker (arrow keys, Enter, Escape) const handleKeyDown = useCallback( (e: React.KeyboardEvent) => { if (showDocumentPopover) { @@ -294,15 +290,13 @@ const Composer: FC = () => { [showDocumentPopover] ); - // Handle submit from inline editor (Enter key) + // Submit message (blocked during streaming or when document picker is open) const handleSubmit = useCallback(() => { - // Prevent sending while a response is still streaming if (isThreadRunning) { return; } if (!showDocumentPopover) { composerRuntime.send(); - // Clear the editor after sending editorRef.current?.clear(); setMentionedDocuments([]); setMentionedDocumentIds({ @@ -318,6 +312,7 @@ const Composer: FC = () => { setMentionedDocumentIds, ]); + // Remove document from mentions and sync IDs to atom const handleDocumentRemove = useCallback( (docId: number, docType?: string) => { setMentionedDocuments((prev) => { @@ -336,6 +331,7 @@ const Composer: FC = () => { [setMentionedDocuments, setMentionedDocumentIds] ); + // Add selected documents from picker, insert chips, and sync IDs to atom const handleDocumentsMention = useCallback( (documents: Pick[]) => { const existingKeys = new 
Set(mentionedDocuments.map((d) => `${d.document_type}:${d.id}`)); @@ -373,7 +369,7 @@ const Composer: FC = () => { - {/* -------- Inline Mention Editor -------- */} + {/* Inline editor with @mention support */}
{ />
- {/* -------- Document mention popover (rendered via portal) -------- */} + {/* Document picker popover (portal to body for proper z-index stacking) */} {showDocumentPopover && typeof document !== "undefined" && createPortal( - <> - {/* Backdrop */} -