"""Add pg_trgm indexes for efficient document title search

Revision ID: 67
Revises: 66

Enables the pg_trgm extension and creates the indexes that back the
document mention picker (@mentions), whose title lookups use ILIKE with a
leading wildcard (e.g. '%search_term%').

Indexes created:
1. idx_documents_title_trgm - GIN trigram on documents.title
2. idx_documents_search_space_id - B-tree on documents.search_space_id
3. idx_documents_search_space_updated - composite (search_space_id, updated_at)
   that also INCLUDEs id, title and document_type
4. idx_surfsense_docs_title_trgm - GIN trigram on surfsense_docs_documents.title
"""

from collections.abc import Sequence

from alembic import op

# revision identifiers, used by Alembic.
revision: str = "67"
down_revision: str | None = "66"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None


def upgrade() -> None:
    """Enable pg_trgm, then create the four document-search indexes.

    Every statement uses IF NOT EXISTS, so objects that are already present
    are left untouched. Statements run in the order listed below.
    """
    statements = (
        # The extension has to exist before any gin_trgm_ops index is built.
        "CREATE EXTENSION IF NOT EXISTS pg_trgm;",
        # 1. Trigram index serving ILIKE '%term%' lookups on documents.title.
        """
        CREATE INDEX IF NOT EXISTS idx_documents_title_trgm
        ON documents USING gin (title gin_trgm_ops);
        """,
        # 2. Plain B-tree for the search_space_id filter applied by every query.
        """
        CREATE INDEX IF NOT EXISTS idx_documents_search_space_id
        ON documents (search_space_id);
        """,
        # 3. "Recent documents" listing (no search term); the INCLUDE columns
        #    are the ones the picker selects, so an index-only scan is possible.
        """
        CREATE INDEX IF NOT EXISTS idx_documents_search_space_updated
        ON documents (search_space_id, updated_at DESC NULLS LAST)
        INCLUDE (id, title, document_type);
        """,
        # 4. Same trigram treatment for surfsense_docs_documents.title.
        """
        CREATE INDEX IF NOT EXISTS idx_surfsense_docs_title_trgm
        ON surfsense_docs_documents USING gin (title gin_trgm_ops);
        """,
    )

    for statement in statements:
        op.execute(statement)


def downgrade() -> None:
    """Drop the four indexes in reverse creation order.

    The pg_trgm extension itself is deliberately left installed.
    """
    for index_name in (
        "idx_surfsense_docs_title_trgm",
        "idx_documents_search_space_updated",
        "idx_documents_search_space_id",
        "idx_documents_title_trgm",
    ):
        op.execute(f"DROP INDEX IF EXISTS {index_name};")
# Escape character handed to ILIKE ... ESCAPE. A forward slash is used rather
# than a backslash so that no backslash ever has to be rendered into SQL text.
_LIKE_ESCAPE_CHAR = "/"


def _escape_like_pattern(term: str) -> str:
    """Return *term* with LIKE/ILIKE metacharacters escaped for literal matching."""
    # The escape character itself must be doubled first, otherwise the
    # escapes added for '%' and '_' would be escaped a second time.
    return (
        term.replace(_LIKE_ESCAPE_CHAR, _LIKE_ESCAPE_CHAR * 2)
        .replace("%", f"{_LIKE_ESCAPE_CHAR}%")
        .replace("_", f"{_LIKE_ESCAPE_CHAR}_")
    )


@router.get("/documents/search/titles", response_model=DocumentTitleSearchResponse)
async def search_document_titles(
    search_space_id: int,
    title: str = "",
    page: int = 0,
    page_size: int = 20,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """
    Lightweight document title search optimized for mention picker (@mentions).

    Returns only id, title, and document_type - no content or metadata.
    With a search term, results are ordered prefix matches first, then by title;
    without one, most recently updated first. Document id is the final sort key
    in both cases so that OFFSET pagination is deterministic.

    The search term is matched literally: '%', '_' and the escape character are
    escaped before being embedded in the ILIKE pattern.

    Args:
        search_space_id: The search space to search in. Required.
        title: Search query (case-insensitive). If empty or < 2 chars, returns recent documents.
        page: Zero-based page index. Must be >= 0. Default: 0.
        page_size: Number of items per page. Must be >= 1. Default: 20.
        session: Database session (injected).
        user: Current authenticated user (injected).

    Returns:
        DocumentTitleSearchResponse: Lightweight list with has_more flag (no total count).

    Raises:
        HTTPException: 400 when page or page_size is out of range; 500 when the
            query fails unexpectedly. An HTTPException raised while checking
            permissions or querying is propagated unchanged.
    """
    from sqlalchemy import case, literal

    # Reject values that would otherwise reach OFFSET/LIMIT and come back from
    # the database as an opaque 500.
    if page < 0:
        raise HTTPException(status_code=400, detail="page must be >= 0")
    if page_size < 1:
        raise HTTPException(status_code=400, detail="page_size must be >= 1")

    try:
        # Check permission for the search space
        await check_permission(
            session,
            user,
            search_space_id,
            Permission.DOCUMENTS_READ.value,
            "You don't have permission to read documents in this search space",
        )

        # Base query - only select lightweight fields
        query = select(
            Document.id,
            Document.title,
            Document.document_type,
        ).filter(Document.search_space_id == search_space_id)

        search_term = title.strip()

        # If query is too short, return recent documents ordered by updated_at
        if len(search_term) < 2:
            query = query.order_by(
                Document.updated_at.desc().nullslast(),
                Document.id.desc(),  # tie-breaker for rows sharing updated_at
            )
        else:
            # Escape user input so wildcards typed by the user match literally
            # instead of widening the search.
            escaped_term = _escape_like_pattern(search_term)

            # Apply title filter with ILIKE (uses pg_trgm index)
            query = query.filter(
                Document.title.ilike(f"%{escaped_term}%", escape=_LIKE_ESCAPE_CHAR)
            )

            # Order by relevance: prefix matches first, then alphabetical
            # CASE WHEN title ILIKE 'term%' THEN 0 ELSE 1 END
            prefix_priority = case(
                (
                    Document.title.ilike(
                        f"{escaped_term}%", escape=_LIKE_ESCAPE_CHAR
                    ),
                    literal(0),
                ),
                else_=literal(1),
            )
            # Document.id breaks ties between identically titled documents.
            query = query.order_by(prefix_priority, Document.title, Document.id)

        # Fetch page_size + 1 to determine has_more without COUNT query
        offset = page * page_size
        result = await session.execute(query.offset(offset).limit(page_size + 1))
        rows = result.all()

        # Check if there are more results
        has_more = len(rows) > page_size
        items = rows[:page_size]  # Only return requested page_size

        # Convert to response format
        api_documents = [
            DocumentTitleRead(
                id=row.id,
                title=row.title,
                document_type=row.document_type,
            )
            for row in items
        ]

        return DocumentTitleSearchResponse(
            items=api_documents,
            has_more=has_more,
        )

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Failed to search document titles: {e!s}"
        ) from e
@router.get("/documents/type-counts") async def get_document_type_counts( search_space_id: int | None = None, diff --git a/surfsense_backend/app/schemas/__init__.py b/surfsense_backend/app/schemas/__init__.py index 076ac5915..b9b371bc5 100644 --- a/surfsense_backend/app/schemas/__init__.py +++ b/surfsense_backend/app/schemas/__init__.py @@ -4,6 +4,8 @@ from .documents import ( DocumentBase, DocumentRead, DocumentsCreate, + DocumentTitleRead, + DocumentTitleSearchResponse, DocumentUpdate, DocumentWithChunksRead, ExtensionDocumentContent, @@ -85,6 +87,8 @@ __all__ = [ # Document schemas "DocumentBase", "DocumentRead", + "DocumentTitleRead", + "DocumentTitleSearchResponse", "DocumentUpdate", "DocumentWithChunksRead", "DocumentsCreate", diff --git a/surfsense_backend/app/schemas/documents.py b/surfsense_backend/app/schemas/documents.py index e1e8b9248..2b4bda0ca 100644 --- a/surfsense_backend/app/schemas/documents.py +++ b/surfsense_backend/app/schemas/documents.py @@ -67,3 +67,20 @@ class PaginatedResponse[T](BaseModel): page: int page_size: int has_more: bool + + +class DocumentTitleRead(BaseModel): + """Lightweight document response for mention picker - only essential fields.""" + + id: int + title: str + document_type: DocumentType + + model_config = ConfigDict(from_attributes=True) + + +class DocumentTitleSearchResponse(BaseModel): + """Response for document title search - optimized for typeahead.""" + + items: list[DocumentTitleRead] + has_more: bool diff --git a/surfsense_web/components/new-chat/document-mention-picker.tsx b/surfsense_web/components/new-chat/document-mention-picker.tsx index f7f948d41..66988adcc 100644 --- a/surfsense_web/components/new-chat/document-mention-picker.tsx +++ b/surfsense_web/components/new-chat/document-mention-picker.tsx @@ -1,6 +1,6 @@ "use client"; -import { useQuery } from "@tanstack/react-query"; +import { useQuery, useQueryClient } from "@tanstack/react-query"; import { FileText } from "lucide-react"; import { forwardRef, @@ 
-12,9 +12,8 @@ import { useState, } from "react"; import { getConnectorIcon } from "@/contracts/enums/connectorIcons"; -import type { Document, GetDocumentsResponse } from "@/contracts/types/document.types"; +import type { Document, SearchDocumentTitlesResponse } from "@/contracts/types/document.types"; import { documentsApiService } from "@/lib/apis/documents-api.service"; -import { cacheKeys } from "@/lib/query-client/cache-keys"; import { cn } from "@/lib/utils"; export interface DocumentMentionPickerRef { @@ -32,14 +31,45 @@ interface DocumentMentionPickerProps { } const PAGE_SIZE = 20; +const MIN_SEARCH_LENGTH = 2; +const THROTTLE_MS = 200; + +/** + * Throttle hook - fires immediately, then at most once per interval + * Better than debounce for typeahead: user sees results updating as they type + */ +function useThrottled(value: T, delay = THROTTLE_MS) { + const [throttled, setThrottled] = useState(value); + const lastExecuted = useRef(Date.now()); + const timeoutRef = useRef>(); -function useDebounced(value: T, delay = 300) { - const [debounced, setDebounced] = useState(value); useEffect(() => { - const t = setTimeout(() => setDebounced(value), delay); - return () => clearTimeout(t); + const now = Date.now(); + const elapsed = now - lastExecuted.current; + + if (elapsed >= delay) { + // Enough time has passed, update immediately + lastExecuted.current = now; + setThrottled(value); + } else { + // Schedule update for remaining time + if (timeoutRef.current) { + clearTimeout(timeoutRef.current); + } + timeoutRef.current = setTimeout(() => { + lastExecuted.current = Date.now(); + setThrottled(value); + }, delay - elapsed); + } + + return () => { + if (timeoutRef.current) { + clearTimeout(timeoutRef.current); + } + }; }, [value, delay]); - return debounced; + + return throttled; } export const DocumentMentionPicker = forwardRef< @@ -49,9 +79,11 @@ export const DocumentMentionPicker = forwardRef< { searchSpaceId, onSelectionChange, onDone, initialSelectedDocuments 
= [], externalSearch = "" }, ref ) { - // Use external search + const queryClient = useQueryClient(); + + // Use external search with throttle (not debounce) for responsive feel const search = externalSearch; - const debouncedSearch = useDebounced(search, 150); + const throttledSearch = useThrottled(search, THROTTLE_MS); const [highlightedIndex, setHighlightedIndex] = useState(0); const itemRefs = useRef>(new Map()); const scrollContainerRef = useRef(null); @@ -64,6 +96,38 @@ export const DocumentMentionPicker = forwardRef< const [hasMore, setHasMore] = useState(false); const [isLoadingMore, setIsLoadingMore] = useState(false); + // Check if search is long enough + const isSearchValid = throttledSearch.trim().length >= MIN_SEARCH_LENGTH; + const shouldSearch = throttledSearch.trim().length > 0; + + // Prefetch first page when picker mounts - results appear instantly + useEffect(() => { + if (!searchSpaceId) return; + + const prefetchParams = { + search_space_id: searchSpaceId, + page: 0, + page_size: PAGE_SIZE, + }; + + // Prefetch document titles (user docs) + queryClient.prefetchQuery({ + queryKey: ["document-titles", prefetchParams], + queryFn: () => documentsApiService.searchDocumentTitles({ queryParams: prefetchParams }), + staleTime: 60 * 1000, + }); + + // Prefetch SurfSense docs + queryClient.prefetchQuery({ + queryKey: ["surfsense-docs-mention", "", false], + queryFn: () => + documentsApiService.getSurfsenseDocs({ + queryParams: { page: 0, page_size: PAGE_SIZE }, + }), + staleTime: 3 * 60 * 1000, + }); + }, [searchSpaceId, queryClient]); + // Reset pagination when search or search space changes // biome-ignore lint/correctness/useExhaustiveDependencies: intentionally reset pagination when search/space changes useEffect(() => { @@ -71,59 +135,44 @@ export const DocumentMentionPicker = forwardRef< setCurrentPage(0); setHasMore(false); setHighlightedIndex(0); - }, [debouncedSearch, searchSpaceId]); + }, [throttledSearch, searchSpaceId]); - // Query params for 
initial fetch (page 0) - const fetchQueryParams = useMemo( + // Query params for lightweight title search + const titleSearchParams = useMemo( () => ({ search_space_id: searchSpaceId, page: 0, page_size: PAGE_SIZE, + ...(isSearchValid ? { title: throttledSearch.trim() } : {}), }), - [searchSpaceId] + [searchSpaceId, throttledSearch, isSearchValid] ); - const searchQueryParams = useMemo(() => { - return { - search_space_id: searchSpaceId, - page: 0, - page_size: PAGE_SIZE, - title: debouncedSearch, - }; - }, [debouncedSearch, searchSpaceId]); - const surfsenseDocsQueryParams = useMemo(() => { const params: { page: number; page_size: number; title?: string } = { page: 0, page_size: PAGE_SIZE, }; - if (debouncedSearch.trim()) { - params.title = debouncedSearch; + if (isSearchValid) { + params.title = throttledSearch.trim(); } return params; - }, [debouncedSearch]); + }, [throttledSearch, isSearchValid]); - // Use query for fetching first page of documents - const { data: documents, isLoading: isDocumentsLoading } = useQuery({ - queryKey: cacheKeys.documents.withQueryParams(fetchQueryParams), - queryFn: () => documentsApiService.getDocuments({ queryParams: fetchQueryParams }), - staleTime: 3 * 60 * 1000, - enabled: !!searchSpaceId && !debouncedSearch.trim() && currentPage === 0, - }); - - // Searching - first page - const { data: searchedDocuments, isLoading: isSearchedDocumentsLoading } = useQuery({ - queryKey: cacheKeys.documents.withQueryParams(searchQueryParams), - queryFn: () => documentsApiService.searchDocuments({ queryParams: searchQueryParams }), - staleTime: 3 * 60 * 1000, - enabled: !!searchSpaceId && !!debouncedSearch.trim() && currentPage === 0, + // Use the new lightweight endpoint for document title search + const { data: titleSearchResults, isLoading: isTitleSearchLoading } = useQuery({ + queryKey: ["document-titles", titleSearchParams], + queryFn: () => documentsApiService.searchDocumentTitles({ queryParams: titleSearchParams }), + staleTime: 60 * 
1000, // 1 minute - shorter for fresher results + enabled: !!searchSpaceId && currentPage === 0 && (!shouldSearch || isSearchValid), }); // Use query for fetching first page of SurfSense docs const { data: surfsenseDocs, isLoading: isSurfsenseDocsLoading } = useQuery({ - queryKey: ["surfsense-docs-mention", debouncedSearch], + queryKey: ["surfsense-docs-mention", throttledSearch, isSearchValid], queryFn: () => documentsApiService.getSurfsenseDocs({ queryParams: surfsenseDocsQueryParams }), staleTime: 3 * 60 * 1000, + enabled: !shouldSearch || isSearchValid, }); // Update accumulated documents when first page loads - combine both sources @@ -142,24 +191,17 @@ export const DocumentMentionPicker = forwardRef< } } - // Add regular documents - if (debouncedSearch.trim()) { - if (searchedDocuments?.items) { - combinedDocs.push(...searchedDocuments.items); - setHasMore(searchedDocuments.has_more); - } - } else { - if (documents?.items) { - combinedDocs.push(...documents.items); - setHasMore(documents.has_more); - } + // Add regular documents from lightweight endpoint + if (titleSearchResults?.items) { + combinedDocs.push(...titleSearchResults.items); + setHasMore(titleSearchResults.has_more); } setAccumulatedDocuments(combinedDocs); } - }, [documents, searchedDocuments, surfsenseDocs, debouncedSearch, currentPage]); + }, [titleSearchResults, surfsenseDocs, currentPage]); - // Function to load next page + // Function to load next page using lightweight endpoint const loadNextPage = useCallback(async () => { if (isLoadingMore || !hasMore) return; @@ -167,23 +209,14 @@ export const DocumentMentionPicker = forwardRef< setIsLoadingMore(true); try { - let response: GetDocumentsResponse; - if (debouncedSearch.trim()) { - const queryParams = { - search_space_id: searchSpaceId, - page: nextPage, - page_size: PAGE_SIZE, - title: debouncedSearch, - }; - response = await documentsApiService.searchDocuments({ queryParams }); - } else { - const queryParams = { - search_space_id: 
searchSpaceId, - page: nextPage, - page_size: PAGE_SIZE, - }; - response = await documentsApiService.getDocuments({ queryParams }); - } + const queryParams = { + search_space_id: searchSpaceId, + page: nextPage, + page_size: PAGE_SIZE, + ...(isSearchValid ? { title: throttledSearch.trim() } : {}), + }; + const response: SearchDocumentTitlesResponse = + await documentsApiService.searchDocumentTitles({ queryParams }); setAccumulatedDocuments((prev) => [...prev, ...response.items]); setHasMore(response.has_more); @@ -193,7 +226,7 @@ export const DocumentMentionPicker = forwardRef< } finally { setIsLoadingMore(false); } - }, [currentPage, hasMore, isLoadingMore, debouncedSearch, searchSpaceId]); + }, [currentPage, hasMore, isLoadingMore, throttledSearch, searchSpaceId, isSearchValid]); // Infinite scroll handler const handleScroll = useCallback( @@ -210,10 +243,10 @@ export const DocumentMentionPicker = forwardRef< ); const actualDocuments = accumulatedDocuments; - const actualLoading = - ((debouncedSearch.trim() ? isSearchedDocumentsLoading : isDocumentsLoading) || - isSurfsenseDocsLoading) && - currentPage === 0; + const actualLoading = (isTitleSearchLoading || isSurfsenseDocsLoading) && currentPage === 0; + + // Show hint when search is too short + const showSearchHint = shouldSearch && !isSearchValid; // Split documents into SurfSense docs and user docs for grouped rendering const surfsenseDocsList = useMemo( @@ -323,7 +356,14 @@ export const DocumentMentionPicker = forwardRef< className="max-h-[180px] sm:max-h-[280px] overflow-y-auto" onScroll={handleScroll} > - {actualLoading ? ( + {showSearchHint ? ( +
+

+ Type {MIN_SEARCH_LENGTH - throttledSearch.trim().length} more character + {MIN_SEARCH_LENGTH - throttledSearch.trim().length > 1 ? "s" : ""} to search +

+
+ ) : actualLoading ? (
diff --git a/surfsense_web/contracts/types/document.types.ts b/surfsense_web/contracts/types/document.types.ts index 2b144bd68..ba81562b1 100644 --- a/surfsense_web/contracts/types/document.types.ts +++ b/surfsense_web/contracts/types/document.types.ts @@ -155,6 +155,29 @@ export const searchDocumentsResponse = z.object({ has_more: z.boolean(), }); +/** + * Search document titles (lightweight, for mention picker) + */ +export const documentTitleRead = z.object({ + id: z.number(), + title: z.string(), + document_type: documentTypeEnum, +}); + +export const searchDocumentTitlesRequest = z.object({ + queryParams: z.object({ + search_space_id: z.number(), + title: z.string().optional(), + page: z.number().optional(), + page_size: z.number().optional(), + }), +}); + +export const searchDocumentTitlesResponse = z.object({ + items: z.array(documentTitleRead), + has_more: z.boolean(), +}); + /** * Get document type counts */ @@ -223,6 +246,7 @@ export const deleteDocumentResponse = z.object({ }); export type Document = z.infer; +export type DocumentTitleRead = z.infer; export type GetDocumentsRequest = z.infer; export type GetDocumentsResponse = z.infer; export type GetDocumentRequest = z.infer; @@ -233,6 +257,8 @@ export type UploadDocumentRequest = z.infer; export type UploadDocumentResponse = z.infer; export type SearchDocumentsRequest = z.infer; export type SearchDocumentsResponse = z.infer; +export type SearchDocumentTitlesRequest = z.infer; +export type SearchDocumentTitlesResponse = z.infer; export type GetDocumentTypeCountsRequest = z.infer; export type GetDocumentTypeCountsResponse = z.infer; export type GetDocumentByChunkRequest = z.infer; diff --git a/surfsense_web/lib/apis/documents-api.service.ts b/surfsense_web/lib/apis/documents-api.service.ts index bea399f98..daa67f6d5 100644 --- a/surfsense_web/lib/apis/documents-api.service.ts +++ b/surfsense_web/lib/apis/documents-api.service.ts @@ -22,8 +22,11 @@ import { getSurfsenseDocsRequest, 
getSurfsenseDocsResponse, type SearchDocumentsRequest, + type SearchDocumentTitlesRequest, searchDocumentsRequest, searchDocumentsResponse, + searchDocumentTitlesRequest, + searchDocumentTitlesResponse, type UpdateDocumentRequest, type UploadDocumentRequest, updateDocumentRequest, @@ -160,6 +163,35 @@ class DocumentsApiService { return baseApiService.get(`/api/v1/documents/search?${queryParams}`, searchDocumentsResponse); }; + /** + * Search document titles (lightweight, optimized for mention picker) + * Returns only id, title, document_type - no content or metadata + */ + searchDocumentTitles = async (request: SearchDocumentTitlesRequest) => { + const parsedRequest = searchDocumentTitlesRequest.safeParse(request); + + if (!parsedRequest.success) { + console.error("Invalid request:", parsedRequest.error); + + const errorMessage = parsedRequest.error.issues.map((issue) => issue.message).join(", "); + throw new ValidationError(`Invalid request: ${errorMessage}`); + } + + // Transform query params to be string values + const transformedQueryParams = Object.fromEntries( + Object.entries(parsedRequest.data.queryParams) + .filter(([, v]) => v !== undefined) + .map(([k, v]) => [k, String(v)]) + ); + + const queryParams = new URLSearchParams(transformedQueryParams).toString(); + + return baseApiService.get( + `/api/v1/documents/search/titles?${queryParams}`, + searchDocumentTitlesResponse + ); + }; + /** * Get document type counts */