feat: add pg_trgm indexes and lightweight document title search

- Introduced pg_trgm extension and GIN trigram indexes for efficient document title searches, enhancing performance for mention picker functionality.
- Implemented a new API endpoint for lightweight document title searches, returning only essential fields.
- Updated frontend components to utilize the new title search feature with throttling for improved user experience.
- Added necessary schemas and types for the new search functionality.
This commit is contained in:
Anish Sarkar 2026-01-17 20:45:10 +05:30
parent cf53338119
commit b001b65067
8 changed files with 393 additions and 77 deletions

View file

@ -0,0 +1,77 @@
"""Add pg_trgm indexes for efficient document title search
Revision ID: 67
Revises: 66
Adds the pg_trgm extension and GIN trigram indexes on documents.title
to enable efficient ILIKE searches with leading wildcards (e.g., '%search_term%').
Indexes added:
1. idx_documents_title_trgm - GIN trigram on title for ILIKE '%term%'
2. idx_documents_search_space_id - B-tree on search_space_id for filtering
3. idx_documents_search_space_updated - Composite for recent docs query (covering index)
4. idx_surfsense_docs_title_trgm - GIN trigram on surfsense docs title
This is critical for the document mention picker (@mentions) to scale to 10,000+ documents.
"""
from collections.abc import Sequence
from alembic import op
# revision identifiers, used by Alembic.
# This migration is revision "67" and is applied on top of revision "66".
revision: str = "67"
down_revision: str | None = "66"
# No branch labels and no extra dependencies are declared for this revision.
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
    """Install pg_trgm and create the indexes that back document title search.

    The statements are executed one by one, in the order listed. Each of them
    is guarded with IF NOT EXISTS.
    """
    statements = (
        # pg_trgm provides the trigram operators (including gin_trgm_ops) that
        # the GIN indexes below rely on, so it is installed first.
        "CREATE EXTENSION IF NOT EXISTS pg_trgm;",
        # 1. Trigram GIN index on documents.title, for ILIKE '%term%' lookups.
        (
            "\n"
            "CREATE INDEX IF NOT EXISTS idx_documents_title_trgm\n"
            "ON documents USING gin (title gin_trgm_ops);\n"
        ),
        # 2. Plain B-tree index on search_space_id; every title query filters
        #    on that column before anything else.
        (
            "\n"
            "CREATE INDEX IF NOT EXISTS idx_documents_search_space_id\n"
            "ON documents (search_space_id);\n"
        ),
        # 3. Index for the "recent documents" listing (no search term). The
        #    INCLUDE columns are carried so PostgreSQL can answer the query
        #    with an index-only scan.
        (
            "\n"
            "CREATE INDEX IF NOT EXISTS idx_documents_search_space_updated\n"
            "ON documents (search_space_id, updated_at DESC NULLS LAST)\n"
            "INCLUDE (id, title, document_type);\n"
        ),
        # 4. Trigram GIN index on surfsense_docs_documents.title.
        (
            "\n"
            "CREATE INDEX IF NOT EXISTS idx_surfsense_docs_title_trgm\n"
            "ON surfsense_docs_documents USING gin (title gin_trgm_ops);\n"
        ),
    )

    for statement in statements:
        op.execute(statement)
def downgrade() -> None:
    """Drop the search indexes created by upgrade(); pg_trgm stays installed.

    Indexes are dropped in the reverse of their creation order.
    """
    index_names = (
        "idx_surfsense_docs_title_trgm",
        "idx_documents_search_space_updated",
        "idx_documents_search_space_id",
        "idx_documents_title_trgm",
    )
    for index_name in index_names:
        op.execute(f"DROP INDEX IF EXISTS {index_name};")

View file

@ -1007,11 +1007,36 @@ async def setup_indexes():
"CREATE INDEX IF NOT EXISTS chucks_search_index ON chunks USING gin (to_tsvector('english', content))"
)
)
# pg_trgm indexes for efficient ILIKE '%term%' searches on titles
# Critical for document mention picker (@mentions) to scale
await conn.execute(
text(
"CREATE INDEX IF NOT EXISTS idx_documents_title_trgm ON documents USING gin (title gin_trgm_ops)"
)
)
# B-tree index on search_space_id for fast filtering
await conn.execute(
text(
"CREATE INDEX IF NOT EXISTS idx_documents_search_space_id ON documents (search_space_id)"
)
)
# Covering index for "recent documents" query - enables index-only scan
await conn.execute(
text(
"CREATE INDEX IF NOT EXISTS idx_documents_search_space_updated ON documents (search_space_id, updated_at DESC NULLS LAST) INCLUDE (id, title, document_type)"
)
)
await conn.execute(
text(
"CREATE INDEX IF NOT EXISTS idx_surfsense_docs_title_trgm ON surfsense_docs_documents USING gin (title gin_trgm_ops)"
)
)
async def create_db_and_tables():
    """Create the required PostgreSQL extensions and all ORM tables, then build the indexes."""
    async with engine.begin() as conn:
        # Extensions are created before the tables are.
        await conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))
        # pg_trgm backs the gin_trgm_ops title indexes created in setup_indexes().
        await conn.execute(text("CREATE EXTENSION IF NOT EXISTS pg_trgm"))
        await conn.run_sync(Base.metadata.create_all)
    # NOTE(review): indentation is not visible in this view. setup_indexes() is
    # assumed to run after the block above has exited - confirm against the file.
    await setup_indexes()

View file

@ -19,6 +19,8 @@ from app.db import (
from app.schemas import (
DocumentRead,
DocumentsCreate,
DocumentTitleRead,
DocumentTitleSearchResponse,
DocumentUpdate,
DocumentWithChunksRead,
PaginatedResponse,
@ -429,6 +431,99 @@ async def search_documents(
) from e
@router.get("/documents/search/titles", response_model=DocumentTitleSearchResponse)
async def search_document_titles(
    search_space_id: int,
    title: str = "",
    page: int = 0,
    page_size: int = 20,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """
    Lightweight document title search optimized for mention picker (@mentions).

    Returns only id, title, and document_type - no content or metadata.
    With a search term, results are ordered by relevance: prefix matches first,
    then alphabetically by title. The term is matched literally; LIKE wildcard
    characters typed by the user are escaped rather than interpreted.

    Args:
        search_space_id: The search space to search in. Required.
        title: Search query (case-insensitive). If empty or < 2 chars, returns recent documents.
        page: Zero-based page index. Default: 0. Must not be negative.
        page_size: Number of items per page. Default: 20. Must be at least 1.
        session: Database session (injected).
        user: Current authenticated user (injected).

    Returns:
        DocumentTitleSearchResponse: Lightweight list with has_more flag (no total count).

    Raises:
        HTTPException: 400 when page or page_size is out of range; any
            HTTPException raised while handling the request is propagated
            unchanged; every other failure is reported as a 500.
    """
    from sqlalchemy import case, literal

    # Reject values that would otherwise reach the database as a negative
    # OFFSET / non-positive LIMIT and come back as an opaque 500.
    if page < 0 or page_size < 1:
        raise HTTPException(
            status_code=400,
            detail="page must be >= 0 and page_size must be >= 1",
        )

    try:
        # Check permission for the search space
        await check_permission(
            session,
            user,
            search_space_id,
            Permission.DOCUMENTS_READ.value,
            "You don't have permission to read documents in this search space",
        )

        # Base query - only select lightweight fields
        query = select(
            Document.id,
            Document.title,
            Document.document_type,
        ).filter(Document.search_space_id == search_space_id)

        search_term = title.strip()
        if len(search_term) < 2:
            # Query too short to search: return recent documents instead
            query = query.order_by(Document.updated_at.desc().nullslast())
        else:
            # Escape the escape character first, then the LIKE wildcards, so
            # that input such as "100%" or "a_b" is matched literally.
            escaped_term = (
                search_term.replace("\\", "\\\\")
                .replace("%", "\\%")
                .replace("_", "\\_")
            )

            # Apply title filter with ILIKE (uses pg_trgm index)
            query = query.filter(
                Document.title.ilike(f"%{escaped_term}%", escape="\\")
            )

            # CASE WHEN title ILIKE 'term%' THEN 0 ELSE 1 END
            prefix_priority = case(
                (Document.title.ilike(f"{escaped_term}%", escape="\\"), literal(0)),
                else_=literal(1),
            )
            # Document.id is a final tiebreaker so rows with equal titles keep
            # a stable position across OFFSET-based pages.
            query = query.order_by(prefix_priority, Document.title, Document.id)

        # Fetch page_size + 1 to determine has_more without COUNT query
        offset = page * page_size
        result = await session.execute(query.offset(offset).limit(page_size + 1))
        rows = result.all()

        # The extra row only signals that another page exists; it is not returned
        has_more = len(rows) > page_size
        items = rows[:page_size]

        # Convert to response format
        api_documents = [
            DocumentTitleRead(
                id=row.id,
                title=row.title,
                document_type=row.document_type,
            )
            for row in items
        ]

        return DocumentTitleSearchResponse(
            items=api_documents,
            has_more=has_more,
        )
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Failed to search document titles: {e!s}"
        ) from e
@router.get("/documents/type-counts")
async def get_document_type_counts(
search_space_id: int | None = None,

View file

@ -4,6 +4,8 @@ from .documents import (
DocumentBase,
DocumentRead,
DocumentsCreate,
DocumentTitleRead,
DocumentTitleSearchResponse,
DocumentUpdate,
DocumentWithChunksRead,
ExtensionDocumentContent,
@ -85,6 +87,8 @@ __all__ = [
# Document schemas
"DocumentBase",
"DocumentRead",
"DocumentTitleRead",
"DocumentTitleSearchResponse",
"DocumentUpdate",
"DocumentWithChunksRead",
"DocumentsCreate",

View file

@ -67,3 +67,20 @@ class PaginatedResponse[T](BaseModel):
page: int
page_size: int
has_more: bool
class DocumentTitleRead(BaseModel):
    """Lightweight document response for mention picker - only essential fields."""

    # The only three fields exposed for a document in this response shape.
    id: int
    title: str
    document_type: DocumentType

    # from_attributes=True allows building this model from objects that expose
    # these fields as attributes, not only from dicts.
    model_config = ConfigDict(from_attributes=True)
class DocumentTitleSearchResponse(BaseModel):
    """Response for document title search - optimized for typeahead."""

    # One page of matches.
    items: list[DocumentTitleRead]
    # True when at least one more result exists beyond this page; no total
    # count is part of this response.
    has_more: bool

View file

@ -1,6 +1,6 @@
"use client";
import { useQuery } from "@tanstack/react-query";
import { useQuery, useQueryClient } from "@tanstack/react-query";
import { FileText } from "lucide-react";
import {
forwardRef,
@ -12,9 +12,8 @@ import {
useState,
} from "react";
import { getConnectorIcon } from "@/contracts/enums/connectorIcons";
import type { Document, GetDocumentsResponse } from "@/contracts/types/document.types";
import type { Document, SearchDocumentTitlesResponse } from "@/contracts/types/document.types";
import { documentsApiService } from "@/lib/apis/documents-api.service";
import { cacheKeys } from "@/lib/query-client/cache-keys";
import { cn } from "@/lib/utils";
export interface DocumentMentionPickerRef {
@ -32,14 +31,45 @@ interface DocumentMentionPickerProps {
}
const PAGE_SIZE = 20;
const MIN_SEARCH_LENGTH = 2;
const THROTTLE_MS = 200;
/**
* Throttle hook - fires immediately, then at most once per interval
* Better than debounce for typeahead: user sees results updating as they type
*/
function useThrottled<T>(value: T, delay = THROTTLE_MS) {
const [throttled, setThrottled] = useState(value);
const lastExecuted = useRef(Date.now());
const timeoutRef = useRef<ReturnType<typeof setTimeout>>();
function useDebounced<T>(value: T, delay = 300) {
const [debounced, setDebounced] = useState(value);
useEffect(() => {
const t = setTimeout(() => setDebounced(value), delay);
return () => clearTimeout(t);
const now = Date.now();
const elapsed = now - lastExecuted.current;
if (elapsed >= delay) {
// Enough time has passed, update immediately
lastExecuted.current = now;
setThrottled(value);
} else {
// Schedule update for remaining time
if (timeoutRef.current) {
clearTimeout(timeoutRef.current);
}
timeoutRef.current = setTimeout(() => {
lastExecuted.current = Date.now();
setThrottled(value);
}, delay - elapsed);
}
return () => {
if (timeoutRef.current) {
clearTimeout(timeoutRef.current);
}
};
}, [value, delay]);
return debounced;
return throttled;
}
export const DocumentMentionPicker = forwardRef<
@ -49,9 +79,11 @@ export const DocumentMentionPicker = forwardRef<
{ searchSpaceId, onSelectionChange, onDone, initialSelectedDocuments = [], externalSearch = "" },
ref
) {
// Use external search
const queryClient = useQueryClient();
// Use external search with throttle (not debounce) for responsive feel
const search = externalSearch;
const debouncedSearch = useDebounced(search, 150);
const throttledSearch = useThrottled(search, THROTTLE_MS);
const [highlightedIndex, setHighlightedIndex] = useState(0);
const itemRefs = useRef<Map<number, HTMLButtonElement>>(new Map());
const scrollContainerRef = useRef<HTMLDivElement>(null);
@ -64,6 +96,38 @@ export const DocumentMentionPicker = forwardRef<
const [hasMore, setHasMore] = useState(false);
const [isLoadingMore, setIsLoadingMore] = useState(false);
// Check if search is long enough
const isSearchValid = throttledSearch.trim().length >= MIN_SEARCH_LENGTH;
const shouldSearch = throttledSearch.trim().length > 0;
// Prefetch first page when picker mounts - results appear instantly
useEffect(() => {
if (!searchSpaceId) return;
const prefetchParams = {
search_space_id: searchSpaceId,
page: 0,
page_size: PAGE_SIZE,
};
// Prefetch document titles (user docs)
queryClient.prefetchQuery({
queryKey: ["document-titles", prefetchParams],
queryFn: () => documentsApiService.searchDocumentTitles({ queryParams: prefetchParams }),
staleTime: 60 * 1000,
});
// Prefetch SurfSense docs
queryClient.prefetchQuery({
queryKey: ["surfsense-docs-mention", "", false],
queryFn: () =>
documentsApiService.getSurfsenseDocs({
queryParams: { page: 0, page_size: PAGE_SIZE },
}),
staleTime: 3 * 60 * 1000,
});
}, [searchSpaceId, queryClient]);
// Reset pagination when search or search space changes
// biome-ignore lint/correctness/useExhaustiveDependencies: intentionally reset pagination when search/space changes
useEffect(() => {
@ -71,59 +135,44 @@ export const DocumentMentionPicker = forwardRef<
setCurrentPage(0);
setHasMore(false);
setHighlightedIndex(0);
}, [debouncedSearch, searchSpaceId]);
}, [throttledSearch, searchSpaceId]);
// Query params for initial fetch (page 0)
const fetchQueryParams = useMemo(
// Query params for lightweight title search
const titleSearchParams = useMemo(
() => ({
search_space_id: searchSpaceId,
page: 0,
page_size: PAGE_SIZE,
...(isSearchValid ? { title: throttledSearch.trim() } : {}),
}),
[searchSpaceId]
[searchSpaceId, throttledSearch, isSearchValid]
);
const searchQueryParams = useMemo(() => {
return {
search_space_id: searchSpaceId,
page: 0,
page_size: PAGE_SIZE,
title: debouncedSearch,
};
}, [debouncedSearch, searchSpaceId]);
const surfsenseDocsQueryParams = useMemo(() => {
const params: { page: number; page_size: number; title?: string } = {
page: 0,
page_size: PAGE_SIZE,
};
if (debouncedSearch.trim()) {
params.title = debouncedSearch;
if (isSearchValid) {
params.title = throttledSearch.trim();
}
return params;
}, [debouncedSearch]);
}, [throttledSearch, isSearchValid]);
// Use query for fetching first page of documents
const { data: documents, isLoading: isDocumentsLoading } = useQuery({
queryKey: cacheKeys.documents.withQueryParams(fetchQueryParams),
queryFn: () => documentsApiService.getDocuments({ queryParams: fetchQueryParams }),
staleTime: 3 * 60 * 1000,
enabled: !!searchSpaceId && !debouncedSearch.trim() && currentPage === 0,
});
// Searching - first page
const { data: searchedDocuments, isLoading: isSearchedDocumentsLoading } = useQuery({
queryKey: cacheKeys.documents.withQueryParams(searchQueryParams),
queryFn: () => documentsApiService.searchDocuments({ queryParams: searchQueryParams }),
staleTime: 3 * 60 * 1000,
enabled: !!searchSpaceId && !!debouncedSearch.trim() && currentPage === 0,
// Use the new lightweight endpoint for document title search
const { data: titleSearchResults, isLoading: isTitleSearchLoading } = useQuery({
queryKey: ["document-titles", titleSearchParams],
queryFn: () => documentsApiService.searchDocumentTitles({ queryParams: titleSearchParams }),
staleTime: 60 * 1000, // 1 minute - shorter for fresher results
enabled: !!searchSpaceId && currentPage === 0 && (!shouldSearch || isSearchValid),
});
// Use query for fetching first page of SurfSense docs
const { data: surfsenseDocs, isLoading: isSurfsenseDocsLoading } = useQuery({
queryKey: ["surfsense-docs-mention", debouncedSearch],
queryKey: ["surfsense-docs-mention", throttledSearch, isSearchValid],
queryFn: () => documentsApiService.getSurfsenseDocs({ queryParams: surfsenseDocsQueryParams }),
staleTime: 3 * 60 * 1000,
enabled: !shouldSearch || isSearchValid,
});
// Update accumulated documents when first page loads - combine both sources
@ -142,24 +191,17 @@ export const DocumentMentionPicker = forwardRef<
}
}
// Add regular documents
if (debouncedSearch.trim()) {
if (searchedDocuments?.items) {
combinedDocs.push(...searchedDocuments.items);
setHasMore(searchedDocuments.has_more);
}
} else {
if (documents?.items) {
combinedDocs.push(...documents.items);
setHasMore(documents.has_more);
}
// Add regular documents from lightweight endpoint
if (titleSearchResults?.items) {
combinedDocs.push(...titleSearchResults.items);
setHasMore(titleSearchResults.has_more);
}
setAccumulatedDocuments(combinedDocs);
}
}, [documents, searchedDocuments, surfsenseDocs, debouncedSearch, currentPage]);
}, [titleSearchResults, surfsenseDocs, currentPage]);
// Function to load next page
// Function to load next page using lightweight endpoint
const loadNextPage = useCallback(async () => {
if (isLoadingMore || !hasMore) return;
@ -167,23 +209,14 @@ export const DocumentMentionPicker = forwardRef<
setIsLoadingMore(true);
try {
let response: GetDocumentsResponse;
if (debouncedSearch.trim()) {
const queryParams = {
search_space_id: searchSpaceId,
page: nextPage,
page_size: PAGE_SIZE,
title: debouncedSearch,
};
response = await documentsApiService.searchDocuments({ queryParams });
} else {
const queryParams = {
search_space_id: searchSpaceId,
page: nextPage,
page_size: PAGE_SIZE,
};
response = await documentsApiService.getDocuments({ queryParams });
}
const queryParams = {
search_space_id: searchSpaceId,
page: nextPage,
page_size: PAGE_SIZE,
...(isSearchValid ? { title: throttledSearch.trim() } : {}),
};
const response: SearchDocumentTitlesResponse =
await documentsApiService.searchDocumentTitles({ queryParams });
setAccumulatedDocuments((prev) => [...prev, ...response.items]);
setHasMore(response.has_more);
@ -193,7 +226,7 @@ export const DocumentMentionPicker = forwardRef<
} finally {
setIsLoadingMore(false);
}
}, [currentPage, hasMore, isLoadingMore, debouncedSearch, searchSpaceId]);
}, [currentPage, hasMore, isLoadingMore, throttledSearch, searchSpaceId, isSearchValid]);
// Infinite scroll handler
const handleScroll = useCallback(
@ -210,10 +243,10 @@ export const DocumentMentionPicker = forwardRef<
);
const actualDocuments = accumulatedDocuments;
const actualLoading =
((debouncedSearch.trim() ? isSearchedDocumentsLoading : isDocumentsLoading) ||
isSurfsenseDocsLoading) &&
currentPage === 0;
const actualLoading = (isTitleSearchLoading || isSurfsenseDocsLoading) && currentPage === 0;
// Show hint when search is too short
const showSearchHint = shouldSearch && !isSearchValid;
// Split documents into SurfSense docs and user docs for grouped rendering
const surfsenseDocsList = useMemo(
@ -323,7 +356,14 @@ export const DocumentMentionPicker = forwardRef<
className="max-h-[180px] sm:max-h-[280px] overflow-y-auto"
onScroll={handleScroll}
>
{actualLoading ? (
{showSearchHint ? (
<div className="flex flex-col items-center justify-center py-4 text-center px-4">
<p className="text-sm text-muted-foreground">
Type {MIN_SEARCH_LENGTH - throttledSearch.trim().length} more character
{MIN_SEARCH_LENGTH - throttledSearch.trim().length > 1 ? "s" : ""} to search
</p>
</div>
) : actualLoading ? (
<div className="flex items-center justify-center py-4">
<div className="animate-spin h-5 w-5 border-2 border-primary border-t-transparent rounded-full" />
</div>

View file

@ -155,6 +155,29 @@ export const searchDocumentsResponse = z.object({
has_more: z.boolean(),
});
/**
 * Search document titles (lightweight, for mention picker)
 *
 * Shape of a single result item: a numeric id, a title string and a
 * document_type validated against documentTypeEnum.
 */
export const documentTitleRead = z.object({
	id: z.number(),
	title: z.string(),
	document_type: documentTypeEnum,
});
// Request shape for the title search. Only search_space_id is required;
// title, page and page_size may each be omitted.
export const searchDocumentTitlesRequest = z.object({
	queryParams: z.object({
		search_space_id: z.number(),
		title: z.string().optional(),
		page: z.number().optional(),
		page_size: z.number().optional(),
	}),
});
// Response shape for the title search: one page of documentTitleRead items
// plus a has_more flag. No total count is part of this shape.
export const searchDocumentTitlesResponse = z.object({
	items: z.array(documentTitleRead),
	has_more: z.boolean(),
});
/**
* Get document type counts
*/
@ -223,6 +246,7 @@ export const deleteDocumentResponse = z.object({
});
export type Document = z.infer<typeof document>;
export type DocumentTitleRead = z.infer<typeof documentTitleRead>;
export type GetDocumentsRequest = z.infer<typeof getDocumentsRequest>;
export type GetDocumentsResponse = z.infer<typeof getDocumentsResponse>;
export type GetDocumentRequest = z.infer<typeof getDocumentRequest>;
@ -233,6 +257,8 @@ export type UploadDocumentRequest = z.infer<typeof uploadDocumentRequest>;
export type UploadDocumentResponse = z.infer<typeof uploadDocumentResponse>;
export type SearchDocumentsRequest = z.infer<typeof searchDocumentsRequest>;
export type SearchDocumentsResponse = z.infer<typeof searchDocumentsResponse>;
export type SearchDocumentTitlesRequest = z.infer<typeof searchDocumentTitlesRequest>;
export type SearchDocumentTitlesResponse = z.infer<typeof searchDocumentTitlesResponse>;
export type GetDocumentTypeCountsRequest = z.infer<typeof getDocumentTypeCountsRequest>;
export type GetDocumentTypeCountsResponse = z.infer<typeof getDocumentTypeCountsResponse>;
export type GetDocumentByChunkRequest = z.infer<typeof getDocumentByChunkRequest>;

View file

@ -22,8 +22,11 @@ import {
getSurfsenseDocsRequest,
getSurfsenseDocsResponse,
type SearchDocumentsRequest,
type SearchDocumentTitlesRequest,
searchDocumentsRequest,
searchDocumentsResponse,
searchDocumentTitlesRequest,
searchDocumentTitlesResponse,
type UpdateDocumentRequest,
type UploadDocumentRequest,
updateDocumentRequest,
@ -160,6 +163,35 @@ class DocumentsApiService {
return baseApiService.get(`/api/v1/documents/search?${queryParams}`, searchDocumentsResponse);
};
/**
 * Query the lightweight title-search endpoint used by the mention picker.
 * Each returned item carries only id, title and document_type.
 *
 * Throws ValidationError when the request does not satisfy
 * searchDocumentTitlesRequest.
 */
searchDocumentTitles = async (request: SearchDocumentTitlesRequest) => {
	const validation = searchDocumentTitlesRequest.safeParse(request);

	if (!validation.success) {
		console.error("Invalid request:", validation.error);

		const errorMessage = validation.error.issues.map((issue) => issue.message).join(", ");
		throw new ValidationError(`Invalid request: ${errorMessage}`);
	}

	// Serialize only the params that were actually supplied; every value is
	// converted to its string form.
	const searchParams = new URLSearchParams();
	for (const [key, value] of Object.entries(validation.data.queryParams)) {
		if (value !== undefined) {
			searchParams.append(key, String(value));
		}
	}

	return baseApiService.get(
		`/api/v1/documents/search/titles?${searchParams.toString()}`,
		searchDocumentTitlesResponse
	);
};
/**
* Get document type counts
*/