Merge pull request #707 from AnishSarkar22/fix/mentions

feat: Revamped search in document mentions
This commit is contained in:
Rohan Verma 2026-01-18 22:15:28 -08:00 committed by GitHub
commit 26ef83fbaa
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 560 additions and 168 deletions

View file

@ -0,0 +1,76 @@
"""Add pg_trgm indexes for efficient document title search
Revision ID: 67
Revises: 66
Adds the pg_trgm extension and GIN trigram indexes on documents.title
to enable efficient ILIKE searches with leading wildcards (e.g., '%search_term%').
Indexes added:
1. idx_documents_title_trgm - GIN trigram on title for ILIKE '%term%'
2. idx_documents_search_space_id - B-tree on search_space_id for filtering
3. idx_documents_search_space_updated - Composite for recent docs query (covering index)
4. idx_surfsense_docs_title_trgm - GIN trigram on surfsense docs title
"""
from collections.abc import Sequence
from alembic import op
# revision identifiers, used by Alembic.
# `revision` names this migration; `down_revision` names the migration it is
# applied on top of (None would mark the first migration in the chain).
revision: str = "67"
down_revision: str | None = "66"
# No branch labels or cross-migration dependencies are declared for this revision.
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
    """Install pg_trgm and create the indexes used for document title search.

    Every statement uses IF NOT EXISTS, so the migration can be applied to a
    database where the extension or any of the indexes is already present.
    Statements are executed one at a time, in the order listed.
    """
    ddl_statements = (
        # pg_trgm provides the trigram operator class (gin_trgm_ops) that the
        # GIN indexes below rely on, so it has to be created first.
        "CREATE EXTENSION IF NOT EXISTS pg_trgm;",
        # 1. Trigram GIN index on documents.title, for ILIKE '%term%' searches.
        (
            "\n"
            "CREATE INDEX IF NOT EXISTS idx_documents_title_trgm\n"
            "ON documents USING gin (title gin_trgm_ops);\n"
        ),
        # 2. Plain B-tree index on documents.search_space_id, for filtering
        #    documents by search space.
        (
            "\n"
            "CREATE INDEX IF NOT EXISTS idx_documents_search_space_id\n"
            "ON documents (search_space_id);\n"
        ),
        # 3. Index for the "recent documents" listing (no search term): keyed on
        #    (search_space_id, updated_at DESC NULLS LAST) and carrying id, title
        #    and document_type via INCLUDE so those columns are stored in the
        #    index itself.
        (
            "\n"
            "CREATE INDEX IF NOT EXISTS idx_documents_search_space_updated\n"
            "ON documents (search_space_id, updated_at DESC NULLS LAST)\n"
            "INCLUDE (id, title, document_type);\n"
        ),
        # 4. Trigram GIN index on surfsense_docs_documents.title.
        (
            "\n"
            "CREATE INDEX IF NOT EXISTS idx_surfsense_docs_title_trgm\n"
            "ON surfsense_docs_documents USING gin (title gin_trgm_ops);\n"
        ),
    )

    for statement in ddl_statements:
        op.execute(statement)
def downgrade() -> None:
    """Drop the four search indexes; the pg_trgm extension is left installed.

    Each DROP uses IF EXISTS, so a missing index is not an error.
    """
    drop_statements = (
        "DROP INDEX IF EXISTS idx_surfsense_docs_title_trgm;",
        "DROP INDEX IF EXISTS idx_documents_search_space_updated;",
        "DROP INDEX IF EXISTS idx_documents_search_space_id;",
        "DROP INDEX IF EXISTS idx_documents_title_trgm;",
    )

    for statement in drop_statements:
        op.execute(statement)

View file

@ -1007,11 +1007,36 @@ async def setup_indexes():
"CREATE INDEX IF NOT EXISTS chucks_search_index ON chunks USING gin (to_tsvector('english', content))"
)
)
# pg_trgm indexes for efficient ILIKE '%term%' searches on titles
# Critical for document mention picker (@mentions) to scale
await conn.execute(
text(
"CREATE INDEX IF NOT EXISTS idx_documents_title_trgm ON documents USING gin (title gin_trgm_ops)"
)
)
# B-tree index on search_space_id for fast filtering
await conn.execute(
text(
"CREATE INDEX IF NOT EXISTS idx_documents_search_space_id ON documents (search_space_id)"
)
)
# Covering index for "recent documents" query - enables index-only scan
await conn.execute(
text(
"CREATE INDEX IF NOT EXISTS idx_documents_search_space_updated ON documents (search_space_id, updated_at DESC NULLS LAST) INCLUDE (id, title, document_type)"
)
)
await conn.execute(
text(
"CREATE INDEX IF NOT EXISTS idx_surfsense_docs_title_trgm ON surfsense_docs_documents USING gin (title gin_trgm_ops)"
)
)
async def create_db_and_tables():
    """Create required PostgreSQL extensions, all ORM tables, then the search indexes."""
    async with engine.begin() as conn:
        # Extensions are created before the tables/indexes that may depend on them.
        await conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))
        await conn.execute(text("CREATE EXTENSION IF NOT EXISTS pg_trgm"))
        # create_all is a synchronous SQLAlchemy API, hence run_sync on the async connection.
        await conn.run_sync(Base.metadata.create_all)
    # NOTE(review): setup_indexes() takes no connection argument, so it is
    # presumably meant to run after the transaction above has finished and the
    # tables exist - confirm this call sits outside the `async with` block.
    await setup_indexes()

View file

@ -19,6 +19,8 @@ from app.db import (
from app.schemas import (
DocumentRead,
DocumentsCreate,
DocumentTitleRead,
DocumentTitleSearchResponse,
DocumentUpdate,
DocumentWithChunksRead,
PaginatedResponse,
@ -429,6 +431,112 @@ async def search_documents(
) from e
@router.get("/documents/search/titles", response_model=DocumentTitleSearchResponse)
async def search_document_titles(
    search_space_id: int,
    title: str = "",
    page: int = 0,
    page_size: int = 20,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """
    Lightweight document title search optimized for mention picker (@mentions).

    Returns only id, title, and document_type - no content or metadata.
    When a search term is given, rows match if their title either has a
    pg_trgm similarity to the term above 0.3 or contains the term as a
    case-insensitive substring; results are ordered by similarity (best
    first), then by title.

    Args:
        search_space_id: The search space to search in. Required.
        title: Search query (case-insensitive). If empty or < 2 chars after
            trimming whitespace, returns documents ordered by most recent
            update instead. LIKE wildcards in the query ('%', '_') are
            matched literally.
        page: Zero-based page index. Must be >= 0. Default: 0.
        page_size: Number of items per page. Must be >= 1. Default: 20.
        session: Database session (injected).
        user: Current authenticated user (injected).

    Returns:
        DocumentTitleSearchResponse: Lightweight list with has_more flag (no total count).

    Raises:
        HTTPException: 400 for invalid pagination values; whatever
            check_permission raises when access is denied; 500 for any
            other failure.
    """
    from sqlalchemy import desc, func, or_

    # A negative page would become a negative OFFSET (a database error reported
    # as a 500), and page_size < 1 makes LIMIT / has_more meaningless, so both
    # are rejected up front as client errors.
    # NOTE(review): page_size has no upper bound - consider capping it.
    if page < 0 or page_size < 1:
        raise HTTPException(
            status_code=400,
            detail="page must be >= 0 and page_size must be >= 1",
        )

    try:
        # Check permission for the search space
        await check_permission(
            session,
            user,
            search_space_id,
            Permission.DOCUMENTS_READ.value,
            "You don't have permission to read documents in this search space",
        )

        # Base query - only select lightweight fields
        query = select(
            Document.id,
            Document.title,
            Document.document_type,
        ).filter(Document.search_space_id == search_space_id)

        search_term = title.strip()

        if len(search_term) < 2:
            # Query too short to search on: return recent documents instead
            query = query.order_by(Document.updated_at.desc().nullslast())
        else:
            # Similarity threshold for fuzzy matching
            # Lower values = more fuzzy, higher values = stricter matching
            similarity_threshold = 0.3

            # Escape LIKE metacharacters so user input is matched literally.
            # The escape character itself must be escaped first, otherwise the
            # backslashes added for '%' and '_' would be doubled.
            escaped_term = (
                search_term.replace("\\", "\\\\")
                .replace("%", "\\%")
                .replace("_", "\\_")
            )

            # Match documents that either:
            # 1. Have high trigram similarity (fuzzy match - handles typos)
            # 2. Contain the exact substring (ILIKE - handles partial matches)
            query = query.filter(
                or_(
                    func.similarity(Document.title, search_term) > similarity_threshold,
                    Document.title.ilike(f"%{escaped_term}%", escape="\\"),
                )
            )

            # Order by similarity score (descending) for best relevance ranking
            query = query.order_by(
                desc(func.similarity(Document.title, search_term)),
                Document.title,  # Alphabetical tiebreaker
            )

        # Fetch page_size + 1 to determine has_more without COUNT query
        offset = page * page_size
        result = await session.execute(query.offset(offset).limit(page_size + 1))
        rows = result.all()

        # The extra row, if present, only signals that another page exists
        has_more = len(rows) > page_size
        items = rows[:page_size]

        # Convert to response format
        api_documents = [
            DocumentTitleRead(
                id=row.id,
                title=row.title,
                document_type=row.document_type,
            )
            for row in items
        ]

        return DocumentTitleSearchResponse(
            items=api_documents,
            has_more=has_more,
        )
    except HTTPException:
        # Pass through permission/validation errors unchanged
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Failed to search document titles: {e!s}"
        ) from e
@router.get("/documents/type-counts")
async def get_document_type_counts(
search_space_id: int | None = None,

View file

@ -4,6 +4,8 @@ from .documents import (
DocumentBase,
DocumentRead,
DocumentsCreate,
DocumentTitleRead,
DocumentTitleSearchResponse,
DocumentUpdate,
DocumentWithChunksRead,
ExtensionDocumentContent,
@ -85,6 +87,8 @@ __all__ = [
# Document schemas
"DocumentBase",
"DocumentRead",
"DocumentTitleRead",
"DocumentTitleSearchResponse",
"DocumentUpdate",
"DocumentWithChunksRead",
"DocumentsCreate",

View file

@ -67,3 +67,20 @@ class PaginatedResponse[T](BaseModel):
page: int
page_size: int
has_more: bool
class DocumentTitleRead(BaseModel):
    """Lightweight document response for mention picker - only essential fields."""

    # Integer identifier of the document.
    id: int
    # Title of the document.
    title: str
    # Type of the document, as a DocumentType value.
    document_type: DocumentType

    # from_attributes=True lets pydantic build this model from an object's
    # attributes rather than requiring a dict.
    model_config = ConfigDict(from_attributes=True)
class DocumentTitleSearchResponse(BaseModel):
    """Response for document title search - optimized for typeahead."""

    # The documents on the requested page.
    items: list[DocumentTitleRead]
    # Whether more results are available beyond this page.
    has_more: bool