mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-07-04 22:02:16 +02:00
Merge pull request #707 from AnishSarkar22/fix/mentions
feat: Revamped search in document mentions
This commit is contained in:
commit
26ef83fbaa
11 changed files with 560 additions and 168 deletions
|
|
@ -0,0 +1,76 @@
|
|||
"""Add pg_trgm indexes for efficient document title search
|
||||
|
||||
Revision ID: 67
|
||||
Revises: 66
|
||||
|
||||
Adds the pg_trgm extension and GIN trigram indexes on documents.title
|
||||
to enable efficient ILIKE searches with leading wildcards (e.g., '%search_term%').
|
||||
|
||||
Indexes added:
|
||||
1. idx_documents_title_trgm - GIN trigram on title for ILIKE '%term%'
|
||||
2. idx_documents_search_space_id - B-tree on search_space_id for filtering
|
||||
3. idx_documents_search_space_updated - Composite for recent docs query (covering index)
|
||||
4. idx_surfsense_docs_title_trgm - GIN trigram on surfsense docs title
|
||||
|
||||
"""
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
from alembic import op
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision: str = "67"
|
||||
down_revision: str | None = "66"
|
||||
branch_labels: str | Sequence[str] | None = None
|
||||
depends_on: str | Sequence[str] | None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Install pg_trgm and create the document-search indexes.

    The statements run in the order listed. Each one carries an
    ``IF NOT EXISTS`` guard, so objects that already exist are left alone.
    """
    statements = (
        # pg_trgm supplies the trigram operators and the gin_trgm_ops
        # operator class used by the GIN indexes below.
        "CREATE EXTENSION IF NOT EXISTS pg_trgm;",
        # 1. Trigram GIN index on documents.title for ILIKE '%term%' lookups.
        """
        CREATE INDEX IF NOT EXISTS idx_documents_title_trgm
        ON documents USING gin (title gin_trgm_ops);
        """,
        # 2. Plain B-tree on search_space_id; every query filters on it first.
        """
        CREATE INDEX IF NOT EXISTS idx_documents_search_space_id
        ON documents (search_space_id);
        """,
        # 3. Covering index for the "recent documents" query (no search term).
        #    id, title and document_type are INCLUDEd so PostgreSQL can
        #    answer it with an index-only scan.
        """
        CREATE INDEX IF NOT EXISTS idx_documents_search_space_updated
        ON documents (search_space_id, updated_at DESC NULLS LAST)
        INCLUDE (id, title, document_type);
        """,
        # 4. Trigram GIN index on surfsense_docs_documents.title.
        """
        CREATE INDEX IF NOT EXISTS idx_surfsense_docs_title_trgm
        ON surfsense_docs_documents USING gin (title gin_trgm_ops);
        """,
    )

    for statement in statements:
        op.execute(statement)
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Drop the document-search indexes; the pg_trgm extension is kept.

    Indexes are dropped in the reverse of the order in which ``upgrade``
    created them. ``IF EXISTS`` makes each drop a no-op when the index is
    already gone.
    """
    index_names = (
        "idx_surfsense_docs_title_trgm",
        "idx_documents_search_space_updated",
        "idx_documents_search_space_id",
        "idx_documents_title_trgm",
    )
    for index_name in index_names:
        op.execute(f"DROP INDEX IF EXISTS {index_name};")
|
||||
|
|
@ -1007,11 +1007,36 @@ async def setup_indexes():
|
|||
"CREATE INDEX IF NOT EXISTS chucks_search_index ON chunks USING gin (to_tsvector('english', content))"
|
||||
)
|
||||
)
|
||||
# pg_trgm indexes for efficient ILIKE '%term%' searches on titles
|
||||
# Critical for document mention picker (@mentions) to scale
|
||||
await conn.execute(
|
||||
text(
|
||||
"CREATE INDEX IF NOT EXISTS idx_documents_title_trgm ON documents USING gin (title gin_trgm_ops)"
|
||||
)
|
||||
)
|
||||
# B-tree index on search_space_id for fast filtering
|
||||
await conn.execute(
|
||||
text(
|
||||
"CREATE INDEX IF NOT EXISTS idx_documents_search_space_id ON documents (search_space_id)"
|
||||
)
|
||||
)
|
||||
# Covering index for "recent documents" query - enables index-only scan
|
||||
await conn.execute(
|
||||
text(
|
||||
"CREATE INDEX IF NOT EXISTS idx_documents_search_space_updated ON documents (search_space_id, updated_at DESC NULLS LAST) INCLUDE (id, title, document_type)"
|
||||
)
|
||||
)
|
||||
await conn.execute(
|
||||
text(
|
||||
"CREATE INDEX IF NOT EXISTS idx_surfsense_docs_title_trgm ON surfsense_docs_documents USING gin (title gin_trgm_ops)"
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
async def create_db_and_tables():
    """Bootstrap the database: extensions, then tables, then indexes.

    The extensions are created before ``Base.metadata.create_all`` runs, and
    ``setup_indexes`` is awaited last.
    """
    required_extensions = ("vector", "pg_trgm")

    async with engine.begin() as conn:
        for extension_name in required_extensions:
            await conn.execute(
                text(f"CREATE EXTENSION IF NOT EXISTS {extension_name}")
            )
        await conn.run_sync(Base.metadata.create_all)

    # NOTE(review): placed after the transaction block on the assumption that
    # setup_indexes() opens its own connection - confirm against the original
    # indentation.
    await setup_indexes()
|
||||
|
||||
|
|
|
|||
|
|
@ -19,6 +19,8 @@ from app.db import (
|
|||
from app.schemas import (
|
||||
DocumentRead,
|
||||
DocumentsCreate,
|
||||
DocumentTitleRead,
|
||||
DocumentTitleSearchResponse,
|
||||
DocumentUpdate,
|
||||
DocumentWithChunksRead,
|
||||
PaginatedResponse,
|
||||
|
|
@ -429,6 +431,112 @@ async def search_documents(
|
|||
) from e
|
||||
|
||||
|
||||
@router.get("/documents/search/titles", response_model=DocumentTitleSearchResponse)
async def search_document_titles(
    search_space_id: int,
    title: str = "",
    page: int = 0,
    page_size: int = 20,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """
    Lightweight document title search optimized for mention picker (@mentions).

    Returns only id, title, and document_type - no content or metadata.
    Uses pg_trgm fuzzy search with similarity scoring for typo tolerance.
    Results are ordered by relevance using trigram similarity scores.

    Args:
        search_space_id: The search space to search in. Required.
        title: Search query (case-insensitive). If empty or < 2 chars after
            stripping whitespace, returns recent documents. LIKE wildcards
            (``%``, ``_``) in the query are matched literally.
        page: Zero-based page index. Must be >= 0. Default: 0.
        page_size: Number of items per page. Must be >= 1. Default: 20.
        session: Database session (injected).
        user: Current authenticated user (injected).

    Returns:
        DocumentTitleSearchResponse: Lightweight list with has_more flag (no total count).

    Raises:
        HTTPException: 400 when ``page`` or ``page_size`` is out of range;
            500 when the search fails unexpectedly. Any HTTPException raised
            by the permission check is propagated unchanged.
    """
    from sqlalchemy import desc, func, or_

    # Reject bad pagination up front. A negative value would otherwise reach
    # the database as a negative OFFSET/LIMIT and surface as a 500, and
    # page_size=-1 would yield has_more=True with an empty item list.
    if page < 0 or page_size < 1:
        raise HTTPException(
            status_code=400,
            detail="page must be >= 0 and page_size must be >= 1",
        )

    try:
        # Check permission for the search space
        await check_permission(
            session,
            user,
            search_space_id,
            Permission.DOCUMENTS_READ.value,
            "You don't have permission to read documents in this search space",
        )

        # Base query - only select lightweight fields
        query = select(
            Document.id,
            Document.title,
            Document.document_type,
        ).filter(Document.search_space_id == search_space_id)

        search_term = title.strip()

        if len(search_term) < 2:
            # Query is too short to search on: return recent documents
            query = query.order_by(Document.updated_at.desc().nullslast())
        else:
            # Similarity threshold for fuzzy matching (0.3 = ~30% trigram overlap)
            # Lower values = more fuzzy, higher values = stricter matching
            similarity_threshold = 0.3

            # Escape LIKE metacharacters so user input such as "50%" or
            # "my_file" is matched literally instead of acting as a wildcard.
            # The backslash itself is escaped first so it cannot swallow the
            # escapes added afterwards.
            escaped_term = (
                search_term.replace("\\", "\\\\")
                .replace("%", "\\%")
                .replace("_", "\\_")
            )
            substring_pattern = f"%{escaped_term}%"

            # NOTE(review): per the pg_trgm docs, trigram indexes serve the
            # `%` operator and LIKE/ILIKE, not a similarity() function call,
            # so this predicate is likely evaluated per row - confirm with
            # EXPLAIN that this is acceptable at scale.
            title_similarity = func.similarity(Document.title, search_term)

            # Match documents that either:
            # 1. Have high trigram similarity (fuzzy match - handles typos)
            # 2. Contain the exact substring (ILIKE - handles partial matches)
            query = query.filter(
                or_(
                    title_similarity > similarity_threshold,
                    Document.title.ilike(substring_pattern, escape="\\"),
                )
            )

            # Best match first, then alphabetical. The trailing id key makes
            # the order total, so OFFSET pagination cannot repeat or skip
            # rows whose score and title are identical.
            query = query.order_by(
                desc(title_similarity),
                Document.title,
                Document.id,
            )

        # Fetch page_size + 1 to determine has_more without COUNT query
        offset = page * page_size
        result = await session.execute(query.offset(offset).limit(page_size + 1))
        rows = result.all()

        # The extra row, if present, only signals that another page exists
        has_more = len(rows) > page_size
        items = rows[:page_size]  # Only return requested page_size

        # Convert to response format
        api_documents = [
            DocumentTitleRead(
                id=row.id,
                title=row.title,
                document_type=row.document_type,
            )
            for row in items
        ]

        return DocumentTitleSearchResponse(
            items=api_documents,
            has_more=has_more,
        )

    except HTTPException:
        # Deliberate HTTP errors (e.g. from the permission check) pass through
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Failed to search document titles: {e!s}"
        ) from e
|
||||
|
||||
|
||||
@router.get("/documents/type-counts")
|
||||
async def get_document_type_counts(
|
||||
search_space_id: int | None = None,
|
||||
|
|
|
|||
|
|
@ -4,6 +4,8 @@ from .documents import (
|
|||
DocumentBase,
|
||||
DocumentRead,
|
||||
DocumentsCreate,
|
||||
DocumentTitleRead,
|
||||
DocumentTitleSearchResponse,
|
||||
DocumentUpdate,
|
||||
DocumentWithChunksRead,
|
||||
ExtensionDocumentContent,
|
||||
|
|
@ -85,6 +87,8 @@ __all__ = [
|
|||
# Document schemas
|
||||
"DocumentBase",
|
||||
"DocumentRead",
|
||||
"DocumentTitleRead",
|
||||
"DocumentTitleSearchResponse",
|
||||
"DocumentUpdate",
|
||||
"DocumentWithChunksRead",
|
||||
"DocumentsCreate",
|
||||
|
|
|
|||
|
|
@ -67,3 +67,20 @@ class PaginatedResponse[T](BaseModel):
|
|||
page: int
|
||||
page_size: int
|
||||
has_more: bool
|
||||
|
||||
|
||||
class DocumentTitleRead(BaseModel):
    """Lightweight document response for mention picker - only essential fields."""

    # Identifier of the document.
    id: int
    # Title text of the document.
    title: str
    # Kind of document, as a DocumentType value.
    document_type: DocumentType

    # from_attributes lets Pydantic build this model from objects that expose
    # the fields as attributes, not only from dicts.
    model_config = ConfigDict(from_attributes=True)
|
||||
|
||||
|
||||
class DocumentTitleSearchResponse(BaseModel):
    """Response for document title search - optimized for typeahead."""

    # Matching documents for the requested page.
    items: list[DocumentTitleRead]
    # Whether more results are available beyond this page.
    has_more: bool
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue