perf: add missing index on chunks.document_id for faster search retrieval

2026-06-20 21:18:13 +02:00 · 2026-03-09 21:16:53 +02:00 · 2026-03-09 21:16:53 +02:00 · fffef4cb5e
commit fffef4cb5e
parent 6eabfe2396
2 changed files with 42 additions and 1 deletions
--- a/surfsense_backend/alembic/versions/104_add_chunks_document_id_index.py
+++ b/surfsense_backend/alembic/versions/104_add_chunks_document_id_index.py
@ -0,0 +1,41 @@
 """104_add_chunks_document_id_index
 Revision ID: 104
 Revises: 103
 Create Date: 2026-03-09
 Adds a B-tree index on chunks.document_id to speed up chunk lookups
 during hybrid search (both retrievers fetch chunks by document_id
 after RRF ranking selects the top documents).
 """
 from __future__ import annotations
 from collections.abc import Sequence
 from alembic import op
 revision: str = "104"
 down_revision: str | None = "103"
 branch_labels: str | Sequence[str] | None = None
 depends_on: str | Sequence[str] | None = None
 def upgrade() -> None:
    """Create the ``ix_chunks_document_id`` index on ``chunks(document_id)``.

    The statement is idempotent: if an index with that name already exists
    the database skips creation, so re-running the migration is safe.
    """
    # Use PostgreSQL's built-in IF NOT EXISTS guard rather than a manual
    # pg_indexes lookup: a lookup that does not filter on schemaname can
    # match a same-named table/index pair in another schema and skip the
    # index creation here. This also mirrors the DROP INDEX IF EXISTS
    # used by downgrade().
    op.execute(
        "CREATE INDEX IF NOT EXISTS ix_chunks_document_id ON chunks(document_id)"
    )
 def downgrade() -> None:
    """Drop the ``ix_chunks_document_id`` index; a no-op when it is absent."""
    drop_index_sql = "DROP INDEX IF EXISTS ix_chunks_document_id"
    op.execute(drop_index_sql)
--- a/surfsense_backend/app/db.py
+++ b/surfsense_backend/app/db.py
@ -959,7 +959,7 @@ class Chunk(BaseModel, TimestampMixin):
    embedding = Column(Vector(config.embedding_model_instance.dimension))
    document_id = Column(
-        Integer, ForeignKey("documents.id", ondelete="CASCADE"), nullable=False
+        Integer, ForeignKey("documents.id", ondelete="CASCADE"), nullable=False, index=True
    )
    document = relationship("Document", back_populates="chunks")