SurfSense/surfsense_backend/alembic/versions/165_add_chunk_position.py

50 lines
1.9 KiB
Python
Raw Normal View History

"""add chunks.position for explicit document order
Incremental re-indexing keeps unchanged chunk rows, so auto-increment ids no
longer reflect document order. The ``position`` column makes that order
explicit and is written by the indexing pipeline for every new or re-indexed
document.
This migration intentionally does NOT backfill historical rows. On a large,
heavily-indexed table (notably a multi-hundred-GB HNSW embedding index) a bulk
UPDATE of every chunk becomes a non-HOT update that rewrites every secondary
index per row -- turning a one-off migration into a multi-day operation.
Instead, existing rows keep ``position = 0`` and therefore order by the
``Chunk.id`` tiebreaker (identical to the pre-feature behavior); new and
re-indexed documents get correct positions from application code, and any
document needing exact order can simply be re-indexed on demand.
Revision ID: 165
Revises: 164
"""
from collections.abc import Sequence
from alembic import op
revision: str = "165"
down_revision: str | None = "164"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
# Leftover UNLOGGED scratch table from earlier backfill attempts; dropped here
# so re-running this migration converges the schema regardless of past state.
SCRATCH_TABLE = "_chunk_position_backfill"
def upgrade() -> None:
# Adding a NOT NULL column with a constant default is metadata-only on
# PostgreSQL 11+, so this is fast even on very large tables. IF NOT EXISTS
# makes it a no-op where the column already exists.
op.execute(
"ALTER TABLE chunks ADD COLUMN IF NOT EXISTS position INTEGER NOT NULL DEFAULT 0;"
)
# Clean up the scratch table left behind by the abandoned backfill approach.
op.execute(f"DROP TABLE IF EXISTS {SCRATCH_TABLE};")
def downgrade() -> None:
op.execute(f"DROP TABLE IF EXISTS {SCRATCH_TABLE};")
op.execute("ALTER TABLE chunks DROP COLUMN IF EXISTS position;")