mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-24 21:38:09 +02:00
- Introduced a new `position` column in the `chunks` table to maintain explicit document order during re-indexing.
- Updated migration to add the column without backfilling historical rows to avoid performance issues on large tables.
- Adjusted the `Chunk` model to reflect the new column without indexing, as ordering reads are document-scoped.
49 lines
1.9 KiB
Python
49 lines
1.9 KiB
Python
"""add chunks.position for explicit document order

Incremental re-indexing keeps unchanged chunk rows, so auto-increment ids no
longer reflect document order. The ``position`` column makes that order
explicit and is written by the indexing pipeline for every new or re-indexed
document.

This migration intentionally does NOT backfill historical rows. On a large,
heavily-indexed table (notably a multi-hundred-GB HNSW embedding index) a bulk
UPDATE of every chunk becomes a non-HOT update that rewrites every secondary
index per row -- turning a one-off migration into a multi-day operation.
Instead, existing rows keep ``position = 0`` and therefore order by the
``Chunk.id`` tiebreaker (identical to the pre-feature behavior); new and
re-indexed documents get correct positions from application code, and any
document needing exact order can simply be re-indexed on demand.

Revision ID: 165
Revises: 164
"""
|
|
|
|
from collections.abc import Sequence
|
|
|
|
from alembic import op
|
|
|
|
# Alembic revision identifiers: this migration is revision "165" and is
# applied on top of revision "164".
revision: str = "165"
down_revision: str | None = "164"
# No branch labels and no cross-revision dependencies for this migration.
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None

# Leftover UNLOGGED scratch table from earlier backfill attempts; dropped here
# so re-running this migration converges the schema regardless of past state.
# Interpolated directly into the DROP TABLE statements in upgrade() and
# downgrade(), so it must stay a plain, trusted identifier.
SCRATCH_TABLE: str = "_chunk_position_backfill"
|
|
|
|
|
|
def upgrade() -> None:
    """Add ``chunks.position`` and remove the abandoned backfill scratch table.

    Both statements are guarded with ``IF [NOT] EXISTS`` so the migration can
    be re-run safely against a database that is already partially migrated.
    No existing rows are backfilled; they simply take the column default.
    """
    # A NOT NULL column with a constant default is a metadata-only change on
    # PostgreSQL 11+, so this stays fast even on very large tables. Where the
    # column is already present, IF NOT EXISTS turns the statement into a
    # no-op.
    add_position_sql = (
        "ALTER TABLE chunks ADD COLUMN IF NOT EXISTS position INTEGER NOT NULL DEFAULT 0;"
    )
    op.execute(add_position_sql)

    # Earlier backfill attempts may have left a scratch table behind; remove
    # it so the schema ends up the same regardless of past state.
    drop_scratch_sql = f"DROP TABLE IF EXISTS {SCRATCH_TABLE};"
    op.execute(drop_scratch_sql)
|
|
|
|
|
|
def downgrade() -> None:
    """Revert the migration by dropping the scratch table and ``chunks.position``.

    Each statement uses ``IF EXISTS``, so running the downgrade against a
    database where either object is already gone does not fail.
    """
    rollback_statements = (
        # Scratch table first, mirroring the cleanup done in upgrade().
        f"DROP TABLE IF EXISTS {SCRATCH_TABLE};",
        # Then remove the column this revision introduced.
        "ALTER TABLE chunks DROP COLUMN IF EXISTS position;",
    )
    for statement in rollback_statements:
        op.execute(statement)
|