feat(chunks): add explicit position column with backfill migration

Chunk ids stop reflecting document order once incremental re-indexing keeps
unchanged rows across edits. Backfill preserves the historical id ordering
so behavior is identical on day one.
This commit is contained in:
CREDO23 2026-06-12 18:52:45 +02:00
parent 412493ae08
commit c6e71c851c
2 changed files with 58 additions and 1 deletions

View file

@ -0,0 +1,51 @@
"""add chunks.position for explicit document order
Incremental re-indexing keeps unchanged chunk rows, so auto-increment ids no
longer reflect document order. Backfill preserves the historical id ordering.
Revision ID: 162
Revises: 161
"""
from collections.abc import Sequence
from alembic import op
revision: str = "162"
down_revision: str | None = "161"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
    """Add ``chunks.position`` and backfill it from the historical id order.

    Steps, in order:

    1. Add the column as ``INTEGER NOT NULL DEFAULT 0``.
    2. Number each document's chunks from 0 by ascending ``id`` and store
       that number in ``position``.
    3. Create the ``position`` index and the ``(document_id, position)``
       index once the backfill has run.

    The DDL statements use ``IF NOT EXISTS``, so each one is a no-op when the
    column or index is already present.
    """
    op.execute(
        "ALTER TABLE chunks ADD COLUMN IF NOT EXISTS position INTEGER NOT NULL DEFAULT 0;"
    )

    # Backfill: document order so far has been the insertion order (id).
    # PostgreSQL writes a new row version for every row an UPDATE matches,
    # even when the assigned value is unchanged. The IS DISTINCT FROM filter
    # skips rows that already hold the computed value (e.g. the first chunk
    # of each document, which keeps the column default of 0), so only rows
    # that actually change are rewritten. The resulting table is the same.
    op.execute(
        """
        UPDATE chunks
        SET position = numbered.rn
        FROM (
            SELECT id,
                   ROW_NUMBER() OVER (PARTITION BY document_id ORDER BY id) - 1 AS rn
            FROM chunks
        ) AS numbered
        WHERE chunks.id = numbered.id
          AND chunks.position IS DISTINCT FROM numbered.rn;
        """
    )

    # Single-column index on position.
    op.execute(
        "CREATE INDEX IF NOT EXISTS ix_chunks_position ON chunks(position);"
    )
    # Composite index on (document_id, position).
    op.execute(
        "CREATE INDEX IF NOT EXISTS ix_chunks_document_id_position "
        "ON chunks(document_id, position);"
    )
def downgrade() -> None:
    """Revert the migration: drop both position indexes, then the column."""
    # Executed top to bottom; IF EXISTS makes each statement a no-op when the
    # index or column is already gone.
    teardown_statements = (
        "DROP INDEX IF EXISTS ix_chunks_document_id_position;",
        "DROP INDEX IF EXISTS ix_chunks_position;",
        "ALTER TABLE chunks DROP COLUMN IF EXISTS position;",
    )
    for statement in teardown_statements:
        op.execute(statement)

View file

@ -1484,7 +1484,10 @@ class Document(BaseModel, TimestampMixin):
created_by = relationship("User", back_populates="documents")
connector = relationship("SearchSourceConnector", back_populates="documents")
chunks = relationship(
"Chunk", back_populates="document", cascade="all, delete-orphan"
"Chunk",
back_populates="document",
cascade="all, delete-orphan",
order_by="Chunk.position",
)
# Original upload + future derived artifacts (redacted, filled-form).
# Model lives in app.file_storage.persistence to keep that feature cohesive.
@ -1520,6 +1523,9 @@ class Chunk(BaseModel, TimestampMixin):
content = Column(Text, nullable=False)
embedding = Column(Vector(config.embedding_model_instance.dimension))
# Explicit document order; ids don't follow it since incremental
# re-indexing keeps unchanged rows across edits.
position = Column(Integer, nullable=False, server_default="0", index=True)
document_id = Column(
Integer,