mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-12 20:45:20 +02:00
feat(chunks): add explicit position column with backfill migration
Chunk ids stop reflecting document order once incremental re-indexing keeps unchanged rows across edits. Backfill preserves the historical id ordering so behavior is identical on day one.
This commit is contained in:
parent
412493ae08
commit
c6e71c851c
2 changed files with 58 additions and 1 deletion
51
surfsense_backend/alembic/versions/162_add_chunk_position.py
Normal file
51
surfsense_backend/alembic/versions/162_add_chunk_position.py
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
"""add chunks.position for explicit document order
|
||||
|
||||
Incremental re-indexing keeps unchanged chunk rows, so auto-increment ids no
|
||||
longer reflect document order. Backfill preserves the historical id ordering.
|
||||
|
||||
Revision ID: 162
|
||||
Revises: 161
|
||||
"""
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
from alembic import op
|
||||
|
||||
revision: str = "162"
|
||||
down_revision: str | None = "161"
|
||||
branch_labels: str | Sequence[str] | None = None
|
||||
depends_on: str | Sequence[str] | None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Add ``chunks.position`` and backfill it from the existing id order.

    Steps, in order:

    1. Add ``position`` as ``INTEGER NOT NULL DEFAULT 0``.
    2. Number each document's chunks from 0 in ascending ``id`` order and
       store that number in ``position``.
    3. Create a single-column index on ``position`` and a composite index on
       ``(document_id, position)``.

    The DDL statements use ``IF NOT EXISTS``, so a column or index that is
    already present is left as-is.
    """
    op.execute(
        "ALTER TABLE chunks ADD COLUMN IF NOT EXISTS position INTEGER NOT NULL DEFAULT 0;"
    )

    # Backfill: document order so far has been the insertion order (id).
    # ROW_NUMBER() starts at 1, hence the "- 1" to make positions 0-based.
    op.execute(
        """
        UPDATE chunks
        SET position = numbered.rn
        FROM (
            SELECT id,
                   ROW_NUMBER() OVER (PARTITION BY document_id ORDER BY id) - 1 AS rn
            FROM chunks
        ) AS numbered
        WHERE chunks.id = numbered.id;
        """
    )

    op.execute(
        "CREATE INDEX IF NOT EXISTS ix_chunks_position ON chunks(position);"
    )
    op.execute(
        "CREATE INDEX IF NOT EXISTS ix_chunks_document_id_position "
        "ON chunks(document_id, position);"
    )
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Drop both position indexes, then the ``chunks.position`` column.

    Each statement uses ``IF EXISTS``, so an index or column that is already
    gone is skipped rather than raising.
    """
    op.execute("DROP INDEX IF EXISTS ix_chunks_document_id_position;")
    op.execute("DROP INDEX IF EXISTS ix_chunks_position;")
    op.execute("ALTER TABLE chunks DROP COLUMN IF EXISTS position;")
|
||||
|
|
@ -1484,7 +1484,10 @@ class Document(BaseModel, TimestampMixin):
|
|||
created_by = relationship("User", back_populates="documents")
|
||||
connector = relationship("SearchSourceConnector", back_populates="documents")
|
||||
chunks = relationship(
|
||||
"Chunk", back_populates="document", cascade="all, delete-orphan"
|
||||
"Chunk",
|
||||
back_populates="document",
|
||||
cascade="all, delete-orphan",
|
||||
order_by="Chunk.position",
|
||||
)
|
||||
# Original upload + future derived artifacts (redacted, filled-form).
|
||||
# Model lives in app.file_storage.persistence to keep that feature cohesive.
|
||||
|
|
@ -1520,6 +1523,9 @@ class Chunk(BaseModel, TimestampMixin):
|
|||
|
||||
content = Column(Text, nullable=False)
|
||||
embedding = Column(Vector(config.embedding_model_instance.dimension))
|
||||
# Explicit document order; ids don't follow it since incremental
|
||||
# re-indexing keeps unchanged rows across edits.
|
||||
position = Column(Integer, nullable=False, server_default="0", index=True)
|
||||
|
||||
document_id = Column(
|
||||
Integer,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue