diff --git a/surfsense_backend/alembic/versions/162_add_chunk_position.py b/surfsense_backend/alembic/versions/162_add_chunk_position.py
new file mode 100644
index 000000000..cb240e3ef
--- /dev/null
+++ b/surfsense_backend/alembic/versions/162_add_chunk_position.py
@@ -0,0 +1,57 @@
+"""add chunks.position for explicit document order
+
+Incremental re-indexing keeps unchanged chunk rows, so auto-increment ids no
+longer reflect document order. Backfill preserves the historical id ordering.
+
+Revision ID: 162
+Revises: 161
+"""
+
+from collections.abc import Sequence
+
+from alembic import op
+
+revision: str = "162"
+down_revision: str | None = "161"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+def upgrade() -> None:
+    """Add ``chunks.position``, backfill it from id order, and index it."""
+    op.execute(
+        "ALTER TABLE chunks ADD COLUMN IF NOT EXISTS position INTEGER NOT NULL DEFAULT 0;"
+    )
+
+    # Backfill: document order so far has been the insertion order (id).
+    # Rows that already hold the right value (e.g. the first chunk of each
+    # document, which keeps the default 0) are skipped so the UPDATE does
+    # not rewrite them for nothing.
+    op.execute(
+        """
+        UPDATE chunks
+        SET position = numbered.rn
+        FROM (
+            SELECT id,
+                   ROW_NUMBER() OVER (PARTITION BY document_id ORDER BY id) - 1 AS rn
+            FROM chunks
+        ) AS numbered
+        WHERE chunks.id = numbered.id
+          AND chunks.position IS DISTINCT FROM numbered.rn;
+        """
+    )
+
+    op.execute(
+        "CREATE INDEX IF NOT EXISTS ix_chunks_position ON chunks(position);"
+    )
+    op.execute(
+        "CREATE INDEX IF NOT EXISTS ix_chunks_document_id_position "
+        "ON chunks(document_id, position);"
+    )
+
+
+def downgrade() -> None:
+    """Drop the position indexes, then the ``position`` column."""
+    op.execute("DROP INDEX IF EXISTS ix_chunks_document_id_position;")
+    op.execute("DROP INDEX IF EXISTS ix_chunks_position;")
+    op.execute("ALTER TABLE chunks DROP COLUMN IF EXISTS position;")
diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py
index 9ec13f4e2..8d110bbf1 100644
--- a/surfsense_backend/app/db.py
+++ b/surfsense_backend/app/db.py
@@ -1484,7 +1484,10 @@ class Document(BaseModel, TimestampMixin):
     created_by = relationship("User", back_populates="documents")
     connector = relationship("SearchSourceConnector", back_populates="documents")
     chunks = relationship(
-        "Chunk", back_populates="document", cascade="all, delete-orphan"
+        "Chunk",
+        back_populates="document",
+        cascade="all, delete-orphan",
+        order_by="Chunk.position",
     )
     # Original upload + future derived artifacts (redacted, filled-form).
     # Model lives in app.file_storage.persistence to keep that feature cohesive.
@@ -1520,6 +1523,9 @@ class Chunk(BaseModel, TimestampMixin):
 
     content = Column(Text, nullable=False)
     embedding = Column(Vector(config.embedding_model_instance.dimension))
+    # Explicit document order; ids don't follow it since incremental
+    # re-indexing keeps unchanged rows across edits.
+    position = Column(Integer, nullable=False, server_default="0", index=True)
 
     document_id = Column(
         Integer,