fix(chunks): set position on remaining chunk insert paths

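The document_converters chunk builder, the GitHub indexer's size-fallback chunker, the revert_service restore paths, and the kb-persistence middleware now set an explicit `position` on every Chunk they insert. The middleware's snapshot read path also orders chunks by position, with id as a tiebreaker.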
2026-06-26 21:39:43 +02:00 · 2026-06-12 18:53:08 +02:00 · 2026-06-12 18:53:08 +02:00 · 5a71769dba
commit 5a71769dba
parent 7d55aaf2c1
4 changed files with 43 additions and 12 deletions
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/kb_persistence/middleware.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/kb_persistence/middleware.py
@ -241,8 +241,15 @@ async def _create_document(
        chunk_embeddings = await asyncio.to_thread(embed_texts, chunks)
        session.add_all(
            [
-                Chunk(document_id=doc.id, content=text, embedding=embedding)
+                Chunk(
-                for text, embedding in zip(chunks, chunk_embeddings, strict=True)
+                    document_id=doc.id,
                    content=text,
                    embedding=embedding,
                    position=i,
                )
                for i, (text, embedding) in enumerate(
                    zip(chunks, chunk_embeddings, strict=True)
                )
            ]
        )
    return doc
@ -289,8 +296,15 @@ async def _update_document(
        chunk_embeddings = await asyncio.to_thread(embed_texts, chunks)
        session.add_all(
            [
-                Chunk(document_id=document.id, content=text, embedding=embedding)
+                Chunk(
-                for text, embedding in zip(chunks, chunk_embeddings, strict=True)
+                    document_id=document.id,
                    content=text,
                    embedding=embedding,
                    position=i,
                )
                for i, (text, embedding) in enumerate(
                    zip(chunks, chunk_embeddings, strict=True)
                )
            ]
        )
    return document
@ -475,7 +489,9 @@ async def _load_chunks_for_snapshot(
    session: AsyncSession, *, doc_id: int
 ) -> list[dict[str, str]]:
    rows = await session.execute(
-        select(Chunk.content).where(Chunk.document_id == doc_id).order_by(Chunk.id)
+        select(Chunk.content)
        .where(Chunk.document_id == doc_id)
        .order_by(Chunk.position, Chunk.id)
    )
    return [{"content": row.content} for row in rows.all() if row.content is not None]
--- a/surfsense_backend/app/services/revert_service.py
+++ b/surfsense_backend/app/services/revert_service.py
@ -238,9 +238,14 @@ async def _restore_in_place_document(
            chunk_embeddings = await asyncio.to_thread(embed_texts, chunk_texts)
            session.add_all(
                [
-                    Chunk(document_id=doc.id, content=text, embedding=embedding)
+                    Chunk(
-                    for text, embedding in zip(
+                        document_id=doc.id,
-                        chunk_texts, chunk_embeddings, strict=True
+                        content=text,
                        embedding=embedding,
                        position=i,
                    )
                    for i, (text, embedding) in enumerate(
                        zip(chunk_texts, chunk_embeddings, strict=True)
                    )
                ]
            )
@ -336,8 +341,15 @@ async def _reinsert_document_from_revision(
        chunk_embeddings = await asyncio.to_thread(embed_texts, chunk_texts)
        session.add_all(
            [
-                Chunk(document_id=new_doc.id, content=text, embedding=embedding)
+                Chunk(
-                for text, embedding in zip(chunk_texts, chunk_embeddings, strict=True)
+                    document_id=new_doc.id,
                    content=text,
                    embedding=embedding,
                    position=i,
                )
                for i, (text, embedding) in enumerate(
                    zip(chunk_texts, chunk_embeddings, strict=True)
                )
            ]
        )
--- a/surfsense_backend/app/tasks/connector_indexers/github_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py
@ -525,6 +525,7 @@ async def _simple_chunk_content(content: str, chunk_size: int = 4000) -> list:
                Chunk(
                    content=chunk_text,
                    embedding=embed_text(chunk_text),
                    position=len(chunks),
                )
            )
--- a/surfsense_backend/app/utils/document_converters.py
+++ b/surfsense_backend/app/utils/document_converters.py
@ -188,8 +188,10 @@ async def create_document_chunks(content: str) -> list[Chunk]:
    chunk_texts = [c.text for c in config.chunker_instance.chunk(content)]
    chunk_embeddings = await asyncio.to_thread(embed_texts, chunk_texts)
    return [
-        Chunk(content=text, embedding=emb)
+        Chunk(content=text, embedding=emb, position=i)
-        for text, emb in zip(chunk_texts, chunk_embeddings, strict=False)
+        for i, (text, emb) in enumerate(
            zip(chunk_texts, chunk_embeddings, strict=False)
        )
    ]