From 5a71769dba8b678baba29bfef0a7fc0c35d7cdd4 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 18:53:08 +0200 Subject: [PATCH] fix(chunks): set position on remaining chunk insert paths document_converters, the github size-fallback chunker, revert_service restores, and the kb-persistence middleware now write explicit positions (the middleware read path also orders by position). --- .../middleware/kb_persistence/middleware.py | 26 +++++++++++++++---- .../app/services/revert_service.py | 22 ++++++++++++---- .../connector_indexers/github_indexer.py | 1 + .../app/utils/document_converters.py | 6 +++-- 4 files changed, 43 insertions(+), 12 deletions(-) diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/kb_persistence/middleware.py b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/kb_persistence/middleware.py index ef86eaddd..a6c83a7d4 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/kb_persistence/middleware.py +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/kb_persistence/middleware.py @@ -241,8 +241,15 @@ async def _create_document( chunk_embeddings = await asyncio.to_thread(embed_texts, chunks) session.add_all( [ - Chunk(document_id=doc.id, content=text, embedding=embedding) - for text, embedding in zip(chunks, chunk_embeddings, strict=True) + Chunk( + document_id=doc.id, + content=text, + embedding=embedding, + position=i, + ) + for i, (text, embedding) in enumerate( + zip(chunks, chunk_embeddings, strict=True) + ) ] ) return doc @@ -289,8 +296,15 @@ async def _update_document( chunk_embeddings = await asyncio.to_thread(embed_texts, chunks) session.add_all( [ - Chunk(document_id=document.id, content=text, embedding=embedding) - for text, embedding in zip(chunks, chunk_embeddings, strict=True) + Chunk( + document_id=document.id, + content=text, + embedding=embedding, + position=i, + ) + for i, (text, embedding) in enumerate( + zip(chunks, chunk_embeddings, strict=True) + ) ] ) return document @@ -475,7 +489,9 @@ async def _load_chunks_for_snapshot( session: AsyncSession, *, doc_id: int ) -> list[dict[str, str]]: rows = await session.execute( - select(Chunk.content).where(Chunk.document_id == doc_id).order_by(Chunk.id) + select(Chunk.content) + .where(Chunk.document_id == doc_id) + .order_by(Chunk.position, Chunk.id) ) return [{"content": row.content} for row in rows.all() if row.content is not None] diff --git a/surfsense_backend/app/services/revert_service.py b/surfsense_backend/app/services/revert_service.py index 6db5e2604..0cb6cd092 100644 --- a/surfsense_backend/app/services/revert_service.py +++ b/surfsense_backend/app/services/revert_service.py @@ -238,9 +238,14 @@ async def _restore_in_place_document( chunk_embeddings = await asyncio.to_thread(embed_texts, chunk_texts) session.add_all( [ - Chunk(document_id=doc.id, content=text, embedding=embedding) - for text, embedding in zip( - chunk_texts, chunk_embeddings, strict=True + Chunk( + document_id=doc.id, + content=text, + embedding=embedding, + position=i, + ) + for i, (text, embedding) in enumerate( + zip(chunk_texts, chunk_embeddings, strict=True) ) ] ) @@ -336,8 +341,15 @@ async def _reinsert_document_from_revision( chunk_embeddings = await asyncio.to_thread(embed_texts, chunk_texts) session.add_all( [ - Chunk(document_id=new_doc.id, content=text, embedding=embedding) - for text, embedding in zip(chunk_texts, chunk_embeddings, strict=True) + Chunk( + document_id=new_doc.id, + content=text, + embedding=embedding, + position=i, + ) + for i, (text, embedding) in enumerate( + zip(chunk_texts, chunk_embeddings, strict=True) + ) ] ) diff --git a/surfsense_backend/app/tasks/connector_indexers/github_indexer.py b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py index ce9b80e5e..557c2ce71 100644 --- a/surfsense_backend/app/tasks/connector_indexers/github_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py @@ -525,6 +525,7 @@ async def _simple_chunk_content(content: str, chunk_size: int = 4000) -> list: Chunk( content=chunk_text, embedding=embed_text(chunk_text), + position=len(chunks), ) ) diff --git a/surfsense_backend/app/utils/document_converters.py b/surfsense_backend/app/utils/document_converters.py index 694ae22ac..fef51d692 100644 --- a/surfsense_backend/app/utils/document_converters.py +++ b/surfsense_backend/app/utils/document_converters.py @@ -188,8 +188,10 @@ async def create_document_chunks(content: str) -> list[Chunk]: chunk_texts = [c.text for c in config.chunker_instance.chunk(content)] chunk_embeddings = await asyncio.to_thread(embed_texts, chunk_texts) return [ - Chunk(content=text, embedding=emb) - for text, emb in zip(chunk_texts, chunk_embeddings, strict=False) + Chunk(content=text, embedding=emb, position=i) + for i, (text, emb) in enumerate( + zip(chunk_texts, chunk_embeddings, strict=False) + ) ]