refactor(chunks): order chunk reads by (document_id, position)

Presentation and citation ordering now uses the explicit Chunk.position column instead of Chunk.id / Chunk.created_at, with Chunk.id kept as a tiebreaker. The order_by clauses used for vector and ts_rank ranking are untouched.
2026-06-12 20:45:20 +02:00 · 2026-06-12 18:53:21 +02:00 · 2026-06-12 18:53:21 +02:00 · 052e9ef4d1
commit 052e9ef4d1
parent 5a71769dba
9 changed files with 28 additions and 19 deletions
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/kb_postgres.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/kb_postgres.py
@@ -508,7 +508,7 @@ class KBPostgresBackend(BackendProtocol):
            chunk_rows = await session.execute(
                select(Chunk.id, Chunk.content)
                .where(Chunk.document_id == document.id)
-                .order_by(Chunk.id)
+                .order_by(Chunk.position, Chunk.id)
            )
            chunks = [
                {"chunk_id": row.id, "content": row.content} for row in chunk_rows.all()
@@ -725,7 +725,7 @@ class KBPostgresBackend(BackendProtocol):
                        .join(Document, Document.id == Chunk.document_id)
                        .where(Document.search_space_id == self.search_space_id)
                        .where(Chunk.content.ilike(f"%{pattern}%"))
-                        .order_by(Chunk.document_id, Chunk.id)
+                        .order_by(Chunk.document_id, Chunk.position, Chunk.id)
                    )
                    chunk_rows = await session.execute(sub)
                    per_doc: dict[int, int] = {}
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/knowledge_search.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/knowledge_search.py
@@ -394,7 +394,10 @@ async def browse_recent_documents(
                Chunk.document_id,
                Chunk.content,
                func.row_number()
-                .over(partition_by=Chunk.document_id, order_by=Chunk.id)
+                .over(
+                    partition_by=Chunk.document_id,
+                    order_by=(Chunk.position, Chunk.id),
+                )
                .label("rn"),
            )
            .where(Chunk.document_id.in_(doc_ids))
@@ -404,7 +407,7 @@ async def browse_recent_documents(
        chunk_query = (
            select(numbered.c.chunk_id, numbered.c.document_id, numbered.c.content)
            .where(numbered.c.rn <= _RECENCY_MAX_CHUNKS_PER_DOC)
-            .order_by(numbered.c.document_id, numbered.c.chunk_id)
+            .order_by(numbered.c.document_id, numbered.c.rn)
        )
        chunk_result = await session.execute(chunk_query)
        fetched_chunks = chunk_result.all()
@@ -531,7 +534,7 @@ async def fetch_mentioned_documents(
        chunk_result = await session.execute(
            select(Chunk.id, Chunk.content, Chunk.document_id)
            .where(Chunk.document_id.in_(list(docs.keys())))
-            .order_by(Chunk.document_id, Chunk.id)
+            .order_by(Chunk.document_id, Chunk.position, Chunk.id)
        )
        chunks_by_doc: dict[int, list[dict[str, Any]]] = {doc_id: [] for doc_id in docs}
        for row in chunk_result.all():
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/deliverables/tools/knowledge_base.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/deliverables/tools/knowledge_base.py
@@ -122,7 +122,7 @@ async def _browse_recent_documents(
        chunk_query = (
            select(Chunk)
            .where(Chunk.document_id.in_(doc_ids))
-            .order_by(Chunk.document_id, Chunk.id)
+            .order_by(Chunk.document_id, Chunk.position, Chunk.id)
        )
        chunk_result = await session.execute(chunk_query)
        raw_chunks = chunk_result.scalars().all()
--- a/surfsense_backend/app/retriever/chunks_hybrid_search.py
+++ b/surfsense_backend/app/retriever/chunks_hybrid_search.py
@@ -420,7 +420,10 @@ class ChucksHybridSearchRetriever:
            select(
                Chunk.id.label("chunk_id"),
                func.row_number()
-                .over(partition_by=Chunk.document_id, order_by=Chunk.id)
+                .over(
+                    partition_by=Chunk.document_id,
+                    order_by=(Chunk.position, Chunk.id),
+                )
                .label("rn"),
            )
            .where(Chunk.document_id.in_(doc_ids))
@@ -441,7 +444,7 @@ class ChucksHybridSearchRetriever:
            select(Chunk.id, Chunk.content, Chunk.document_id)
            .join(numbered, Chunk.id == numbered.c.chunk_id)
            .where(chunk_filter)
-            .order_by(Chunk.document_id, Chunk.id)
+            .order_by(Chunk.document_id, Chunk.position, Chunk.id)
        )

        t_fetch = time.perf_counter()
--- a/surfsense_backend/app/retriever/documents_hybrid_search.py
+++ b/surfsense_backend/app/retriever/documents_hybrid_search.py
@@ -357,7 +357,10 @@ class DocumentHybridSearchRetriever:
            select(
                Chunk.id.label("chunk_id"),
                func.row_number()
-                .over(partition_by=Chunk.document_id, order_by=Chunk.id)
+                .over(
+                    partition_by=Chunk.document_id,
+                    order_by=(Chunk.position, Chunk.id),
+                )
                .label("rn"),
            )
            .where(Chunk.document_id.in_(doc_ids))
@@ -369,7 +372,7 @@ class DocumentHybridSearchRetriever:
            select(Chunk.id, Chunk.content, Chunk.document_id)
            .join(numbered, Chunk.id == numbered.c.chunk_id)
            .where(numbered.c.rn <= _MAX_FETCH_CHUNKS_PER_DOC)
-            .order_by(Chunk.document_id, Chunk.id)
+            .order_by(Chunk.document_id, Chunk.position, Chunk.id)
        )

        t_fetch = time.perf_counter()
--- a/surfsense_backend/app/routes/documents_routes.py
+++ b/surfsense_backend/app/routes/documents_routes.py
@@ -1014,8 +1014,8 @@ async def get_document_by_chunk_id(
            .filter(
                Chunk.document_id == document.id,
                or_(
-                    Chunk.created_at < chunk.created_at,
-                    and_(Chunk.created_at == chunk.created_at, Chunk.id < chunk.id),
+                    Chunk.position < chunk.position,
+                    and_(Chunk.position == chunk.position, Chunk.id < chunk.id),
                ),
            )
        )
@@ -1027,7 +1027,7 @@ async def get_document_by_chunk_id(
        windowed_result = await session.execute(
            select(Chunk)
            .filter(Chunk.document_id == document.id)
-            .order_by(Chunk.created_at, Chunk.id)
+            .order_by(Chunk.position, Chunk.id)
            .offset(start)
            .limit(end - start)
        )
@@ -1137,7 +1137,7 @@ async def get_document_chunks_paginated(
        chunks_result = await session.execute(
            select(Chunk)
            .filter(Chunk.document_id == document_id)
-            .order_by(Chunk.created_at, Chunk.id)
+            .order_by(Chunk.position, Chunk.id)
            .offset(offset)
            .limit(page_size)
        )
--- a/surfsense_backend/app/routes/editor_routes.py
+++ b/surfsense_backend/app/routes/editor_routes.py
@@ -119,7 +119,7 @@ async def get_editor_content(
    chunk_contents_result = await session.execute(
        select(Chunk.content)
        .filter(Chunk.document_id == document_id)
-        .order_by(Chunk.id)
+        .order_by(Chunk.position, Chunk.id)
    )
    chunk_contents = chunk_contents_result.scalars().all()

@@ -205,7 +205,7 @@ async def download_document_markdown(
        chunk_contents_result = await session.execute(
            select(Chunk.content)
            .filter(Chunk.document_id == document_id)
-            .order_by(Chunk.id)
+            .order_by(Chunk.position, Chunk.id)
        )
        chunk_contents = chunk_contents_result.scalars().all()
        if chunk_contents:
@@ -354,7 +354,7 @@ async def export_document(
        chunk_contents_result = await session.execute(
            select(Chunk.content)
            .filter(Chunk.document_id == document_id)
-            .order_by(Chunk.id)
+            .order_by(Chunk.position, Chunk.id)
        )
        chunk_contents = chunk_contents_result.scalars().all()
        if chunk_contents:
--- a/surfsense_backend/app/services/ai_file_sort_service.py
+++ b/surfsense_backend/app/services/ai_file_sort_service.py
@@ -156,7 +156,7 @@ async def _resolve_document_text(
    stmt = (
        select(Chunk.content)
        .where(Chunk.document_id == document.id)
-        .order_by(Chunk.id)
+        .order_by(Chunk.position, Chunk.id)
        .limit(_MAX_CHUNKS_FOR_CONTEXT)
    )
    result = await session.execute(stmt)
--- a/surfsense_backend/app/services/export_service.py
+++ b/surfsense_backend/app/services/export_service.py
@@ -62,7 +62,7 @@ async def _get_document_markdown(
    chunk_result = await session.execute(
        select(Chunk.content)
        .filter(Chunk.document_id == document.id)
-        .order_by(Chunk.id)
+        .order_by(Chunk.position, Chunk.id)
    )
    chunks = chunk_result.scalars().all()
    if chunks: