From b0a0eb7f9c5714c95dedaeab90d1448debb05890 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Thu, 18 Jun 2026 19:23:49 +0200
Subject: [PATCH 01/47] fix: editor routes serve source_markdown only, never
 rebuild from chunks

---
 surfsense_backend/app/routes/editor_routes.py | 105 ++++++------------
 1 file changed, 37 insertions(+), 68 deletions(-)

diff --git a/surfsense_backend/app/routes/editor_routes.py b/surfsense_backend/app/routes/editor_routes.py
index 8250fff98..db46e4ee0 100644
--- a/surfsense_backend/app/routes/editor_routes.py
+++ b/surfsense_backend/app/routes/editor_routes.py
@@ -42,6 +42,34 @@ EDITOR_PLATE_MAX_BYTES = 1 * 1024 * 1024
 EDITOR_PLATE_MAX_LINES = 5000
 
 
+def _raise_no_canonical_body(document: Document) -> None:
+    """Translate a missing source_markdown into a status-aware HTTP error."""
+    doc_status = document.status or {}
+    state = (
+        doc_status.get("state", "ready") if isinstance(doc_status, dict) else "ready"
+    )
+
+    if state in ("pending", "processing"):
+        raise HTTPException(
+            status_code=409,
+            detail="This document is still being processed. Please wait a moment and try again.",
+        )
+    if state == "failed":
+        reason = (
+            doc_status.get("reason", "Unknown error")
+            if isinstance(doc_status, dict)
+            else "Unknown error"
+        )
+        raise HTTPException(
+            status_code=422,
+            detail=f"Processing failed: {reason}. You can delete this document and re-upload it.",
+        )
+    raise HTTPException(
+        status_code=400,
+        detail="This document has no editable content. It may not have been processed correctly. Try re-indexing or re-uploading it.",
+    )
+
+
 @router.get("/search-spaces/{search_space_id}/documents/{document_id}/editor-content")
 async def get_editor_content(
     search_space_id: int,
@@ -52,8 +80,9 @@ async def get_editor_content(
     """
     Get document content for editing.
 
-    Returns source_markdown for the Plate.js editor.
-    Falls back to blocknote_document → markdown conversion, then chunk reconstruction.
+    Returns source_markdown (the canonical body) for the Plate.js editor, with a
+    one-time migration from legacy blocknote_document. Never reconstructs the
+    body from chunks.
 
     Requires DOCUMENTS_READ permission.
     """
@@ -123,52 +152,9 @@ async def get_editor_content(
         await session.commit()
         return _build_response(empty_markdown)
 
-    chunk_contents_result = await session.execute(
-        select(Chunk.content)
-        .filter(Chunk.document_id == document_id)
-        .order_by(Chunk.position, Chunk.id)
-    )
-    chunk_contents = chunk_contents_result.scalars().all()
-
-    if not chunk_contents:
-        doc_status = document.status or {}
-        state = (
-            doc_status.get("state", "ready")
-            if isinstance(doc_status, dict)
-            else "ready"
-        )
-        if state in ("pending", "processing"):
-            raise HTTPException(
-                status_code=409,
-                detail="This document is still being processed. Please wait a moment and try again.",
-            )
-        if state == "failed":
-            reason = (
-                doc_status.get("reason", "Unknown error")
-                if isinstance(doc_status, dict)
-                else "Unknown error"
-            )
-            raise HTTPException(
-                status_code=422,
-                detail=f"Processing failed: {reason}. You can delete this document and re-upload it.",
-            )
-        raise HTTPException(
-            status_code=400,
-            detail="This document has no content. It may not have been processed correctly. Try deleting and re-uploading it.",
-        )
-
-    markdown_content = "\n\n".join(chunk_contents)
-
-    if not markdown_content.strip():
-        raise HTTPException(
-            status_code=400,
-            detail="This document appears to be empty. Try re-uploading or editing it to add content.",
-        )
-
-    document.source_markdown = markdown_content
-    await session.commit()
-
-    return _build_response(markdown_content)
+    # No canonical body. Chunks are an index artifact, never the source of
+    # truth, so surface the processing state instead of rebuilding from them.
+    _raise_no_canonical_body(document)
 
 
 @router.get(
@@ -181,8 +167,9 @@ async def download_document_markdown(
     user: User = Depends(current_active_user),
 ):
     """
-    Download the full document content as a .md file.
-    Reconstructs markdown from source_markdown or chunks.
+    Download the canonical document body as a .md file.
+
+    Serves source_markdown, migrating legacy blocknote_document when present.
     """
     await check_permission(
         session,
@@ -208,15 +195,6 @@ async def download_document_markdown(
         from app.utils.blocknote_to_markdown import blocknote_to_markdown
 
         markdown = blocknote_to_markdown(document.blocknote_document)
-    if markdown is None:
-        chunk_contents_result = await session.execute(
-            select(Chunk.content)
-            .filter(Chunk.document_id == document_id)
-            .order_by(Chunk.position, Chunk.id)
-        )
-        chunk_contents = chunk_contents_result.scalars().all()
-        if chunk_contents:
-            markdown = "\n\n".join(chunk_contents)
 
     if not markdown or not markdown.strip():
         raise HTTPException(
@@ -357,15 +335,6 @@ async def export_document(
         from app.utils.blocknote_to_markdown import blocknote_to_markdown
 
         markdown_content = blocknote_to_markdown(document.blocknote_document)
-    if markdown_content is None:
-        chunk_contents_result = await session.execute(
-            select(Chunk.content)
-            .filter(Chunk.document_id == document_id)
-            .order_by(Chunk.position, Chunk.id)
-        )
-        chunk_contents = chunk_contents_result.scalars().all()
-        if chunk_contents:
-            markdown_content = "\n\n".join(chunk_contents)
 
     if not markdown_content or not markdown_content.strip():
         raise HTTPException(status_code=400, detail="Document has no content to export")

From b4468976384ba0a9938fb635841e9f58746b6062 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Thu, 18 Jun 2026 19:23:49 +0200
Subject: [PATCH 02/47] test: editor read paths never reconstruct body from
 chunks

---
 .../tests/integration/test_editor_routes.py   | 175 ++++++++++++++++++
 1 file changed, 175 insertions(+)
 create mode 100644 surfsense_backend/tests/integration/test_editor_routes.py

diff --git a/surfsense_backend/tests/integration/test_editor_routes.py b/surfsense_backend/tests/integration/test_editor_routes.py
new file mode 100644
index 000000000..382d4b4de
--- /dev/null
+++ b/surfsense_backend/tests/integration/test_editor_routes.py
@@ -0,0 +1,175 @@
+"""Phase A contract: editor read paths serve source_markdown and never
+reconstruct or mutate the body from chunks."""
+
+import pytest
+import pytest_asyncio
+from fastapi import HTTPException
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db import (
+    Chunk,
+    Document,
+    DocumentStatus,
+    DocumentType,
+    SearchSpace,
+    User,
+)
+
+pytestmark = pytest.mark.integration
+
+
+async def _make_document(
+    session: AsyncSession,
+    search_space: SearchSpace,
+    user: User,
+    *,
+    document_type: DocumentType = DocumentType.FILE,
+    source_markdown: str | None = "# Title\n\nBody line.",
+    content: str = "Body line.",
+    status: dict | None = None,
+) -> Document:
+    doc = Document(
+        title="Doc",
+        document_type=document_type,
+        document_metadata={},
+        content=content,
+        content_hash="hash-001",
+        source_markdown=source_markdown,
+        search_space_id=search_space.id,
+        created_by_id=user.id,
+        status=status or DocumentStatus.ready(),
+    )
+    session.add(doc)
+    await session.flush()
+    return doc
+
+
+async def _add_chunks(session: AsyncSession, document: Document, texts: list[str]):
+    for position, text in enumerate(texts):
+        session.add(Chunk(content=text, position=position, document_id=document.id))
+    await session.flush()
+
+
+@pytest_asyncio.fixture
+async def make_document(db_session, db_search_space, db_user):
+    async def _make(**overrides):
+        return await _make_document(db_session, db_search_space, db_user, **overrides)
+
+    return _make
+
+
+class TestGetEditorContent:
+    async def test_returns_source_markdown_verbatim(
+        self, db_session, db_search_space, db_user, make_document
+    ):
+        from app.routes.editor_routes import get_editor_content
+
+        doc = await make_document(source_markdown="# Real\n\nCanonical body.")
+
+        result = await get_editor_content(
+            db_search_space.id, doc.id, session=db_session, user=db_user
+        )
+
+        assert result["source_markdown"] == "# Real\n\nCanonical body."
+
+    async def test_does_not_reconstruct_body_from_chunks(
+        self, db_session, db_search_space, db_user, make_document
+    ):
+        """A ready document without source_markdown must not be rebuilt from chunks."""
+        from app.routes.editor_routes import get_editor_content
+
+        doc = await make_document(source_markdown=None)
+        await _add_chunks(db_session, doc, ["chunk one", "chunk two"])
+
+        with pytest.raises(HTTPException) as exc:
+            await get_editor_content(
+                db_search_space.id, doc.id, session=db_session, user=db_user
+            )
+
+        assert exc.value.status_code == 400
+        await db_session.refresh(doc)
+        assert doc.source_markdown is None
+
+    async def test_processing_document_without_body_returns_409(
+        self, db_session, db_search_space, db_user, make_document
+    ):
+        from app.routes.editor_routes import get_editor_content
+
+        doc = await make_document(
+            source_markdown=None, status=DocumentStatus.processing()
+        )
+
+        with pytest.raises(HTTPException) as exc:
+            await get_editor_content(
+                db_search_space.id, doc.id, session=db_session, user=db_user
+            )
+
+        assert exc.value.status_code == 409
+
+    async def test_failed_document_without_body_returns_422(
+        self, db_session, db_search_space, db_user, make_document
+    ):
+        from app.routes.editor_routes import get_editor_content
+
+        doc = await make_document(
+            source_markdown=None, status=DocumentStatus.failed("boom")
+        )
+
+        with pytest.raises(HTTPException) as exc:
+            await get_editor_content(
+                db_search_space.id, doc.id, session=db_session, user=db_user
+            )
+
+        assert exc.value.status_code == 422
+
+    async def test_empty_note_initializes_to_empty_markdown(
+        self, db_session, db_search_space, db_user, make_document
+    ):
+        from app.routes.editor_routes import get_editor_content
+
+        doc = await make_document(document_type=DocumentType.NOTE, source_markdown=None)
+
+        result = await get_editor_content(
+            db_search_space.id, doc.id, session=db_session, user=db_user
+        )
+
+        assert result["source_markdown"] == ""
+
+
+class TestDownloadMarkdown:
+    async def test_does_not_reconstruct_body_from_chunks(
+        self, db_session, db_search_space, db_user, make_document
+    ):
+        from app.routes.editor_routes import download_document_markdown
+
+        doc = await make_document(source_markdown=None)
+        await _add_chunks(db_session, doc, ["chunk one", "chunk two"])
+
+        with pytest.raises(HTTPException) as exc:
+            await download_document_markdown(
+                db_search_space.id, doc.id, session=db_session, user=db_user
+            )
+
+        assert exc.value.status_code == 400
+
+
+class TestExportDocument:
+    async def test_does_not_reconstruct_body_from_chunks(
+        self, db_session, db_search_space, db_user, make_document
+    ):
+        from app.routes.editor_routes import export_document
+        from app.routes.reports_routes import ExportFormat
+
+        doc = await make_document(source_markdown=None)
+        await _add_chunks(db_session, doc, ["chunk one", "chunk two"])
+
+        with pytest.raises(HTTPException) as exc:
+            await export_document(
+                db_search_space.id,
+                doc.id,
+                format=ExportFormat.PLAIN,
+                session=db_session,
+                user=db_user,
+            )
+
+        assert exc.value.status_code == 400

From b89f242a89997f031dfdfa370ae6fa0f60d66f4e Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Thu, 18 Jun 2026 20:06:26 +0200
Subject: [PATCH 03/47] feat: add start_char/end_char span columns to chunk
 model

---
 surfsense_backend/app/db.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py
index 3f098d5d2..9aa217d2c 100644
--- a/surfsense_backend/app/db.py
+++ b/surfsense_backend/app/db.py
@@ -1467,6 +1467,11 @@ class Chunk(BaseModel, TimestampMixin):
     # ordering reads are document-scoped (covered by ix_chunks_document_id) and
     # building a position index on the large chunks table is not worth it.
     position = Column(Integer, nullable=False, server_default="0")
+    # Half-open char span into the document's source_markdown the chunk was cut
+    # from. Nullable: historical rows predate spans and populate on reindex.
+    # Invariant for span-aware rows: source_markdown[start_char:end_char] == content.
+    start_char = Column(Integer, nullable=True)
+    end_char = Column(Integer, nullable=True)
 
     document_id = Column(
         Integer,

From 1048490ba87f809dc6f95416bc81a872337d5b64 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Thu, 18 Jun 2026 20:06:26 +0200
Subject: [PATCH 04/47] feat: migrate chunks with start_char/end_char columns

---
 .../versions/166_add_chunk_char_spans.py      | 31 +++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 surfsense_backend/alembic/versions/166_add_chunk_char_spans.py

diff --git a/surfsense_backend/alembic/versions/166_add_chunk_char_spans.py b/surfsense_backend/alembic/versions/166_add_chunk_char_spans.py
new file mode 100644
index 000000000..336711612
--- /dev/null
+++ b/surfsense_backend/alembic/versions/166_add_chunk_char_spans.py
@@ -0,0 +1,31 @@
+"""add chunks.start_char/end_char for citation offsets
+
+Char offsets into the document's source_markdown (half-open span) let citations
+resolve the exact passage a chunk came from. Nullable because historical rows
+have no span; they populate on the next connector sync or user edit/reindex.
+
+No backfill: a bulk UPDATE of every chunk on a large HNSW-indexed table rewrites
+every secondary index per row (see migration 165 for the same reasoning).
+
+Revision ID: 166
+Revises: 165
+"""
+
+from collections.abc import Sequence
+
+from alembic import op
+
+revision: str = "166"
+down_revision: str | None = "165"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+def upgrade() -> None:
+    op.execute("ALTER TABLE chunks ADD COLUMN IF NOT EXISTS start_char INTEGER;")
+    op.execute("ALTER TABLE chunks ADD COLUMN IF NOT EXISTS end_char INTEGER;")
+
+
+def downgrade() -> None:
+    op.execute("ALTER TABLE chunks DROP COLUMN IF EXISTS end_char;")
+    op.execute("ALTER TABLE chunks DROP COLUMN IF EXISTS start_char;")

From 0ab773cbcdf9b4e27898fb411b46ed224dc93a2d Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Thu, 18 Jun 2026 20:06:26 +0200
Subject: [PATCH 05/47] feat: add lossless span-aware chunk_markdown_with_spans

---
 .../app/indexing_pipeline/document_chunker.py | 100 ++++++++++++------
 1 file changed, 68 insertions(+), 32 deletions(-)

diff --git a/surfsense_backend/app/indexing_pipeline/document_chunker.py b/surfsense_backend/app/indexing_pipeline/document_chunker.py
index 6ae81b7a8..096624109 100644
--- a/surfsense_backend/app/indexing_pipeline/document_chunker.py
+++ b/surfsense_backend/app/indexing_pipeline/document_chunker.py
@@ -1,16 +1,30 @@
 import re
+from dataclasses import dataclass
 
 from app.config import config
 
 # Regex that matches a Markdown table block (header + separator + one or more rows)
 # A table block starts with a | at the beginning of a line and ends when a
-# non-table line (or end of string) is encountered.
+# non-table line (or end of string) is encountered. The final row may end at EOF
+# without a trailing newline, so the whole table stays one slice.
 _TABLE_BLOCK_RE = re.compile(
-    r"(?:(?:^|\n)(?=[ \t]*\|)(?:[ \t]*\|[^\n]*\n)+)",
+    r"(?:(?:^|\n)(?=[ \t]*\|)(?:[ \t]*\|[^\n]*(?:\n|$))+)",
     re.MULTILINE,
 )
 
 
+@dataclass(frozen=True, slots=True)
+class ChunkSlice:
+    """A chunk paired with its half-open char span into the source markdown.
+
+    Invariant: ``markdown[start_char:end_char] == text``.
+    """
+
+    text: str
+    start_char: int
+    end_char: int
+
+
 def chunk_text(text: str, use_code_chunker: bool = False) -> list[str]:
     """Chunk a text string using the configured chunker and return the chunk texts."""
     chunker = (
@@ -19,41 +33,63 @@ def chunk_text(text: str, use_code_chunker: bool = False) -> list[str]:
     return [c.text for c in chunker.chunk(text)]
 
 
-def chunk_text_hybrid(text: str) -> list[str]:
-    """Table-aware chunker that prevents Markdown tables from being split mid-row.
+def chunk_markdown_with_spans(
+    text: str, use_code_chunker: bool = False
+) -> list[ChunkSlice]:
+    """Chunk markdown into a lossless, contiguous partition of char-addressed slices.
 
-    Algorithm:
-    1. Scan the document for Markdown table blocks.
-    2. Each table block is emitted as a single, unmodified chunk so that its
-       header, separator row, and data rows always stay together.
-    3. The non-table prose segments between (and around) tables are passed through
-       the normal ``chunk_text`` chunker and their sub-chunks are interleaved in
-       document order.
-
-    This ensures that table data is never sliced in the middle by the token-based
-    chunker, which would otherwise produce garbled rows that are useless for RAG.
-
-    Fixes #1334.
+    Tables stay whole (issue #1334) and every slice ``c`` is an exact substring
+    of ``text``: ``"".join(c.text for c in slices) == text`` and
+    ``text[c.start_char:c.end_char] == c.text``. Citations resolve against these spans.
     """
-    chunks: list[str] = []
+    if not text:
+        return []
+
+    slices: list[ChunkSlice] = []
     cursor = 0
 
     for match in _TABLE_BLOCK_RE.finditer(text):
-        # Prose before this table
-        prose = text[cursor : match.start()].strip()
-        if prose:
-            chunks.extend(chunk_text(prose))
-
-        # The table itself is kept as one indivisible chunk
-        table_block = match.group(0).strip()
-        if table_block:
-            chunks.append(table_block)
-
+        if match.start() > cursor:
+            slices.extend(
+                _segment_slices(text, cursor, match.start(), use_code_chunker)
+            )
+        slices.append(ChunkSlice(match.group(0), match.start(), match.end()))
         cursor = match.end()
 
-    # Remaining prose after the last table (or entire text if no tables)
-    trailing = text[cursor:].strip()
-    if trailing:
-        chunks.extend(chunk_text(trailing))
+    if len(text) > cursor:
+        slices.extend(_segment_slices(text, cursor, len(text), use_code_chunker))
 
-    return chunks
+    return slices
+
+
+def _segment_slices(
+    text: str, start: int, end: int, use_code_chunker: bool
+) -> list[ChunkSlice]:
+    """Sub-chunk one non-table segment into contiguous, char-addressed slices."""
+    chunker = (
+        config.code_chunker_instance if use_code_chunker else config.chunker_instance
+    )
+    segment = text[start:end]
+    chunks = chunker.chunk(segment)
+
+    slices: list[ChunkSlice] = []
+    local = 0
+    for chunk in chunks:
+        # Use the chunker's end offset only as a cut point, then re-slice the
+        # segment ourselves so the result is an exact, gap-free substring.
+        local_end = min(max(chunk.end_index, local), len(segment))
+        if local_end <= local:
+            continue
+        slices.append(
+            ChunkSlice(segment[local:local_end], start + local, start + local_end)
+        )
+        local = local_end
+
+    if local < len(segment):
+        if slices:
+            last = slices[-1]
+            slices[-1] = ChunkSlice(text[last.start_char : end], last.start_char, end)
+        else:
+            slices.append(ChunkSlice(segment[local:], start + local, end))
+
+    return slices

From 55491fef9d7dc18964582f00b6db20f6ecf24891 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Thu, 18 Jun 2026 20:06:26 +0200
Subject: [PATCH 06/47] refactor: make embedding cache span-aware

---
 .../cache/cached_indexing.py                  | 64 +++++++++++--------
 1 file changed, 36 insertions(+), 28 deletions(-)

diff --git a/surfsense_backend/app/indexing_pipeline/cache/cached_indexing.py b/surfsense_backend/app/indexing_pipeline/cache/cached_indexing.py
index 95321a229..58872a219 100644
--- a/surfsense_backend/app/indexing_pipeline/cache/cached_indexing.py
+++ b/surfsense_backend/app/indexing_pipeline/cache/cached_indexing.py
@@ -18,23 +18,26 @@ from app.indexing_pipeline.cache.eligibility import is_embedding_cacheable
 from app.indexing_pipeline.cache.schemas import CachedChunk, EmbeddingKey, EmbeddingSet
 from app.indexing_pipeline.cache.service import EmbeddingCacheService
 from app.indexing_pipeline.cache.settings import load_embedding_cache_settings
-from app.indexing_pipeline.document_chunker import chunk_text, chunk_text_hybrid
+from app.indexing_pipeline.document_chunker import ChunkSlice, chunk_markdown_with_spans
 from app.indexing_pipeline.document_embedder import embed_texts
 from app.observability import metrics
 
 logger = logging.getLogger(__name__)
 
-ChunkPair = tuple[str, np.ndarray]
+SliceEmbedding = tuple[ChunkSlice, np.ndarray]
 
 
 async def build_chunk_embeddings(
     markdown: str, *, use_code_chunker: bool
-) -> tuple[np.ndarray, list[ChunkPair]]:
-    """Return the document-level vector and ordered ``(chunk_text, vector)`` pairs.
+) -> tuple[np.ndarray, list[SliceEmbedding]]:
+    """Return the document-level vector and ordered ``(ChunkSlice, vector)`` pairs.
 
-    Drop-in for the inline chunk+embed step; reuses prior output when the same
-    markdown has already been embedded with the current model and chunker.
+    Slices are always recomputed (cheap) so their char spans are exact; only the
+    embeddings are cached, reused when the same markdown was embedded with the
+    current model and chunker.
     """
+    slices = await chunk_slices(markdown, use_code_chunker=use_code_chunker)
+
     settings = load_embedding_cache_settings()
     chunker_kind = "code" if use_code_chunker else "hybrid"
     embedding_dim = getattr(config.embedding_model_instance, "dimension", None)
@@ -45,7 +48,7 @@ async def build_chunk_embeddings(
         embedding_dim=embedding_dim,
     )
     if not cacheable:
-        return await _compute(markdown, use_code_chunker=use_code_chunker)
+        return await _compute(markdown, slices)
 
     key = EmbeddingKey(
         markdown_sha256=_hash_text(markdown),
@@ -56,31 +59,30 @@ async def build_chunk_embeddings(
     )
 
     cached = await _recall(key)
-    if cached is not None:
+    if cached is not None and _aligns(cached, slices):
         metrics.record_embedding_cache_lookup(
             embedding_model=key.embedding_model,
             chunker_kind=chunker_kind,
             outcome="hit",
         )
         logger.debug("Embedding cache hit for %s", key.markdown_sha256)
-        return cached.summary_embedding, [(c.text, c.embedding) for c in cached.chunks]
+        return cached.summary_embedding, list(
+            zip(slices, (c.embedding for c in cached.chunks), strict=True)
+        )
 
     metrics.record_embedding_cache_lookup(
         embedding_model=key.embedding_model, chunker_kind=chunker_kind, outcome="miss"
     )
-    summary_embedding, chunk_pairs = await _compute(
-        markdown, use_code_chunker=use_code_chunker
+    summary_embedding, pairs = await _compute(markdown, slices)
+    await _remember(key, summary_embedding, pairs)
+    return summary_embedding, pairs
+
+
+async def chunk_slices(markdown: str, *, use_code_chunker: bool) -> list[ChunkSlice]:
+    """Chunk markdown into ordered, char-addressed slices off the event loop."""
+    return await asyncio.to_thread(
+        chunk_markdown_with_spans, markdown, use_code_chunker
     )
-    await _remember(key, summary_embedding, chunk_pairs)
-    return summary_embedding, chunk_pairs
-
-
-async def chunk_markdown(markdown: str, *, use_code_chunker: bool) -> list[str]:
-    """Chunk markdown into ordered texts with the pipeline's chunker selection."""
-    if use_code_chunker:
-        return await asyncio.to_thread(chunk_text, markdown, use_code_chunker=True)
-    # Table-aware hybrid chunker keeps Markdown tables intact (issue #1334).
-    return await asyncio.to_thread(chunk_text_hybrid, markdown)
 
 
 async def embed_batch(texts: list[str]) -> list[np.ndarray]:
@@ -88,13 +90,19 @@ async def embed_batch(texts: list[str]) -> list[np.ndarray]:
     return await asyncio.to_thread(embed_texts, texts)
 
 
+def _aligns(cached: EmbeddingSet, slices: list[ChunkSlice]) -> bool:
+    """A hit is only usable if its texts still match the current chunking."""
+    return len(cached.chunks) == len(slices) and all(
+        c.text == s.text for c, s in zip(cached.chunks, slices, strict=True)
+    )
+
+
 async def _compute(
-    markdown: str, *, use_code_chunker: bool
-) -> tuple[np.ndarray, list[ChunkPair]]:
-    chunk_texts = await chunk_markdown(markdown, use_code_chunker=use_code_chunker)
-    embeddings = await embed_batch([markdown, *chunk_texts])
+    markdown: str, slices: list[ChunkSlice]
+) -> tuple[np.ndarray, list[SliceEmbedding]]:
+    embeddings = await embed_batch([markdown, *(s.text for s in slices)])
     summary_embedding, *chunk_embeddings = embeddings
-    return summary_embedding, list(zip(chunk_texts, chunk_embeddings, strict=False))
+    return summary_embedding, list(zip(slices, chunk_embeddings, strict=True))
 
 
 async def _recall(key: EmbeddingKey) -> EmbeddingSet | None:
@@ -110,14 +118,14 @@ async def _recall(key: EmbeddingKey) -> EmbeddingSet | None:
 
 
 async def _remember(
-    key: EmbeddingKey, summary_embedding: np.ndarray, chunk_pairs: list[ChunkPair]
+    key: EmbeddingKey, summary_embedding: np.ndarray, pairs: list[SliceEmbedding]
 ) -> None:
     try:
         from app.tasks.celery_tasks import get_celery_session_maker
 
         embedding_set = EmbeddingSet(
             summary_embedding=summary_embedding,
-            chunks=[CachedChunk(text=text, embedding=vec) for text, vec in chunk_pairs],
+            chunks=[CachedChunk(text=s.text, embedding=vec) for s, vec in pairs],
         )
         async with get_celery_session_maker()() as session:
             await EmbeddingCacheService(session).remember(key, embedding_set)

From 1e33c28c246862f0df7b48c9b33f1cf3dd3b8b17 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Thu, 18 Jun 2026 20:06:26 +0200
Subject: [PATCH 07/47] feat: carry char spans on existing chunks

---
 surfsense_backend/app/indexing_pipeline/chunk_reconciler.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/surfsense_backend/app/indexing_pipeline/chunk_reconciler.py b/surfsense_backend/app/indexing_pipeline/chunk_reconciler.py
index 9354aeb9f..dd57a44d1 100644
--- a/surfsense_backend/app/indexing_pipeline/chunk_reconciler.py
+++ b/surfsense_backend/app/indexing_pipeline/chunk_reconciler.py
@@ -19,6 +19,9 @@ class ExistingChunk:
     id: int
     content: str
     position: int
+    # Stored char span; None for legacy rows indexed before spans existed.
+    start_char: int | None = None
+    end_char: int | None = None
 
 
 @dataclass(frozen=True, slots=True)

From c57ee978e67f97d69a7aac8294e415430131d2e8 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Thu, 18 Jun 2026 20:06:26 +0200
Subject: [PATCH 08/47] feat: persist and refresh chunk char spans on index

---
 .../indexing_pipeline_service.py              | 89 +++++++++++++++----
 1 file changed, 70 insertions(+), 19 deletions(-)

diff --git a/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py b/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py
index 30ea9d5d6..0cb74089b 100644
--- a/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py
+++ b/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py
@@ -20,9 +20,10 @@ from app.db import (
     DocumentType,
 )
 from app.indexing_pipeline.cache import build_chunk_embeddings
-from app.indexing_pipeline.cache.cached_indexing import chunk_markdown, embed_batch
-from app.indexing_pipeline.chunk_reconciler import ExistingChunk, reconcile
+from app.indexing_pipeline.cache.cached_indexing import chunk_slices, embed_batch
+from app.indexing_pipeline.chunk_reconciler import ChunkPlan, ExistingChunk, reconcile
 from app.indexing_pipeline.connector_document import ConnectorDocument
+from app.indexing_pipeline.document_chunker import ChunkSlice
 from app.indexing_pipeline.document_hashing import (
     compute_content_hash,
     compute_identifier_hash,
@@ -489,12 +490,22 @@ class IndexingPipelineService:
 
     async def _load_existing_chunks(self, document_id: int) -> list[ExistingChunk]:
         result = await self.session.execute(
-            select(Chunk.id, Chunk.content, Chunk.position).where(
-                Chunk.document_id == document_id
-            )
+            select(
+                Chunk.id,
+                Chunk.content,
+                Chunk.position,
+                Chunk.start_char,
+                Chunk.end_char,
+            ).where(Chunk.document_id == document_id)
         )
         return [
-            ExistingChunk(id=row.id, content=row.content, position=row.position)
+            ExistingChunk(
+                id=row.id,
+                content=row.content,
+                position=row.position,
+                start_char=row.start_char,
+                end_char=row.end_char,
+            )
             for row in result
         ]
 
@@ -505,15 +516,21 @@ class IndexingPipelineService:
             delete(Chunk).where(Chunk.document_id == document.id)
         )
 
-        summary_embedding, chunk_pairs = await build_chunk_embeddings(
+        summary_embedding, slice_pairs = await build_chunk_embeddings(
             content,
             use_code_chunker=connector_doc.should_use_code_chunker,
         )
 
         document.embedding = summary_embedding
         return [
-            Chunk(content=text, embedding=emb, position=i)
-            for i, (text, emb) in enumerate(chunk_pairs)
+            Chunk(
+                content=chunk_slice.text,
+                embedding=emb,
+                position=i,
+                start_char=chunk_slice.start_char,
+                end_char=chunk_slice.end_char,
+            )
+            for i, (chunk_slice, emb) in enumerate(slice_pairs)
         ]
 
     async def _reindex_incrementally(
@@ -525,35 +542,39 @@ class IndexingPipelineService:
     ) -> int:
         """Edit path: keep rows whose text survived, embed only new texts.
 
-        Unchanged rows keep their embedding and their HNSW/GIN index entries;
-        moved rows get a position-only UPDATE, which touches neither index.
+        Unchanged rows keep their embedding and their HNSW/GIN index entries. An
+        edit can shift a kept chunk's char span without changing its text, so
+        every kept row's position and span are refreshed whenever they drift.
         """
-        new_texts = await chunk_markdown(
+        slices = await chunk_slices(
             content, use_code_chunker=connector_doc.should_use_code_chunker
         )
+        new_texts = [s.text for s in slices]
         plan = reconcile(existing, new_texts)
 
         # One batch: the document-level summary vector plus the missing chunks.
         embeddings = await embed_batch([content, *[t for _, t in plan.to_embed]])
         summary_embedding, *new_embeddings = embeddings
 
-        if plan.reused:
-            await self.session.execute(
-                update(Chunk),
-                [{"id": cid, "position": pos} for cid, pos in plan.reused],
-            )
         if plan.to_delete:
             await self.session.execute(
                 delete(Chunk).where(Chunk.id.in_(plan.to_delete))
             )
+
+        span_updates = self._kept_row_span_updates(existing, slices, plan)
+        if span_updates:
+            await self.session.execute(update(Chunk), span_updates)
+
         self.session.add_all(
             Chunk(
-                content=text,
+                content=slices[pos].text,
                 embedding=emb,
                 position=pos,
+                start_char=slices[pos].start_char,
+                end_char=slices[pos].end_char,
                 document_id=document.id,
             )
-            for (pos, text), emb in zip(plan.to_embed, new_embeddings, strict=True)
+            for (pos, _text), emb in zip(plan.to_embed, new_embeddings, strict=True)
         )
         document.embedding = summary_embedding
 
@@ -564,6 +585,36 @@ class IndexingPipelineService:
         )
         return len(new_texts)
 
+    @staticmethod
+    def _kept_row_span_updates(
+        existing: list[ExistingChunk],
+        slices: list[ChunkSlice],
+        plan: ChunkPlan,
+    ) -> list[dict]:
+        """Position/span writes for kept rows, emitted only where a value drifts."""
+        deleted = set(plan.to_delete)
+        moved = dict(plan.reused)
+        updates: list[dict] = []
+        for chunk in existing:
+            if chunk.id in deleted:
+                continue
+            new_position = moved.get(chunk.id, chunk.position)
+            target = slices[new_position]
+            if (
+                chunk.position != new_position
+                or chunk.start_char != target.start_char
+                or chunk.end_char != target.end_char
+            ):
+                updates.append(
+                    {
+                        "id": chunk.id,
+                        "position": new_position,
+                        "start_char": target.start_char,
+                        "end_char": target.end_char,
+                    }
+                )
+        return updates
+
     async def _enqueue_ai_sort_if_enabled(self, document: Document) -> None:
         """Fire-and-forget: enqueue incremental AI sort if the search space has it enabled."""
         try:

From 65b7d1b01ac5abe6627634d46a8d22380e88979a Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Thu, 18 Jun 2026 20:06:26 +0200
Subject: [PATCH 09/47] chore: bump embedding cache chunker version to 2

---
 surfsense_backend/app/config/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py
index 63be54654..c8eb33b8f 100644
--- a/surfsense_backend/app/config/__init__.py
+++ b/surfsense_backend/app/config/__init__.py
@@ -953,8 +953,9 @@ class Config:
         os.getenv("EMBEDDING_CACHE_ENABLED", "false").strip().lower() == "true"
     )
     # Bump to invalidate every cached embedding set after a chunker change.
+    # v2: chunks became exact (raw) slices of source_markdown for citation spans.
     EMBEDDING_CACHE_CHUNKER_VERSION = int(
-        os.getenv("EMBEDDING_CACHE_CHUNKER_VERSION", "1")
+        os.getenv("EMBEDDING_CACHE_CHUNKER_VERSION", "2")
     )
     EMBEDDING_CACHE_TTL_DAYS = int(os.getenv("EMBEDDING_CACHE_TTL_DAYS", "90"))
     EMBEDDING_CACHE_MAX_TOTAL_MB = int(

From 94229213f4d653f0f24b58ead775408729ff766a Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Thu, 18 Jun 2026 20:06:33 +0200
Subject: [PATCH 10/47] test: cover span chunker invariants

---
 .../test_chunk_markdown_with_spans.py         | 72 +++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 surfsense_backend/tests/unit/indexing_pipeline/test_chunk_markdown_with_spans.py

diff --git a/surfsense_backend/tests/unit/indexing_pipeline/test_chunk_markdown_with_spans.py b/surfsense_backend/tests/unit/indexing_pipeline/test_chunk_markdown_with_spans.py
new file mode 100644
index 000000000..0ff155c3b
--- /dev/null
+++ b/surfsense_backend/tests/unit/indexing_pipeline/test_chunk_markdown_with_spans.py
@@ -0,0 +1,72 @@
+"""Span-aware chunking contract: slices form a lossless, contiguous partition
+of the markdown, and every slice's char span addresses its own text."""
+
+import pytest
+
+from app.indexing_pipeline.document_chunker import chunk_markdown_with_spans
+
+pytestmark = pytest.mark.unit
+
+
+def _assert_lossless_partition(md: str, slices) -> None:
+    assert "".join(s.text for s in slices) == md
+
+    cursor = 0
+    for s in slices:
+        assert s.start_char == cursor, "slices must be contiguous"
+        assert s.end_char >= s.start_char
+        assert md[s.start_char : s.end_char] == s.text, "span must address slice text"
+        cursor = s.end_char
+    assert cursor == len(md)
+
+
+def test_prose_partition_and_spans():
+    md = (
+        "# Title\n\n"
+        + "First paragraph with several words here. " * 20
+        + "\n\nSecond section with more prose to force multiple chunks. " * 20
+    )
+
+    slices = chunk_markdown_with_spans(md)
+
+    assert len(slices) > 1
+    _assert_lossless_partition(md, slices)
+
+
+def test_table_kept_whole_with_exact_span():
+    table = "| a | b |\n| - | - |\n| 1 | 2 |\n"
+    md = f"Intro prose before the table.\n{table}\nClosing prose after."
+
+    slices = chunk_markdown_with_spans(md)
+
+    _assert_lossless_partition(md, slices)
+    table_slices = [s for s in slices if s.text.lstrip().startswith("|")]
+    assert any("| 1 | 2 |" in s.text for s in table_slices)
+    for s in table_slices:
+        assert "| a | b |" in s.text and "| 1 | 2 |" in s.text
+
+
+def test_table_at_eof_without_trailing_newline_stays_whole():
+    md = "Intro.\n| a | b |\n| - | - |\n| 1 | 2 |"
+
+    slices = chunk_markdown_with_spans(md)
+
+    _assert_lossless_partition(md, slices)
+    table_slices = [s for s in slices if "| 1 | 2 |" in s.text]
+    assert len(table_slices) == 1
+    assert "| a | b |" in table_slices[0].text
+
+
+def test_code_chunker_partition_and_spans():
+    code = "\n\n".join(
+        f"def func_{i}(x):\n    total = x + {i}\n    return total" for i in range(40)
+    )
+
+    slices = chunk_markdown_with_spans(code, use_code_chunker=True)
+
+    assert len(slices) >= 1
+    _assert_lossless_partition(code, slices)
+
+
+def test_empty_markdown_yields_no_slices():
+    assert chunk_markdown_with_spans("") == []

From 60fff66ee08392c0ba7a605ccd4d08741ae25dc0 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Thu, 18 Jun 2026 20:06:33 +0200
Subject: [PATCH 11/47] test: verify chunk span persistence on index

---
 .../indexing_pipeline/test_index_spans.py     | 96 +++++++++++++++++++
 1 file changed, 96 insertions(+)
 create mode 100644 surfsense_backend/tests/integration/indexing_pipeline/test_index_spans.py

diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_index_spans.py b/surfsense_backend/tests/integration/indexing_pipeline/test_index_spans.py
new file mode 100644
index 000000000..869045bf6
--- /dev/null
+++ b/surfsense_backend/tests/integration/indexing_pipeline/test_index_spans.py
@@ -0,0 +1,96 @@
+"""Indexing records char spans so a chunk addresses its exact slice of the body.
+
+Uses the real chunker (only embeddings are faked) so the span/partition
+invariants are exercised end to end.
+"""
+
+import pytest
+from sqlalchemy import select
+
+from app.db import Chunk, Document
+from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
+
+pytestmark = pytest.mark.integration
+
+_BODY = (
+    "# Report\n\n"
+    + "Intro paragraph that is reasonably long and descriptive. " * 8
+    + "\n\n| col a | col b |\n| --- | --- |\n| 1 | 2 |\n| 3 | 4 |\n\n"
+    + "Closing paragraph with a different shape and more words to chunk. " * 8
+)
+
+
+async def _ordered_chunks(session, document_id) -> list[Chunk]:
+    result = await session.execute(
+        select(Chunk)
+        .filter(Chunk.document_id == document_id)
+        .order_by(Chunk.position, Chunk.id)
+    )
+    return list(result.scalars().all())
+
+
+def _assert_spans_address_body(chunks: list[Chunk], body: str) -> None:
+    for chunk in chunks:
+        assert chunk.start_char is not None and chunk.end_char is not None
+        assert body[chunk.start_char : chunk.end_char] == chunk.content
+    assert "".join(c.content for c in chunks) == body
+
+
+async def _index(session, connector_doc) -> int:
+    service = IndexingPipelineService(session=session)
+    prepared = await service.prepare_for_indexing([connector_doc])
+    document = prepared[0]
+    await service.index(document, connector_doc)
+    return document.id
+
+
+async def _reload_body(session, document_id) -> str:
+    result = await session.execute(select(Document).filter(Document.id == document_id))
+    return result.scalars().first().source_markdown
+
+
+@pytest.mark.usefixtures("patched_embed_texts")
+async def test_scratch_index_records_spans_addressing_body(
+    db_session, db_search_space, make_connector_document
+):
+    connector_doc = make_connector_document(
+        search_space_id=db_search_space.id, source_markdown=_BODY
+    )
+
+    document_id = await _index(db_session, connector_doc)
+
+    body = await _reload_body(db_session, document_id)
+    chunks = await _ordered_chunks(db_session, document_id)
+
+    assert len(chunks) > 1
+    _assert_spans_address_body(chunks, body)
+
+
+@pytest.mark.usefixtures("patched_embed_texts")
+async def test_incremental_reindex_refreshes_shifted_spans(
+    db_session, db_search_space, make_connector_document
+):
+    """Inserting text at the top shifts every later chunk's span; kept rows must
+    have their spans refreshed, not left pointing at the old offsets."""
+    service = IndexingPipelineService(session=db_session)
+
+    original = make_connector_document(
+        search_space_id=db_search_space.id, source_markdown=_BODY
+    )
+    prepared = await service.prepare_for_indexing([original])
+    document_id = prepared[0].id
+    await service.index(prepared[0], original)
+
+    edited_body = "# Prepended heading\n\nA brand new opening paragraph.\n\n" + _BODY
+    edited = make_connector_document(
+        search_space_id=db_search_space.id, source_markdown=edited_body
+    )
+    prepared_again = await service.prepare_for_indexing([edited])
+    assert prepared_again, "edited content should requeue the document"
+    await service.index(prepared_again[0], edited)
+
+    body = await _reload_body(db_session, document_id)
+    chunks = await _ordered_chunks(db_session, document_id)
+
+    assert body == edited_body
+    _assert_spans_address_body(chunks, body)

From 12e948cad18719b2104c7e7a607a3440396eda71 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Thu, 18 Jun 2026 20:06:33 +0200
Subject: [PATCH 12/47] test: mock span chunker in integration fixtures

---
 surfsense_backend/tests/integration/conftest.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/surfsense_backend/tests/integration/conftest.py b/surfsense_backend/tests/integration/conftest.py
index 6b8aa3cdb..e67a025cc 100644
--- a/surfsense_backend/tests/integration/conftest.py
+++ b/surfsense_backend/tests/integration/conftest.py
@@ -158,13 +158,12 @@ def patched_embed_texts_raises(monkeypatch) -> MagicMock:
 
 @pytest.fixture
 def patched_chunk_text(monkeypatch) -> MagicMock:
-    mock = MagicMock(return_value=["Test chunk content."])
+    from app.indexing_pipeline.document_chunker import ChunkSlice
+
+    text = "Test chunk content."
+    mock = MagicMock(return_value=[ChunkSlice(text, 0, len(text))])
     monkeypatch.setattr(
-        "app.indexing_pipeline.cache.cached_indexing.chunk_text",
-        mock,
-    )
-    monkeypatch.setattr(
-        "app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid",
+        "app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
         mock,
     )
     return mock

From a7cf9bd94684bafd156bb570020ebfec153ae530 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Thu, 18 Jun 2026 20:06:33 +0200
Subject: [PATCH 13/47] test: mock span chunker in reindex test

---
 .../adapters/test_file_upload_adapter.py                 | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py b/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py
index 814129c8d..e89d7592b 100644
--- a/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py
+++ b/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py
@@ -176,9 +176,14 @@ async def test_reindex_sets_status_ready(db_session, db_search_space, db_user, m
 @pytest.mark.usefixtures("patched_embed_texts")
 async def test_reindex_replaces_chunks(db_session, db_search_space, db_user, mocker):
     """Reindexing replaces old chunks with new content rather than appending."""
+    from app.indexing_pipeline.document_chunker import ChunkSlice
+
     mocker.patch(
-        "app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid",
-        side_effect=[["Original chunk."], ["Updated chunk."]],
+        "app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
+        side_effect=[
+            [ChunkSlice("Original chunk.", 0, len("Original chunk."))],
+            [ChunkSlice("Updated chunk.", 0, len("Updated chunk."))],
+        ],
     )
 
     adapter = UploadDocumentAdapter(db_session)

From 03012c307729ee98e0527ba56c1f08d67590bc49 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Thu, 18 Jun 2026 20:06:33 +0200
Subject: [PATCH 14/47] test: span-aware paragraph chunker fixture

---
 .../indexing_pipeline/test_index_editions.py  | 20 ++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_index_editions.py b/surfsense_backend/tests/integration/indexing_pipeline/test_index_editions.py
index 68d5ec0af..f86ee8e4f 100644
--- a/surfsense_backend/tests/integration/indexing_pipeline/test_index_editions.py
+++ b/surfsense_backend/tests/integration/indexing_pipeline/test_index_editions.py
@@ -18,16 +18,22 @@ _V1 = "Intro paragraph.\n\nBody paragraph.\n\nOutro paragraph."
 
 @pytest.fixture
 def paragraph_chunker(monkeypatch):
-    """One chunk per markdown paragraph, so edits map to chunk-level diffs."""
+    """One slice per markdown paragraph, so edits map to chunk-level diffs."""
+    from app.indexing_pipeline.document_chunker import ChunkSlice
 
-    def _split(markdown, **_kwargs):
-        return [p for p in markdown.split("\n\n") if p.strip()]
+    def _split(markdown, *_args, **_kwargs):
+        slices = []
+        cursor = 0
+        for para in markdown.split("\n\n"):
+            start = markdown.index(para, cursor)
+            cursor = start + len(para)
+            if para.strip():
+                slices.append(ChunkSlice(para, start, cursor))
+        return slices
 
     monkeypatch.setattr(
-        "app.indexing_pipeline.cache.cached_indexing.chunk_text", _split
-    )
-    monkeypatch.setattr(
-        "app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid", _split
+        "app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
+        _split,
     )
 
 

From a0046483a904bfc26109ff83a371981451973cd0 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Thu, 18 Jun 2026 20:06:33 +0200
Subject: [PATCH 15/47] test: assert chunker routing via use_code_chunker flag

---
 .../test_index_batch_parallel.py              | 89 ++++++++++++-------
 1 file changed, 59 insertions(+), 30 deletions(-)

diff --git a/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py b/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py
index feb7bbc52..8c4936648 100644
--- a/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py
+++ b/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py
@@ -37,12 +37,9 @@ def _make_orm_doc(connector_doc, doc_id):
 async def test_index_calls_embed_and_chunk_via_to_thread(
     pipeline, make_connector_document, monkeypatch
 ):
-    """index() runs the chunker and embed_texts via asyncio.to_thread, not blocking the loop.
+    """index() runs the chunker and embed_texts via asyncio.to_thread, not blocking the loop."""
+    from app.indexing_pipeline.document_chunker import ChunkSlice
 
-    Routing between ``chunk_text`` (code path) and ``chunk_text_hybrid`` (default
-    path, see issue #1334) is verified separately in
-    ``test_non_code_documents_use_hybrid_chunker``.
-    """
     to_thread_calls = []
     original_to_thread = asyncio.to_thread
 
@@ -51,11 +48,11 @@ async def test_index_calls_embed_and_chunk_via_to_thread(
         return await original_to_thread(func, *args, **kwargs)
 
     monkeypatch.setattr(asyncio, "to_thread", tracking_to_thread)
-    mock_chunk_hybrid = MagicMock(return_value=["chunk1"])
-    mock_chunk_hybrid.__name__ = "chunk_text_hybrid"
+    mock_chunker = MagicMock(return_value=[ChunkSlice("chunk1", 0, 6)])
+    mock_chunker.__name__ = "chunk_markdown_with_spans"
     monkeypatch.setattr(
-        "app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid",
-        mock_chunk_hybrid,
+        "app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
+        mock_chunker,
     )
     mock_embed = MagicMock(
         side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]
@@ -90,34 +87,25 @@ async def test_index_calls_embed_and_chunk_via_to_thread(
 
     await pipeline.index(document, connector_doc)
 
-    # Either chunker entry point satisfies the "chunking runs off the event
-    # loop" contract this test guards. Routing between the two is verified
-    # in test_non_code_documents_use_hybrid_chunker.
-    assert {"chunk_text", "chunk_text_hybrid"} & set(to_thread_calls)
+    assert "chunk_markdown_with_spans" in to_thread_calls
     assert "embed_texts" in to_thread_calls
     assert document.status == DocumentStatus.ready()
 
 
-async def test_non_code_documents_use_hybrid_chunker(
+async def test_non_code_documents_use_prose_chunker(
     pipeline, make_connector_document, monkeypatch
 ):
-    """Non-code documents route through ``chunk_text_hybrid`` (issue #1334).
+    """Non-code documents chunk with use_code_chunker=False (issue #1334).
 
-    The hybrid chunker preserves Markdown table integrity by avoiding splits
-    mid-row. Only documents flagged with ``should_use_code_chunker=True``
-    should take the ``chunk_text`` path.
+    The table-aware prose path keeps Markdown tables intact; only documents
+    flagged with ``should_use_code_chunker=True`` request the code chunker.
     """
-    mock_chunk_hybrid = MagicMock(return_value=["chunk1"])
-    mock_chunk_hybrid.__name__ = "chunk_text_hybrid"
+    from app.indexing_pipeline.document_chunker import ChunkSlice
+
+    mock_chunker = MagicMock(return_value=[ChunkSlice("chunk1", 0, 6)])
     monkeypatch.setattr(
-        "app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid",
-        mock_chunk_hybrid,
-    )
-    mock_chunk_code = MagicMock(return_value=["chunk1"])
-    mock_chunk_code.__name__ = "chunk_text"
-    monkeypatch.setattr(
-        "app.indexing_pipeline.cache.cached_indexing.chunk_text",
-        mock_chunk_code,
+        "app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
+        mock_chunker,
     )
     monkeypatch.setattr(
         "app.indexing_pipeline.cache.cached_indexing.embed_texts",
@@ -149,8 +137,49 @@ async def test_non_code_documents_use_hybrid_chunker(
 
     await pipeline.index(document, connector_doc)
 
-    mock_chunk_hybrid.assert_called_once()
-    mock_chunk_code.assert_not_called()
+    mock_chunker.assert_called_once()
+    assert mock_chunker.call_args.args[1] is False
+
+
+async def test_code_documents_request_code_chunker(
+    pipeline, make_connector_document, monkeypatch
+):
+    """Code-flagged documents forward use_code_chunker=True to the chunker."""
+    from app.indexing_pipeline.document_chunker import ChunkSlice
+
+    mock_chunker = MagicMock(return_value=[ChunkSlice("chunk1", 0, 6)])
+    monkeypatch.setattr(
+        "app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
+        mock_chunker,
+    )
+    monkeypatch.setattr(
+        "app.indexing_pipeline.cache.cached_indexing.embed_texts",
+        MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]),
+    )
+    monkeypatch.setattr(pipeline, "_load_existing_chunks", AsyncMock(return_value=[]))
+
+    async def _noop_persist(_session, doc, *_args, **_kwargs):
+        doc.status = DocumentStatus.ready()
+
+    monkeypatch.setattr(
+        "app.indexing_pipeline.indexing_pipeline_service.persist_scratch_index",
+        _noop_persist,
+    )
+
+    connector_doc = make_connector_document(
+        document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR,
+        unique_id="repo-1",
+        search_space_id=1,
+        should_use_code_chunker=True,
+    )
+    document = MagicMock(spec=Document)
+    document.id = 1
+    document.status = DocumentStatus.pending()
+
+    await pipeline.index(document, connector_doc)
+
+    mock_chunker.assert_called_once()
+    assert mock_chunker.call_args.args[1] is True
 
 
 def _mock_session_factory(orm_docs_by_id):

From f2fe2e576efbf399c18e87653a645c55d6f6a540 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Thu, 18 Jun 2026 20:17:45 +0200
Subject: [PATCH 16/47] feat: note writes chunk via shared span builder

---
 .../middleware/kb_persistence/middleware.py   | 70 +++++++++----------
 1 file changed, 33 insertions(+), 37 deletions(-)

diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/kb_persistence/middleware.py b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/kb_persistence/middleware.py
index a6c83a7d4..d66e9073c 100644
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/kb_persistence/middleware.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/kb_persistence/middleware.py
@@ -18,7 +18,6 @@ skipped (e.g. client disconnect).
 
 from __future__ import annotations
 
-import asyncio
 import logging
 from datetime import UTC, datetime
 from typing import Any
@@ -58,9 +57,8 @@ from app.db import (
     FolderRevision,
     shielded_async_session,
 )
-from app.indexing_pipeline.document_chunker import chunk_text
+from app.indexing_pipeline.cache.cached_indexing import build_chunk_embeddings
 from app.utils.document_converters import (
-    embed_texts,
     generate_content_hash,
     generate_unique_identifier_hash,
 )
@@ -234,24 +232,23 @@ async def _create_document(
     session.add(doc)
     await session.flush()
 
-    summary_embedding = (await asyncio.to_thread(embed_texts, [content]))[0]
+    summary_embedding, chunk_embeddings = await build_chunk_embeddings(
+        content, use_code_chunker=False
+    )
     doc.embedding = summary_embedding
-    chunks = chunk_text(content)
-    if chunks:
-        chunk_embeddings = await asyncio.to_thread(embed_texts, chunks)
-        session.add_all(
-            [
-                Chunk(
-                    document_id=doc.id,
-                    content=text,
-                    embedding=embedding,
-                    position=i,
-                )
-                for i, (text, embedding) in enumerate(
-                    zip(chunks, chunk_embeddings, strict=True)
-                )
-            ]
-        )
+    session.add_all(
+        [
+            Chunk(
+                document_id=doc.id,
+                content=sl.text,
+                embedding=embedding,
+                position=i,
+                start_char=sl.start_char,
+                end_char=sl.end_char,
+            )
+            for i, (sl, embedding) in enumerate(chunk_embeddings)
+        ]
+    )
     return doc
 
 
@@ -287,26 +284,25 @@ async def _update_document(
         search_space_id,
     )
 
-    summary_embedding = (await asyncio.to_thread(embed_texts, [content]))[0]
+    summary_embedding, chunk_embeddings = await build_chunk_embeddings(
+        content, use_code_chunker=False
+    )
     document.embedding = summary_embedding
 
     await session.execute(delete(Chunk).where(Chunk.document_id == document.id))
-    chunks = chunk_text(content)
-    if chunks:
-        chunk_embeddings = await asyncio.to_thread(embed_texts, chunks)
-        session.add_all(
-            [
-                Chunk(
-                    document_id=document.id,
-                    content=text,
-                    embedding=embedding,
-                    position=i,
-                )
-                for i, (text, embedding) in enumerate(
-                    zip(chunks, chunk_embeddings, strict=True)
-                )
-            ]
-        )
+    session.add_all(
+        [
+            Chunk(
+                document_id=document.id,
+                content=sl.text,
+                embedding=embedding,
+                position=i,
+                start_char=sl.start_char,
+                end_char=sl.end_char,
+            )
+            for i, (sl, embedding) in enumerate(chunk_embeddings)
+        ]
+    )
     return document
 
 

From 5ed62e712b473a6d33a9dca799b67b3ca8cd32d2 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Thu, 18 Jun 2026 20:17:45 +0200
Subject: [PATCH 17/47] test: stub build_chunk_embeddings in parity tests

---
 .../test_kb_persistence_filesystem_parity.py  | 22 ++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/surfsense_backend/tests/unit/middleware/test_kb_persistence_filesystem_parity.py b/surfsense_backend/tests/unit/middleware/test_kb_persistence_filesystem_parity.py
index e78db1e76..3968eb090 100644
--- a/surfsense_backend/tests/unit/middleware/test_kb_persistence_filesystem_parity.py
+++ b/surfsense_backend/tests/unit/middleware/test_kb_persistence_filesystem_parity.py
@@ -69,13 +69,25 @@ class _FakeSession:
 
 @pytest.fixture(autouse=True)
 def _stub_embeddings_and_chunks(monkeypatch: pytest.MonkeyPatch) -> None:
-    """Avoid loading the embedding model in unit tests."""
+    """Avoid loading the embedding model in unit tests.
+
+    Replaces the shared span builder with a stub: one chunk spanning the whole
+    content (no chunks when it is empty), with zero summary and chunk vectors.
+    """
+    from app.indexing_pipeline.document_chunker import ChunkSlice
+
+    async def _fake_build_chunk_embeddings(content: str, *, use_code_chunker: bool):
+        summary = np.zeros(8, dtype=np.float32)
+        pairs = (
+            [(ChunkSlice(content, 0, len(content)), np.zeros(8, dtype=np.float32))]
+            if content
+            else []
+        )
+        return summary, pairs
+
     monkeypatch.setattr(
-        kb_persistence,
-        "embed_texts",
-        lambda texts: [np.zeros(8, dtype=np.float32) for _ in texts],
+        kb_persistence, "build_chunk_embeddings", _fake_build_chunk_embeddings
     )
-    monkeypatch.setattr(kb_persistence, "chunk_text", lambda content: [content])
 
 
 @pytest.mark.asyncio

From 5a315eafd338f14ea870bfc8db6f8a5164d8c8ab Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Thu, 18 Jun 2026 20:17:45 +0200
Subject: [PATCH 18/47] test: verify note write chunk spans

---
 .../test_kb_persistence_spans.py              | 80 +++++++++++++++++++
 1 file changed, 80 insertions(+)
 create mode 100644 surfsense_backend/tests/integration/agents/multi_agent_chat/test_kb_persistence_spans.py

diff --git a/surfsense_backend/tests/integration/agents/multi_agent_chat/test_kb_persistence_spans.py b/surfsense_backend/tests/integration/agents/multi_agent_chat/test_kb_persistence_spans.py
new file mode 100644
index 000000000..77e2e5f18
--- /dev/null
+++ b/surfsense_backend/tests/integration/agents/multi_agent_chat/test_kb_persistence_spans.py
@@ -0,0 +1,80 @@
+"""NOTE writes must carry the same char spans as the indexing pipeline.
+
+``_create_document`` / ``_update_document`` are the cloud agent's KB write
+paths. They must chunk through the shared span chunker so every persisted
+chunk resolves back to an exact slice of ``source_markdown`` for citations.
+"""
+
+from __future__ import annotations
+
+import pytest
+from sqlalchemy import select
+
+from app.agents.chat.multi_agent_chat.main_agent.middleware.kb_persistence import (
+    middleware as kb,
+)
+from app.db import Chunk
+
+pytestmark = [pytest.mark.integration, pytest.mark.asyncio]
+
+_BODY = "Intro paragraph.\n\nBody paragraph with detail.\n\nOutro paragraph."
+_NEW_BODY = "Rewritten intro.\n\nFresh body content.\n\nNew closing line."
+
+
+async def _ordered_chunks(session, doc_id: int) -> list[Chunk]:
+    rows = await session.execute(
+        select(Chunk).where(Chunk.document_id == doc_id).order_by(Chunk.position)
+    )
+    return list(rows.scalars().all())
+
+
+def _assert_spans_resolve(source_markdown: str, chunks: list[Chunk]) -> None:
+    assert chunks
+    for chunk in chunks:
+        assert chunk.start_char is not None
+        assert chunk.end_char is not None
+        assert source_markdown[chunk.start_char : chunk.end_char] == chunk.content
+
+
+@pytest.mark.usefixtures("patched_embed_texts")
+async def test_note_create_populates_chunk_spans(
+    db_session, db_search_space, db_user
+) -> None:
+    doc = await kb._create_document(
+        db_session,
+        virtual_path="/documents/note.md",
+        content=_BODY,
+        search_space_id=db_search_space.id,
+        created_by_id=str(db_user.id),
+    )
+    await db_session.flush()
+
+    chunks = await _ordered_chunks(db_session, doc.id)
+    _assert_spans_resolve(doc.source_markdown, chunks)
+
+
+@pytest.mark.usefixtures("patched_embed_texts")
+async def test_note_update_refreshes_chunk_spans(
+    db_session, db_search_space, db_user
+) -> None:
+    doc = await kb._create_document(
+        db_session,
+        virtual_path="/documents/note.md",
+        content=_BODY,
+        search_space_id=db_search_space.id,
+        created_by_id=str(db_user.id),
+    )
+    await db_session.flush()
+
+    updated = await kb._update_document(
+        db_session,
+        doc_id=doc.id,
+        content=_NEW_BODY,
+        virtual_path="/documents/note.md",
+        search_space_id=db_search_space.id,
+    )
+    await db_session.flush()
+
+    assert updated is not None
+    chunks = await _ordered_chunks(db_session, updated.id)
+    _assert_spans_resolve(updated.source_markdown, chunks)

From 0f32b35d3eb8f6142cd426ea6a87bdcd1e21a583 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Fri, 19 Jun 2026 14:53:49 +0200
Subject: [PATCH 19/47] feat: add char-span to line-range helper

---
 surfsense_backend/app/utils/text_spans.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 surfsense_backend/app/utils/text_spans.py

diff --git a/surfsense_backend/app/utils/text_spans.py b/surfsense_backend/app/utils/text_spans.py
new file mode 100644
index 000000000..c12201174
--- /dev/null
+++ b/surfsense_backend/app/utils/text_spans.py
@@ -0,0 +1,23 @@
+"""Convert char spans into document-relative line ranges.
+
+Chunks store half-open char spans into ``source_markdown``; citations and the
+editor speak in line numbers. This is the single shared conversion so search,
+the resolve API, and highlighting all agree on what "lines X-Y" means.
+"""
+
+from __future__ import annotations
+
+
+def char_span_to_line_range(text: str, start_char: int, end_char: int) -> tuple[int, int]:
+    """Return the 1-based inclusive line range covering ``[start_char, end_char)``.
+
+    Offsets are clamped to ``text`` bounds; an empty/inverted span resolves to
+    the line holding its start. A trailing newline does not extend the range.
+    """
+    n = len(text)
+    start = max(0, min(start_char, n))
+    end = max(start, min(end_char, n))
+    start_line = text.count("\n", 0, start) + 1
+    last_char_index = max(start, end - 1)
+    end_line = text.count("\n", 0, last_char_index) + 1
+    return start_line, end_line

From 90502d21d38425c1437d4d09ff49c88ad4a7cb10 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Fri, 19 Jun 2026 14:53:49 +0200
Subject: [PATCH 20/47] test: cover char-span line-range helper

---
 .../tests/unit/utils/test_text_spans.py       | 39 +++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 surfsense_backend/tests/unit/utils/test_text_spans.py

diff --git a/surfsense_backend/tests/unit/utils/test_text_spans.py b/surfsense_backend/tests/unit/utils/test_text_spans.py
new file mode 100644
index 000000000..d70418ea5
--- /dev/null
+++ b/surfsense_backend/tests/unit/utils/test_text_spans.py
@@ -0,0 +1,39 @@
+"""Unit tests for char-span -> line-range conversion."""
+
+from __future__ import annotations
+
+import pytest
+
+from app.utils.text_spans import char_span_to_line_range
+
+pytestmark = pytest.mark.unit
+
+_TEXT = "line1\nline2\nline3"
+
+
+def test_single_line_span() -> None:
+    start = _TEXT.index("line2")
+    assert char_span_to_line_range(_TEXT, start, start + len("line2")) == (2, 2)
+
+
+def test_first_line_span() -> None:
+    assert char_span_to_line_range(_TEXT, 0, len("line1")) == (1, 1)
+
+
+def test_last_line_span() -> None:
+    start = _TEXT.index("line3")
+    assert char_span_to_line_range(_TEXT, start, len(_TEXT)) == (3, 3)
+
+
+def test_multi_line_span() -> None:
+    # "line1\nline2" spans lines 1-2.
+    assert char_span_to_line_range(_TEXT, 0, _TEXT.index("line2") + 5) == (1, 2)
+
+
+def test_empty_span_resolves_to_its_line() -> None:
+    start = _TEXT.index("line2")
+    assert char_span_to_line_range(_TEXT, start, start) == (2, 2)
+
+
+def test_offsets_clamped_to_text_bounds() -> None:
+    assert char_span_to_line_range(_TEXT, -5, 10_000) == (1, 3)

From 04b679e2bfa34313e8d97cc104bd137985675252 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Fri, 19 Jun 2026 14:53:49 +0200
Subject: [PATCH 21/47] feat: return chunk char spans from hybrid search

---
 .../app/retriever/chunks_hybrid_search.py      | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/surfsense_backend/app/retriever/chunks_hybrid_search.py b/surfsense_backend/app/retriever/chunks_hybrid_search.py
index 5e5edec2e..adce14e53 100644
--- a/surfsense_backend/app/retriever/chunks_hybrid_search.py
+++ b/surfsense_backend/app/retriever/chunks_hybrid_search.py
@@ -440,8 +440,15 @@ class ChucksHybridSearchRetriever:
             chunk_filter = numbered.c.rn <= _MAX_FETCH_CHUNKS_PER_DOC
 
         # Select only the columns we need (skip Chunk.embedding ~12KB/row).
+        # start_char/end_char carry the citation span; None for legacy rows.
         chunk_query = (
-            select(Chunk.id, Chunk.content, Chunk.document_id)
+            select(
+                Chunk.id,
+                Chunk.content,
+                Chunk.document_id,
+                Chunk.start_char,
+                Chunk.end_char,
+            )
             .join(numbered, Chunk.id == numbered.c.chunk_id)
             .where(chunk_filter)
             .order_by(Chunk.document_id, Chunk.position, Chunk.id)
@@ -476,7 +483,14 @@ class ChucksHybridSearchRetriever:
             if doc_id not in doc_map:
                 continue
             doc_entry = doc_map[doc_id]
-            doc_entry["chunks"].append({"chunk_id": row.id, "content": row.content})
+            doc_entry["chunks"].append(
+                {
+                    "chunk_id": row.id,
+                    "content": row.content,
+                    "start_char": row.start_char,
+                    "end_char": row.end_char,
+                }
+            )
             if row.id in matched_chunk_ids:
                 doc_entry["matched_chunk_ids"].append(row.id)
 

From c376fbaf611d7b6a11c8303fe354edd5ea080066 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Fri, 19 Jun 2026 14:53:49 +0200
Subject: [PATCH 22/47] test: seed chunk spans in retriever fixture

---
 .../tests/integration/retriever/conftest.py          | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/surfsense_backend/tests/integration/retriever/conftest.py b/surfsense_backend/tests/integration/retriever/conftest.py
index d2443723c..96c6297bb 100644
--- a/surfsense_backend/tests/integration/retriever/conftest.py
+++ b/surfsense_backend/tests/integration/retriever/conftest.py
@@ -40,11 +40,19 @@ def _make_document(
     )
 
 
-def _make_chunk(*, content: str, document_id: int) -> Chunk:
+def _make_chunk(
+    *,
+    content: str,
+    document_id: int,
+    start_char: int | None = None,
+    end_char: int | None = None,
+) -> Chunk:
     return Chunk(
         content=content,
         document_id=document_id,
         embedding=DUMMY_EMBEDDING,
+        start_char=start_char,
+        end_char=end_char,
     )
 
 
@@ -91,6 +99,8 @@ async def seed_large_doc(
         _make_chunk(
             content="quarterly performance review summary note content",
             document_id=small_doc.id,
+            start_char=0,
+            end_char=10,
         ),
     ]
 

From a2a92c592f937f4d436ac69b192cd1a495bf9108 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Fri, 19 Jun 2026 14:53:49 +0200
Subject: [PATCH 23/47] test: assert hybrid search returns chunk spans

---
 .../test_optimized_chunk_retriever.py         | 26 +++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/surfsense_backend/tests/integration/retriever/test_optimized_chunk_retriever.py b/surfsense_backend/tests/integration/retriever/test_optimized_chunk_retriever.py
index f80e59304..a8c85e65f 100644
--- a/surfsense_backend/tests/integration/retriever/test_optimized_chunk_retriever.py
+++ b/surfsense_backend/tests/integration/retriever/test_optimized_chunk_retriever.py
@@ -98,6 +98,32 @@ async def test_chunks_ordered_by_id(db_session, seed_large_doc):
         assert chunk_ids == sorted(chunk_ids), "Chunks not ordered by ID"
 
 
+async def test_chunk_spans_returned(db_session, seed_large_doc):
+    """Each chunk dict carries start_char/end_char (the citation span)."""
+    space_id = seed_large_doc["search_space"].id
+    small_doc_id = seed_large_doc["small_doc"].id
+
+    retriever = ChucksHybridSearchRetriever(db_session)
+    results = await retriever.hybrid_search(
+        query_text="quarterly performance review summary",
+        top_k=10,
+        search_space_id=space_id,
+        query_embedding=DUMMY_EMBEDDING,
+    )
+
+    for result in results:
+        for chunk in result["chunks"]:
+            assert "start_char" in chunk
+            assert "end_char" in chunk
+        if result["document"].get("id") == small_doc_id:
+            seeded = result["chunks"][0]
+            assert seeded["start_char"] == 0
+            assert seeded["end_char"] == 10
+            break
+    else:
+        pytest.fail("Small doc not found in search results")
+
+
 async def test_score_is_positive_float(db_session, seed_large_doc):
     """Each result should have a positive float score from RRF."""
     space_id = seed_large_doc["search_space"].id

From 7967b62b42f8d2bed16b79bb8f6a33508d9b0ce9 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Fri, 19 Jun 2026 14:53:49 +0200
Subject: [PATCH 24/47] feat: search tool renders matched passage with lines

---
 .../main_agent/tools/search_knowledge_base.py | 103 ++++++++++++++----
 1 file changed, 83 insertions(+), 20 deletions(-)

diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/tools/search_knowledge_base.py b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/tools/search_knowledge_base.py
index 9236e9121..ad47816f9 100644
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/tools/search_knowledge_base.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/tools/search_knowledge_base.py
@@ -33,6 +33,7 @@ from app.agents.chat.runtime.path_resolver import (
 )
 from app.db import Document, shielded_async_session
 from app.utils.perf import get_perf_logger
+from app.utils.text_spans import char_span_to_line_range
 
 _perf_log = get_perf_logger()
 
@@ -56,12 +57,16 @@ _TOOL_DESCRIPTION = (
 )
 
 
-async def _resolve_virtual_paths(
+async def _resolve_doc_context(
     results: list[dict[str, Any]],
     *,
     search_space_id: int,
-) -> dict[int, str]:
-    """Resolve ``Document.id`` -> canonical virtual path for the search hits."""
+) -> tuple[dict[int, str], dict[int, str]]:
+    """Resolve ``Document.id`` -> (canonical virtual path, source_markdown).
+
+    ``source_markdown`` is the canonical body the chunk spans index into; the
+    renderer uses it to turn a chunk's char span into a line range.
+    """
     doc_ids = [
         doc_id
         for doc_id in (
@@ -72,17 +77,24 @@ async def _resolve_virtual_paths(
         if isinstance(doc_id, int)
     ]
     if not doc_ids:
-        return {}
+        return {}, {}
 
     async with shielded_async_session() as session:
         index: PathIndex = await build_path_index(session, search_space_id)
-        folder_rows = await session.execute(
-            select(Document.id, Document.folder_id).where(
+        rows = await session.execute(
+            select(
+                Document.id, Document.folder_id, Document.source_markdown
+            ).where(
                 Document.search_space_id == search_space_id,
                 Document.id.in_(doc_ids),
             )
         )
-        folder_by_doc_id = {row.id: row.folder_id for row in folder_rows.all()}
+        folder_by_doc_id: dict[int, int | None] = {}
+        bodies: dict[int, str] = {}
+        for row in rows.all():
+            folder_by_doc_id[row.id] = row.folder_id
+            if row.source_markdown:
+                bodies[row.id] = row.source_markdown
 
     paths: dict[int, str] = {}
     for doc in results:
@@ -97,13 +109,69 @@ async def _resolve_virtual_paths(
             folder_id=folder_id if isinstance(folder_id, int) else None,
             index=index,
         )
-    return paths
+    return paths, bodies
+
+
+def _line_label(chunk: dict[str, Any], body: str | None) -> str:
+    """``[line X]`` or ``[lines X-Y]`` for a span-bearing chunk; '' without spans."""
+    start = chunk.get("start_char")
+    end = chunk.get("end_char")
+    if not body or not isinstance(start, int) or not isinstance(end, int):
+        return ""
+    start_line, end_line = char_span_to_line_range(body, start, end)
+    if start_line == end_line:
+        return f"[line {start_line}]"
+    return f"[lines {start_line}-{end_line}]"
+
+
+def _render_passage(chunk: dict[str, Any], body: str | None) -> str | None:
+    """Render one matched chunk as an indented, line-annotated passage."""
+    content = (chunk.get("content") or "").strip()
+    if not content:
+        return None
+    snippet = content[:_PER_DOC_SNIPPET_CHARS].strip()
+    if len(content) > _PER_DOC_SNIPPET_CHARS:
+        snippet += " ..."
+    indented = snippet.replace("\n", "\n   ")
+    label = _line_label(chunk, body)
+    head = f"\n   {label}" if label else ""
+    return f"{head}\n   {indented}"
+
+
+def _matched_passages(doc: dict[str, Any], body: str | None) -> str:
+    """Render the RRF-matched chunks; '' when none can be rendered."""
+    by_id = {
+        c.get("chunk_id"): c
+        for c in (doc.get("chunks") or [])
+        if isinstance(c, dict)
+    }
+    rendered: list[str] = []
+    for chunk_id in doc.get("matched_chunk_ids") or []:
+        chunk = by_id.get(chunk_id)
+        if chunk is None:
+            continue
+        passage = _render_passage(chunk, body)
+        if passage:
+            rendered.append(passage)
+    return "".join(rendered)
+
+
+def _fallback_snippet(doc: dict[str, Any]) -> str:
+    """Top-of-document preview, used only when no matched chunk is available."""
+    content = (doc.get("content") or "").strip()
+    if not content:
+        return "\n   (no preview available; read the document for details)"
+    snippet = content[:_PER_DOC_SNIPPET_CHARS].strip()
+    if len(content) > _PER_DOC_SNIPPET_CHARS:
+        snippet += " ..."
+    return "\n   " + snippet.replace("\n", "\n   ")
 
 
 def _format_hits(
     results: list[dict[str, Any]],
     *,
     paths: dict[int, str],
+    bodies: dict[int, str],
     query: str,
 ) -> str:
     """Render search hits as a compact, model-readable block."""
@@ -124,21 +192,14 @@ def _format_hits(
         score = doc.get("score")
         score_str = f"{score:.3f}" if isinstance(score, int | float) else "n/a"
         path = paths.get(doc_id) if isinstance(doc_id, int) else None
+        body = bodies.get(doc_id) if isinstance(doc_id, int) else None
 
         header = f"\n{rank}. {title} (type={doc_type}, score={score_str})" + (
             f"\n   path: {path}" if path else ""
         )
 
-        content = (doc.get("content") or "").strip()
-        if content:
-            snippet = content[:_PER_DOC_SNIPPET_CHARS].strip()
-            if len(content) > _PER_DOC_SNIPPET_CHARS:
-                snippet += " ..."
-            body = "\n   " + snippet.replace("\n", "\n   ")
-        else:
-            body = "\n   (no preview available; read the document for details)"
-
-        entry = header + body
+        passages = _matched_passages(doc, body)
+        entry = header + (passages or _fallback_snippet(doc))
         if total + len(entry) > _MAX_TOTAL_CHARS:
             lines.append("\n<!-- additional matches truncated to fit context -->")
             break
@@ -204,8 +265,10 @@ def create_search_knowledge_base_tool(
             top_k=clamped_top_k,
         )
 
-        paths = await _resolve_virtual_paths(results, search_space_id=_space_id)
-        rendered = _format_hits(results, paths=paths, query=cleaned_query)
+        paths, bodies = await _resolve_doc_context(results, search_space_id=_space_id)
+        rendered = _format_hits(
+            results, paths=paths, bodies=bodies, query=cleaned_query
+        )
         matched = _matched_chunk_ids(results)
 
         _perf_log.info(

From 7d7cb12a430ba79093cd1878c61e05fda382901f Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Fri, 19 Jun 2026 14:53:49 +0200
Subject: [PATCH 25/47] test: cover matched-passage hit rendering

---
 .../tools/test_search_knowledge_base.py       | 79 +++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100644 surfsense_backend/tests/unit/agents/multi_agent_chat/tools/test_search_knowledge_base.py

diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/tools/test_search_knowledge_base.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/tools/test_search_knowledge_base.py
new file mode 100644
index 000000000..eadfcd30d
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/tools/test_search_knowledge_base.py
@@ -0,0 +1,79 @@
+"""Unit tests for search_knowledge_base hit rendering.
+
+The tool must surface the passage that actually matched (the RRF-ranked
+chunk), not the top of the document, and annotate it with its line range
+when the chunk carries a char span.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from app.agents.chat.multi_agent_chat.main_agent.tools.search_knowledge_base import (
+    _format_hits,
+)
+
+pytestmark = pytest.mark.unit
+
+_BODY = "Intro paragraph.\n\nMatched passage here.\n\nClosing paragraph."
+
+
+def _hit() -> dict:
+    intro = "Intro paragraph."
+    matched = "Matched passage here."
+    matched_start = _BODY.index(matched)
+    return {
+        "document": {"id": 7, "title": "note.md", "document_type": "NOTE"},
+        "score": 0.42,
+        "content": _BODY,  # doc-level preview; only the fallback snippet reads it
+        "matched_chunk_ids": [102],
+        "chunks": [
+            {
+                "chunk_id": 101,
+                "content": intro,
+                "start_char": 0,
+                "end_char": len(intro),
+            },
+            {
+                "chunk_id": 102,
+                "content": matched,
+                "start_char": matched_start,
+                "end_char": matched_start + len(matched),
+            },
+        ],
+    }
+
+
+def test_renders_matched_passage_not_top_of_doc() -> None:
+    out = _format_hits([_hit()], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q")
+    assert "Matched passage here." in out
+    # The intro chunk was not matched, so it must not be shown as the snippet.
+    assert "Intro paragraph." not in out
+
+
+def test_includes_line_range_when_spans_present() -> None:
+    out = _format_hits([_hit()], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q")
+    # "Matched passage here." sits on line 3 of the body.
+    assert "line 3" in out
+
+
+def test_omits_line_range_when_spans_absent() -> None:
+    hit = _hit()
+    for chunk in hit["chunks"]:
+        chunk["start_char"] = None
+        chunk["end_char"] = None
+    out = _format_hits([hit], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q")
+    assert "Matched passage here." in out
+    assert "[line" not in out
+
+
+def test_falls_back_to_content_when_no_matched_ids() -> None:
+    hit = _hit()
+    hit["matched_chunk_ids"] = []
+    out = _format_hits([hit], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q")
+    assert "Intro paragraph." in out
+
+
+def test_no_results_message() -> None:
+    out = _format_hits([], paths={}, bodies={}, query="missing")
+    assert "No knowledge-base matches" in out

From 435b84215e0278965e593998cee0f7af5ededa02 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Fri, 19 Jun 2026 15:31:44 +0200
Subject: [PATCH 26/47] feat: expose chunk char spans on ChunkRead

---
 surfsense_backend/app/schemas/chunks.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/surfsense_backend/app/schemas/chunks.py b/surfsense_backend/app/schemas/chunks.py
index 7fec0d445..685aa4762 100644
--- a/surfsense_backend/app/schemas/chunks.py
+++ b/surfsense_backend/app/schemas/chunks.py
@@ -17,4 +17,7 @@ class ChunkUpdate(ChunkBase):
 
 
 class ChunkRead(ChunkBase, IDModel, TimestampModel):
+    start_char: int | None = None
+    end_char: int | None = None
+
     model_config = ConfigDict(from_attributes=True)

From ea32b62f8246911628d0ee9865bb5cba38ea74fb Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Fri, 19 Jun 2026 15:31:44 +0200
Subject: [PATCH 27/47] feat: add cited line range to by-chunk response

---
 surfsense_backend/app/schemas/documents.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/surfsense_backend/app/schemas/documents.py b/surfsense_backend/app/schemas/documents.py
index 49d2836b2..162dd6882 100644
--- a/surfsense_backend/app/schemas/documents.py
+++ b/surfsense_backend/app/schemas/documents.py
@@ -73,6 +73,10 @@ class DocumentWithChunksRead(DocumentRead):
     chunks: list[ChunkRead] = []
     total_chunks: int = 0
     chunk_start_index: int = 0
+    # 1-based inclusive line range of the cited chunk within source_markdown;
+    # None when the chunk predates char spans or the body is unavailable.
+    cited_start_line: int | None = None
+    cited_end_line: int | None = None
 
     model_config = ConfigDict(from_attributes=True)
 

From f67c6607d60a27427fc715980e0a2b30c8cde37a Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Fri, 19 Jun 2026 15:31:44 +0200
Subject: [PATCH 28/47] feat: by-chunk resolve derives cited line range

---
 .../app/routes/documents_routes.py            | 23 ++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py
index 53f03a0ca..ea6b0d4fa 100644
--- a/surfsense_backend/app/routes/documents_routes.py
+++ b/surfsense_backend/app/routes/documents_routes.py
@@ -37,6 +37,7 @@ from app.schemas import (
 from app.services.task_dispatcher import TaskDispatcher, get_task_dispatcher
 from app.users import current_active_user
 from app.utils.rbac import check_permission
+from app.utils.text_spans import char_span_to_line_range
 
 try:
     asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
@@ -967,9 +968,12 @@ async def get_document_by_chunk_id(
     session: AsyncSession = Depends(get_async_session),
     user: User = Depends(current_active_user),
 ):
-    """
-    Retrieves a document based on a chunk ID, including a window of chunks around the cited one.
-    Uses SQL-level pagination to avoid loading all chunks into memory.
+    """Resolve a chunk id to its document plus a window of surrounding chunks.
+
+    Returns the cited chunk's 1-based line range (cited_start_line/
+    cited_end_line) when char spans exist, so callers can anchor the citation
+    to exact source lines. Uses SQL-level pagination to avoid loading all
+    chunks into memory.
     """
     try:
         from sqlalchemy import and_, func, or_
@@ -1033,6 +1037,17 @@ async def get_document_by_chunk_id(
         )
         windowed_chunks = windowed_result.scalars().all()
 
+        cited_start_line: int | None = None
+        cited_end_line: int | None = None
+        if (
+            chunk.start_char is not None
+            and chunk.end_char is not None
+            and document.source_markdown
+        ):
+            cited_start_line, cited_end_line = char_span_to_line_range(
+                document.source_markdown, chunk.start_char, chunk.end_char
+            )
+
         return DocumentWithChunksRead(
             id=document.id,
             title=document.title,
@@ -1047,6 +1062,8 @@ async def get_document_by_chunk_id(
             chunks=windowed_chunks,
             total_chunks=total_chunks,
             chunk_start_index=start,
+            cited_start_line=cited_start_line,
+            cited_end_line=cited_end_line,
         )
     except HTTPException:
         raise

From 773f913f06f299c12962722eae302ccd74904027 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Fri, 19 Jun 2026 15:31:44 +0200
Subject: [PATCH 29/47] test: cover by-chunk span and line-range resolve

---
 .../test_documents_by_chunk_route.py          | 127 ++++++++++++++++++
 1 file changed, 127 insertions(+)
 create mode 100644 surfsense_backend/tests/integration/test_documents_by_chunk_route.py

diff --git a/surfsense_backend/tests/integration/test_documents_by_chunk_route.py b/surfsense_backend/tests/integration/test_documents_by_chunk_route.py
new file mode 100644
index 000000000..f59c65d97
--- /dev/null
+++ b/surfsense_backend/tests/integration/test_documents_by_chunk_route.py
@@ -0,0 +1,127 @@
+"""Phase E.1 contract: the by-chunk resolve API exposes chunk char spans and
+derives the cited chunk's line range from source_markdown."""
+
+import pytest
+import pytest_asyncio
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db import Chunk, Document, DocumentStatus, DocumentType, SearchSpace, User
+
+pytestmark = pytest.mark.integration
+
+_BODY = "alpha\nbravo\ncharlie\ndelta"
+
+
+async def _make_document(
+    session: AsyncSession,
+    search_space: SearchSpace,
+    user: User,
+    *,
+    source_markdown: str = _BODY,
+) -> Document:
+    doc = Document(
+        title="Doc",
+        document_type=DocumentType.FILE,
+        document_metadata={},
+        content=source_markdown,
+        content_hash="hash-by-chunk",
+        source_markdown=source_markdown,
+        search_space_id=search_space.id,
+        created_by_id=user.id,
+        status=DocumentStatus.ready(),
+    )
+    session.add(doc)
+    await session.flush()
+    return doc
+
+
+async def _add_chunk(
+    session: AsyncSession,
+    document: Document,
+    *,
+    content: str,
+    position: int,
+    start_char: int | None,
+    end_char: int | None,
+) -> Chunk:
+    chunk = Chunk(
+        content=content,
+        position=position,
+        document_id=document.id,
+        start_char=start_char,
+        end_char=end_char,
+    )
+    session.add(chunk)
+    await session.flush()
+    return chunk
+
+
+@pytest_asyncio.fixture
+async def make_document(db_session, db_search_space, db_user):
+    async def _make(**overrides):
+        return await _make_document(db_session, db_search_space, db_user, **overrides)
+
+    return _make
+
+
+async def test_cited_line_range_derived_from_spans(
+    db_session, db_search_space, db_user, make_document
+):
+    from app.routes.documents_routes import get_document_by_chunk_id
+
+    doc = await make_document()
+    await _add_chunk(
+        db_session, doc, content="alpha\nbravo\n", position=0, start_char=0, end_char=12
+    )
+    cited = await _add_chunk(
+        db_session,
+        doc,
+        content="charlie\ndelta",
+        position=1,
+        start_char=12,
+        end_char=len(_BODY),
+    )
+
+    result = await get_document_by_chunk_id(
+        cited.id, chunk_window=5, session=db_session, user=db_user
+    )
+
+    assert result.cited_start_line == 3
+    assert result.cited_end_line == 4
+
+
+async def test_chunk_spans_exposed_in_response(
+    db_session, db_search_space, db_user, make_document
+):
+    from app.routes.documents_routes import get_document_by_chunk_id
+
+    doc = await make_document()
+    cited = await _add_chunk(
+        db_session, doc, content="alpha\nbravo\n", position=0, start_char=0, end_char=12
+    )
+
+    result = await get_document_by_chunk_id(
+        cited.id, chunk_window=5, session=db_session, user=db_user
+    )
+
+    chunk = next(c for c in result.chunks if c.id == cited.id)
+    assert chunk.start_char == 0
+    assert chunk.end_char == 12
+
+
+async def test_cited_line_range_null_without_spans(
+    db_session, db_search_space, db_user, make_document
+):
+    from app.routes.documents_routes import get_document_by_chunk_id
+
+    doc = await make_document()
+    cited = await _add_chunk(
+        db_session, doc, content="alpha", position=0, start_char=None, end_char=None
+    )
+
+    result = await get_document_by_chunk_id(
+        cited.id, chunk_window=5, session=db_session, user=db_user
+    )
+
+    assert result.cited_start_line is None
+    assert result.cited_end_line is None

From fc0f9d8f81b749432b0d3be0a7f2ad27a78ff070 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Fri, 19 Jun 2026 15:31:44 +0200
Subject: [PATCH 30/47] feat: carry chunk spans and cited lines in contract

---
 surfsense_web/contracts/types/document.types.ts | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/surfsense_web/contracts/types/document.types.ts b/surfsense_web/contracts/types/document.types.ts
index da1dac537..a7fa19e18 100644
--- a/surfsense_web/contracts/types/document.types.ts
+++ b/surfsense_web/contracts/types/document.types.ts
@@ -70,10 +70,15 @@ export const documentWithChunks = document.extend({
 			id: z.number(),
 			content: z.string(),
 			created_at: z.string(),
+			start_char: z.number().nullable().optional(),
+			end_char: z.number().nullable().optional(),
 		})
 	),
 	total_chunks: z.number().optional().default(0),
 	chunk_start_index: z.number().optional().default(0),
+	// 1-based inclusive line range of the cited chunk within source_markdown.
+	cited_start_line: z.number().nullable().optional(),
+	cited_end_line: z.number().nullable().optional(),
 });
 
 /**

From 176ada4f4f5f146f69eebdb5cb73b927e2b575b1 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Fri, 19 Jun 2026 15:31:44 +0200
Subject: [PATCH 31/47] feat: editor panel accepts citation line anchor

---
 .../atoms/editor/editor-panel.atom.ts         | 26 ++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/surfsense_web/atoms/editor/editor-panel.atom.ts b/surfsense_web/atoms/editor/editor-panel.atom.ts
index c302c66ee..ee609f519 100644
--- a/surfsense_web/atoms/editor/editor-panel.atom.ts
+++ b/surfsense_web/atoms/editor/editor-panel.atom.ts
@@ -1,6 +1,11 @@
 import { atom } from "jotai";
 import { rightPanelCollapsedAtom, rightPanelTabAtom } from "@/atoms/layout/right-panel.atom";
 
+export interface EditorLineRange {
+	start: number;
+	end: number;
+}
+
 interface EditorPanelState {
 	isOpen: boolean;
 	kind: "document" | "local_file" | "memory";
@@ -9,6 +14,10 @@ interface EditorPanelState {
 	searchSpaceId: number | null;
 	memoryScope: "user" | "team" | null;
 	title: string | null;
+	// Citation line anchor: when set, the editor opens the raw source view
+	// scrolled to and highlighting this 1-based inclusive line range.
+	highlightLines: EditorLineRange | null;
+	forceSourceView: boolean;
 }
 
 const initialState: EditorPanelState = {
@@ -19,6 +28,8 @@ const initialState: EditorPanelState = {
 	searchSpaceId: null,
 	memoryScope: null,
 	title: null,
+	highlightLines: null,
+	forceSourceView: false,
 };
 
 export const editorPanelAtom = atom<EditorPanelState>(initialState);
@@ -33,7 +44,14 @@ export const openEditorPanelAtom = atom(
 		get,
 		set,
 		payload:
-			| { documentId: number; searchSpaceId: number; title?: string; kind?: "document" }
+			| {
+					documentId: number;
+					searchSpaceId: number;
+					title?: string;
+					kind?: "document";
+					highlightLines?: EditorLineRange | null;
+					forceSourceView?: boolean;
+			  }
 			| {
 					kind: "local_file";
 					localFilePath: string;
@@ -59,6 +77,8 @@ export const openEditorPanelAtom = atom(
 				searchSpaceId: payload.searchSpaceId ?? null,
 				memoryScope: null,
 				title: payload.title ?? null,
+				highlightLines: null,
+				forceSourceView: false,
 			});
 			set(rightPanelTabAtom, "editor");
 			set(rightPanelCollapsedAtom, false);
@@ -73,6 +93,8 @@ export const openEditorPanelAtom = atom(
 				searchSpaceId: payload.searchSpaceId ?? null,
 				memoryScope: payload.memoryScope,
 				title: payload.title ?? null,
+				highlightLines: null,
+				forceSourceView: false,
 			});
 			set(rightPanelTabAtom, "editor");
 			set(rightPanelCollapsedAtom, false);
@@ -86,6 +108,8 @@ export const openEditorPanelAtom = atom(
 			searchSpaceId: payload.searchSpaceId,
 			memoryScope: null,
 			title: payload.title ?? null,
+			highlightLines: payload.highlightLines ?? null,
+			forceSourceView: payload.forceSourceView ?? false,
 		});
 		set(rightPanelTabAtom, "editor");
 		set(rightPanelCollapsedAtom, false);

From 86f8fc053071eb086be6f020d86532126644462c Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Fri, 19 Jun 2026 15:31:44 +0200
Subject: [PATCH 32/47] feat: citation panel shows cited line range

---
 .../citation-panel/citation-panel.tsx           | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/surfsense_web/components/citation-panel/citation-panel.tsx b/surfsense_web/components/citation-panel/citation-panel.tsx
index 890ac11ac..9b9a9aaa9 100644
--- a/surfsense_web/components/citation-panel/citation-panel.tsx
+++ b/surfsense_web/components/citation-panel/citation-panel.tsx
@@ -46,6 +46,13 @@ export const CitationPanelContent: FC<CitationPanelContentProps> = ({
 
 	const cited = useMemo(() => data?.chunks.find((c) => c.id === chunkId) ?? null, [data, chunkId]);
 
+	const citedLineLabel = useMemo(() => {
+		const start = data?.cited_start_line;
+		const end = data?.cited_end_line;
+		if (start == null || end == null) return null;
+		return start === end ? `Line ${start}` : `Lines ${start}–${end}`;
+	}, [data?.cited_start_line, data?.cited_end_line]);
+
 	const totalChunks = data?.total_chunks ?? data?.chunks.length ?? 0;
 	const startIndex = data?.chunk_start_index ?? 0;
 	const hasMoreAbove = startIndex > 0;
@@ -75,10 +82,15 @@ export const CitationPanelContent: FC<CitationPanelContentProps> = ({
 
 	const handleOpenFullDocument = () => {
 		if (!data) return;
+		const hasLineAnchor = data.cited_start_line != null && data.cited_end_line != null;
 		openEditorPanel({
 			documentId: data.id,
 			searchSpaceId: data.search_space_id,
 			title: data.title,
+			highlightLines: hasLineAnchor
+				? { start: data.cited_start_line as number, end: data.cited_end_line as number }
+				: null,
+			forceSourceView: hasLineAnchor,
 		});
 	};
 
@@ -110,6 +122,7 @@ export const CitationPanelContent: FC<CitationPanelContentProps> = ({
 						</p>
 					</div>
 					<div className="flex items-center gap-3 shrink-0 text-[11px] text-muted-foreground">
+						{citedLineLabel && <span>{citedLineLabel}</span>}
 						{totalChunks > 0 && <span>{totalChunks} chunks</span>}
 						{!isLoading && !error && data && (
 							<Button
@@ -172,7 +185,9 @@ export const CitationPanelContent: FC<CitationPanelContentProps> = ({
 												Chunk #{chunk.id}
 											</span>
 											{isCited && (
-												<span className="text-[11px] font-semibold text-primary">Cited chunk</span>
+												<span className="text-[11px] font-semibold text-primary">
+													{citedLineLabel ? `Cited chunk · ${citedLineLabel}` : "Cited chunk"}
+												</span>
 											)}
 										</div>
 										<div className="text-sm">

From b73a31f88974f8bcbe30d377053669a264332c90 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Fri, 19 Jun 2026 15:31:44 +0200
Subject: [PATCH 33/47] feat: source editor reveals and highlights lines

---
 .../components/editor/source-code-editor.tsx  | 37 ++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/surfsense_web/components/editor/source-code-editor.tsx b/surfsense_web/components/editor/source-code-editor.tsx
index 9102dffe9..0277cde85 100644
--- a/surfsense_web/components/editor/source-code-editor.tsx
+++ b/surfsense_web/components/editor/source-code-editor.tsx
@@ -2,7 +2,7 @@
 
 import dynamic from "next/dynamic";
 import { useTheme } from "next-themes";
-import { useEffect, useRef } from "react";
+import { useCallback, useEffect, useRef } from "react";
 import { Spinner } from "@/components/ui/spinner";
 
 const MonacoEditor = dynamic(() => import("@monaco-editor/react"), {
@@ -17,6 +17,8 @@ interface SourceCodeEditorProps {
 	readOnly?: boolean;
 	fontSize?: number;
 	onSave?: () => Promise<void> | void;
+	/** 1-based inclusive line range to reveal and highlight (e.g. a citation). */
+	highlightLines?: { start: number; end: number } | null;
 }
 
 export function SourceCodeEditor({
@@ -27,10 +29,40 @@ export function SourceCodeEditor({
 	readOnly = false,
 	fontSize = 12,
 	onSave,
+	highlightLines = null,
 }: SourceCodeEditorProps) {
 	const { resolvedTheme } = useTheme();
 	const onSaveRef = useRef(onSave);
 	const monacoRef = useRef<any>(null);
+	const editorRef = useRef<any>(null);
+	const decorationsRef = useRef<any>(null);
+	const highlightLinesRef = useRef(highlightLines);
+	highlightLinesRef.current = highlightLines;
+
+	const applyHighlight = useCallback(() => {
+		const editor = editorRef.current;
+		const monaco = monacoRef.current;
+		if (!editor || !monaco) return;
+		if (decorationsRef.current) {
+			decorationsRef.current.clear();
+			decorationsRef.current = null;
+		}
+		const range = highlightLinesRef.current;
+		if (!range) return;
+		const start = Math.max(1, Math.floor(range.start));
+		const end = Math.max(start, Math.floor(range.end));
+		decorationsRef.current = editor.createDecorationsCollection([
+			{
+				range: new monaco.Range(start, 1, end, 1),
+				options: { isWholeLine: true, className: "citation-line-highlight" },
+			},
+		]);
+		editor.revealLinesInCenter(start, end);
+	}, []);
+
+	useEffect(() => {
+		applyHighlight();
+	}, [applyHighlight, highlightLines?.start, highlightLines?.end]);
 	const normalizedModelPath = (() => {
 		const raw = (path || "local-file.txt").trim();
 		const withLeadingSlash = raw.startsWith("/") ? raw : `/${raw}`;
@@ -104,7 +136,10 @@ export function SourceCodeEditor({
 				}}
 				onMount={(editor, monaco) => {
 					monacoRef.current = monaco;
+					editorRef.current = editor;
 					applySidebarTheme(monaco);
+					// Defer one frame so the model is laid out before revealing.
+					requestAnimationFrame(() => applyHighlight());
 					if (!isManualSaveEnabled) return;
 					editor.addCommand(monaco.KeyMod.CtrlCmd | monaco.KeyCode.KeyS, () => {
 						void onSaveRef.current?.();

From c551b34d93296594586de2486eb68cf4fcf9fc54 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Fri, 19 Jun 2026 15:31:44 +0200
Subject: [PATCH 34/47] style: add cited line highlight class

---
 surfsense_web/app/globals.css | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/surfsense_web/app/globals.css b/surfsense_web/app/globals.css
index 3cdb34bff..6950fd284 100644
--- a/surfsense_web/app/globals.css
+++ b/surfsense_web/app/globals.css
@@ -270,6 +270,12 @@ button {
 	contain-intrinsic-size: 0 40px;
 }
 
+/* Monaco whole-line highlight for a cited source span (Phase E). */
+.citation-line-highlight {
+	background-color: color-mix(in srgb, var(--primary) 16%, transparent);
+	box-shadow: inset 2px 0 0 0 var(--primary);
+}
+
 @source "../node_modules/@llamaindex/chat-ui/**/*.{ts,tsx}";
 @source "../node_modules/streamdown/dist/*.js";
 @source "../node_modules/@streamdown/code/dist/*.js";

From 049c70dc91614382904e3a858ce45a509686b8c5 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Fri, 19 Jun 2026 15:31:44 +0200
Subject: [PATCH 35/47] feat: open citations in source view at lines

---
 .../components/editor-panel/editor-panel.tsx          | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/surfsense_web/components/editor-panel/editor-panel.tsx b/surfsense_web/components/editor-panel/editor-panel.tsx
index 75283c81f..962fce707 100644
--- a/surfsense_web/components/editor-panel/editor-panel.tsx
+++ b/surfsense_web/components/editor-panel/editor-panel.tsx
@@ -149,6 +149,8 @@ export function EditorPanelContent({
 	searchSpaceId,
 	title,
 	onClose,
+	highlightLines = null,
+	forceSourceView = false,
 }: {
 	kind?: "document" | "local_file" | "memory";
 	documentId?: number;
@@ -157,6 +159,8 @@ export function EditorPanelContent({
 	searchSpaceId?: number;
 	title: string | null;
 	onClose?: () => void;
+	highlightLines?: { start: number; end: number } | null;
+	forceSourceView?: boolean;
 }) {
 	const electronAPI = useElectronAPI();
 	const [editorDoc, setEditorDoc] = useState<EditorContent | null>(null);
@@ -205,7 +209,7 @@ export function EditorPanelContent({
 	const isLargeDocument = docSizeBytes > plateMaxBytes || docLineCount > plateMaxLines;
 	const viewerMode: ViewerMode = isMemoryMode
 		? "plate"
-		: editorDoc?.viewer_mode === "monaco" || isLargeDocument
+		: editorDoc?.viewer_mode === "monaco" || isLargeDocument || forceSourceView
 			? "monaco"
 			: "plate";
 
@@ -828,6 +832,7 @@ export function EditorPanelContent({
 								value={editorDoc.source_markdown}
 								readOnly
 								onChange={() => {}}
+								highlightLines={highlightLines}
 							/>
 						</div>
 					</div>
@@ -918,6 +923,8 @@ function DesktopEditorPanel() {
 				searchSpaceId={panelState.searchSpaceId ?? undefined}
 				title={panelState.title}
 				onClose={closePanel}
+				highlightLines={panelState.highlightLines}
+				forceSourceView={panelState.forceSourceView}
 			/>
 		</div>
 	);
@@ -957,6 +964,8 @@ function MobileEditorDrawer() {
 						memoryScope={panelState.memoryScope ?? undefined}
 						searchSpaceId={panelState.searchSpaceId ?? undefined}
 						title={panelState.title}
+						highlightLines={panelState.highlightLines}
+						forceSourceView={panelState.forceSourceView}
 					/>
 				</div>
 			</DrawerContent>

From 1741fdc9c8d692a07a76acd18b2933b6f8a81bc6 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Fri, 19 Jun 2026 15:43:21 +0200
Subject: [PATCH 36/47] feat: numbered-read preamble and matched line ranges

---
 .../filesystem/backends/numbered_document.py  | 73 +++++++++++++++++++
 1 file changed, 73 insertions(+)
 create mode 100644 surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/numbered_document.py

diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/numbered_document.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/numbered_document.py
new file mode 100644
index 000000000..ced77096f
--- /dev/null
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/numbered_document.py
@@ -0,0 +1,73 @@
+"""Read preamble for canonical (numbered ``source_markdown``) KB reads.
+
+The KB read tool numbers the body lines ``cat -n`` style, so serving the raw
+``source_markdown`` makes those line numbers line up exactly with the chunk
+char spans and the editor highlight. This module renders the small header the
+agent sees above that body: document identity plus the matched line ranges to
+seek to, and a concrete reminder of the line-citation token shape.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Iterable
+
+from app.utils.text_spans import char_span_to_line_range
+
+
+def _format_range(start: int, end: int) -> str:
+    return f"{start}" if start == end else f"{start}-{end}"
+
+
+def compute_matched_line_ranges(
+    source_markdown: str,
+    chunks: Iterable[tuple[int, int | None, int | None]],
+    matched_chunk_ids: set[int],
+) -> list[tuple[int, int]]:
+    """Map matched chunks to sorted, de-duplicated 1-based line ranges.
+
+    ``chunks`` are ``(chunk_id, start_char, end_char)`` triples. Chunks without
+    spans (legacy rows) are skipped — they have no resolvable location.
+    """
+    ranges: set[tuple[int, int]] = set()
+    for chunk_id, start_char, end_char in chunks:
+        if chunk_id not in matched_chunk_ids:
+            continue
+        if start_char is None or end_char is None:
+            continue
+        ranges.add(char_span_to_line_range(source_markdown, start_char, end_char))
+    return sorted(ranges)
+
+
+def build_read_preamble(
+    *,
+    document_id: int,
+    document_type: str,
+    title: str,
+    url: str,
+    matched_line_ranges: list[tuple[int, int]],
+) -> str:
+    """Render the metadata header shown above a numbered ``source_markdown`` body.
+
+    ``matched_line_ranges`` are 1-based inclusive line ranges (already derived
+    from chunk char spans) to point the agent at the relevant lines.
+    """
+    lines = [
+        "<document_metadata>",
+        f"  <document_id>{document_id}</document_id>",
+        f"  <document_type>{document_type}</document_type>",
+        f"  <title><![CDATA[{title}]]></title>",
+        f"  <url><![CDATA[{url}]]></url>",
+    ]
+    if matched_line_ranges:
+        ranges = ", ".join(_format_range(s, e) for s, e in matched_line_ranges)
+        lines.append(f"  <matched_lines>{ranges}</matched_lines>")
+    lines.append("</document_metadata>")
+    lines.append(
+        f"Cite lines from this document as [citation:d{document_id}#L<start>-<end>] "
+        "using the line numbers shown below."
+    )
+    lines.append("")
+    return "\n".join(lines)
+
+
+__all__ = ["build_read_preamble", "compute_matched_line_ranges"]

From 691685dd162892aad0edca290b112e3a8de031e1 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Fri, 19 Jun 2026 15:43:21 +0200
Subject: [PATCH 37/47] test: cover read preamble and matched line ranges

---
 .../unit/middleware/test_numbered_document.py | 92 +++++++++++++++++++
 1 file changed, 92 insertions(+)
 create mode 100644 surfsense_backend/tests/unit/middleware/test_numbered_document.py

diff --git a/surfsense_backend/tests/unit/middleware/test_numbered_document.py b/surfsense_backend/tests/unit/middleware/test_numbered_document.py
new file mode 100644
index 000000000..955c619b5
--- /dev/null
+++ b/surfsense_backend/tests/unit/middleware/test_numbered_document.py
@@ -0,0 +1,92 @@
+"""Unit tests for the numbered-document read preamble."""
+
+import pytest
+
+from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.numbered_document import (
+    build_read_preamble,
+    compute_matched_line_ranges,
+)
+
+pytestmark = pytest.mark.unit
+
+
+_BODY = "alpha\nbravo\ncharlie\ndelta"
+
+
+class TestComputeMatchedLineRanges:
+    def test_maps_matched_chunk_spans_to_line_ranges(self):
+        chunks = [(1, 0, 12), (2, 12, len(_BODY))]
+        ranges = compute_matched_line_ranges(_BODY, chunks, {2})
+        assert ranges == [(3, 4)]
+
+    def test_includes_only_matched_chunks(self):
+        chunks = [(1, 0, 5), (2, 6, 11)]
+        ranges = compute_matched_line_ranges(_BODY, chunks, {1})
+        assert ranges == [(1, 1)]
+
+    def test_skips_chunks_without_spans(self):
+        chunks = [(1, None, None)]
+        ranges = compute_matched_line_ranges(_BODY, chunks, {1})
+        assert ranges == []
+
+    def test_sorted_and_deduplicated(self):
+        chunks = [(1, 12, len(_BODY)), (2, 0, 5), (3, 0, 5)]
+        ranges = compute_matched_line_ranges(_BODY, chunks, {1, 2, 3})
+        assert ranges == [(1, 1), (3, 4)]
+
+
+class TestBuildReadPreamble:
+    def test_contains_document_metadata(self):
+        preamble = build_read_preamble(
+            document_id=42,
+            document_type="FILE",
+            title="Test Doc",
+            url="https://example.com",
+            matched_line_ranges=[],
+        )
+        assert "<document_id>42</document_id>" in preamble
+        assert "<document_type>FILE</document_type>" in preamble
+        assert "Test Doc" in preamble
+        assert "https://example.com" in preamble
+
+    def test_citation_hint_uses_document_id(self):
+        preamble = build_read_preamble(
+            document_id=42,
+            document_type="FILE",
+            title="Test Doc",
+            url="",
+            matched_line_ranges=[],
+        )
+        assert "[citation:d42#L" in preamble
+
+    def test_lists_matched_line_ranges(self):
+        preamble = build_read_preamble(
+            document_id=7,
+            document_type="NOTE",
+            title="Notes",
+            url="",
+            matched_line_ranges=[(12, 18), (40, 40)],
+        )
+        assert "<matched_lines>" in preamble
+        assert "12-18" in preamble
+        assert "40" in preamble
+
+    def test_omits_matched_lines_block_when_empty(self):
+        preamble = build_read_preamble(
+            document_id=7,
+            document_type="NOTE",
+            title="Notes",
+            url="",
+            matched_line_ranges=[],
+        )
+        assert "<matched_lines>" not in preamble
+
+    def test_ends_with_trailing_newline_so_body_follows_cleanly(self):
+        preamble = build_read_preamble(
+            document_id=1,
+            document_type="FILE",
+            title="t",
+            url="",
+            matched_line_ranges=[],
+        )
+        assert preamble.endswith("\n")

From 141801f1ccd5d42caf826506f6f5dcd666334e68 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Fri, 19 Jun 2026 17:32:45 +0200
Subject: [PATCH 38/47] docs: clarify web/kb/legacy citation channels

---
 .../system_prompt/prompts/citations/on.md     | 60 ++++++++++++-------
 1 file changed, 38 insertions(+), 22 deletions(-)

diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/citations/on.md b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/citations/on.md
index 2abd95d5a..8e67615d0 100644
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/citations/on.md
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/citations/on.md
@@ -1,42 +1,58 @@
 <citations>
-Citations reach the answer through two channels. Use whichever applies — and
-never invent ids you didn't see. Citation ids are resolved by exact-match
-lookup; a wrong id silently breaks the link, so when in doubt, omit.
+Citations reach the answer through three channels. Use whichever applies, and
+never invent ids you didn't see: ids are matched exactly, so a wrong one
+silently breaks the link — when in doubt, omit. Always write a citation as
+plain `[citation:…]` brackets — no markdown links, no footnote numbers, no
+parentheses.
 
-### Channel A — chunk blocks injected this turn
+### Channel A — web_search chunk blocks injected this turn
 When `web_search` returns `<document>` / `<chunk id='…'>` blocks in this
-turn:
+turn, the chunk `id` is the result's URL:
 
-1. For each factual statement taken from those chunks, add
-   `[citation:chunk_id]` using the **exact** id from a visible
-   `<chunk id='…'>` tag. Copy digit-for-digit (or the URL verbatim);
-   do not retype from memory.
-2. `<document_id>` is the parent doc id, **not** a citation source —
-   only ids inside `<chunk id='…'>` count.
-3. Multiple chunks → `[citation:id1], [citation:id2]` (comma-separated,
+1. For each factual statement taken from a chunk, add `[citation:<url>]`
+   using the **exact** id from a visible `<chunk id='…'>` tag. Copy the
+   URL verbatim; do not retype it from memory.
+2. Multiple chunks → `[citation:url1], [citation:url2]` (comma-separated,
    each id copied individually).
-4. Never invent, normalise, or guess at adjacent ids; if unsure, omit.
-5. Plain brackets only — no markdown links, no footnote numbering.
+3. Never invent, normalise, or guess at a URL; if unsure, omit.
 
 ### Channel B — citations relayed by a `task` specialist
-A `task(...)` tool message may contain `[citation:<chunk_id>]` markers
-the specialist already attached to its prose. The specialist saw the
-underlying `<chunk id='…'>` blocks; you didn't. So:
+A `task(...)` tool message may contain `[citation:…]` markers the
+specialist already attached to its prose — line citations
+(`[citation:d<document_id>#L<start>-<end>]`) or chunk ids (`[citation:N]`). The
+specialist read the underlying document and tied each marker to a
+passage; you didn't. So:
 
 1. **Preserve those markers verbatim** in your final answer — do not
    reformat, renumber, drop, or wrap them in markdown links. When you
    paraphrase a specialist sentence, copy the marker character-for-
-   character; do not regenerate the id from memory (LLMs reliably
-   corrupt nearby digits).
+   character; do not regenerate it from memory (LLMs reliably corrupt
+   nearby digits).
 2. Keep each marker attached to the sentence the specialist attached
    it to.
 3. Do **not** add new `[citation:…]` markers of your own to a
    specialist's prose; if a fact has no marker, the specialist
-   couldn't tie it to a chunk and neither can you.
+   couldn't tie it to a source and neither can you.
 4. When a specialist returns JSON, the citation markers live inside
    the prose-bearing fields (e.g. a summary or excerpt). Pull them
    along with the surrounding sentence when you quote.
 
-If neither channel surfaces citation markers this turn, do not fabricate
-them.
+### Channel C — your knowledge base (search hits and `read_file`)
+Knowledge-base facts are cited by line range using the document id:
+`[citation:d<document_id>#L<start>-<end>]` (a single line is `#L<n>-<n>`).
+
+1. `search_knowledge_base` prints a ready `[citation:d…#L…-…]` token above each
+   matched passage. When that passage supports your point, copy the token
+   verbatim — that is the entire citation.
+2. When you `read_file` a `/documents/...` path, its header gives the
+   `<document_id>` and an optional `<matched_lines>` pointer, and the body is
+   shown with line numbers; cite the lines you actually used. Use `read_file`
+   when you need more context than a search passage shows.
+3. Copy document ids and line numbers exactly as shown — never estimate,
+   shift, or invent them.
+4. Older documents without a numbered body instead show `<chunk id='N'>`
+   blocks; cite those with `[citation:N]`, copying the id exactly.
+
+If none of these channels surfaces a citable source this turn, do not
+fabricate citations.
 </citations>

From 3c63a7bcd3428caeea475c5708a9ec94f1fdc3ec Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Fri, 19 Jun 2026 17:32:45 +0200
Subject: [PATCH 39/47] docs: kb specialist cites numbered or legacy chunk form

---
 .../knowledge_base/system_prompt_cloud.md     | 44 ++++++-------------
 1 file changed, 13 insertions(+), 31 deletions(-)

diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_cloud.md b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_cloud.md
index c4e36fc73..f377db311 100644
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_cloud.md
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_cloud.md
@@ -35,42 +35,24 @@ Map outcomes to your `status`:
 
 You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see.
 
-## Chunk citations in your prose
+## Citations in your prose
 
-When `read_file` returns a KB-indexed document under `/documents/`, the response includes `<chunk id='…'>` blocks. Whenever a fact in your `action_summary` or `evidence.content_excerpt` came from a specific chunk, append `[citation:<chunk_id>]` to the sentence stating that fact, using the **exact** id from the `<chunk id='…'>` tag. The caller relays these markers to the end user verbatim, and the UI resolves each id by exact match against the database, so a wrong id silently breaks the citation.
+`read_file` on a KB document under `/documents/` serves it in one of two forms. Cite from whichever form you actually see: attach each marker to the sentence in `action_summary` or `evidence.content_excerpt` that states the cited fact, and list every marker you emit in `evidence.citations`. The caller relays these markers to the end user verbatim, and the UI resolves each one by exact match, so a wrong id or line number silently breaks the citation.
 
-### Where chunk ids live in `read_file` output
+**Numbered body (default).** A `<document_metadata>` header gives the `<document_id>` and an optional `<matched_lines>` pointer, then the body is shown with line numbers. Cite the lines a fact came from as `[citation:d<document_id>#L<start>-<end>]` (a single line is `#L<n>-<n>`).
 
-A KB document's XML has three numeric attributes — only **one** is a citation source:
-
-```
-<document>
-<document_metadata>
-  <document_id>42</document_id>          ← NOT a citation. Parent doc id; ignore for citations.
-  ...
-</document_metadata>
-<chunk_index>
-  <entry chunk_id="128" lines="14-22"/>  ← Index hint; the same id also appears below.
-  <entry chunk_id="129" lines="23-30" matched="true"/>
-</chunk_index>
-<document_content>
-  <chunk id='128'><![CDATA[…]]></chunk>  ← This is the citation source.
-  <chunk id='129'><![CDATA[…]]></chunk>
-</document_content>
-</document>
-```
+**Legacy chunk blocks (older docs without a stored body).** The response is XML with `<chunk id='N'>` blocks. Cite the chunk a fact came from as `[citation:N]`, using the **exact** id from a `<chunk id='…'>` tag.
 
 ### Rules
 
-- Use the **exact** id from a `<chunk id='…'>` tag whose content you actually quoted or paraphrased. Copy digit-for-digit; do **not** retype from memory.
-- Before emitting `[citation:N]`, confirm the literal substring `<chunk id='N'>` (or its index twin `chunk_id="N"`) appears in the tool result you are summarising this turn. If you can't see it, omit the citation.
-- Never cite `<document_id>` — that's the parent doc, not a chunk.
-- Never invent, normalise, shorten, or guess at adjacent ids. If unsure between two candidates, omit rather than pick.
+- Cite only from a passage you actually quoted or paraphrased this turn. Copy document ids, line numbers, and chunk ids character-for-character; never retype from memory.
+- Never cite `<document_id>` on its own — it identifies the document, not a passage. In the numbered form it is only the `d<document_id>` prefix of a line citation.
+- Never invent, normalise, shorten, shift, or guess at ids or line numbers. If unsure, omit rather than pick.
 - Prefer **fewer accurate citations** over many speculative ones.
-- Multiple chunks supporting the same point → comma-separated and copied individually: `[citation:128], [citation:129]`.
+- Multiple passages supporting the same point → comma-separated and copied individually: `[citation:d42#L14-22], [citation:d42#L31-39]`.
 - Plain square brackets only — no markdown links, no parentheses, no footnote numbers.
-- Tool results without `<chunk id='…'>` (write/edit/move confirmations, `ls` / `glob` / `grep` listings, error strings) carry no chunk id and need none.
-- Populate `evidence.chunk_ids` with **only** ids you actually emitted in `[citation:…]` markers — same set, same digits.
+- Tool results with no body passage (write/edit/move confirmations, `ls` / `glob` / `grep` listings, error strings) carry nothing to cite.
+- Populate `evidence.citations` with **only** the markers you actually emitted — same set, same characters.
 
 ## Examples
 
@@ -89,7 +71,7 @@ A KB document's XML has three numeric attributes — only **one** is a citation
       "path": "/documents/meetings/2026-05-11-meeting.md",
       "matched_candidates": null,
       "content_excerpt": null,
-      "chunk_ids": null
+      "citations": null
     },
     "next_step": null,
     "missing_fields": null,
@@ -121,7 +103,7 @@ A KB document's XML has three numeric attributes — only **one** is a citation
         { "id": "/documents/design/auth-rework.md", "label": "Auth Rework" }
       ],
       "content_excerpt": null,
-      "chunk_ids": null
+      "citations": null
     },
     "next_step": "Ask the user which design doc to update.",
     "missing_fields": ["path"],
@@ -142,7 +124,7 @@ Return **only** one JSON object (no markdown or prose outside it):
     "path": string | null,
     "matched_candidates": [ { "id": string, "label": string } ] | null,
     "content_excerpt": string | null,
-    "chunk_ids": string[] | null
+    "citations": string[] | null
   },
   "next_step": string | null,
   "missing_fields": string[] | null,

From 30ca0e1ef5d8767cd66efa053dd7b49ee4f9b1a2 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Fri, 19 Jun 2026 17:32:45 +0200
Subject: [PATCH 40/47] docs: readonly kb specialist cites line or chunk form

---
 .../system_prompt_readonly_cloud.md           | 42 +++++--------------
 1 file changed, 11 insertions(+), 31 deletions(-)

diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_readonly_cloud.md b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_readonly_cloud.md
index c7813e71d..f0aa8403e 100644
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_readonly_cloud.md
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_readonly_cloud.md
@@ -28,41 +28,21 @@ Reply in plain prose:
 - If the workspace does not contain the requested information, say so explicitly. Do not fabricate paths or content.
 - If the question is genuinely ambiguous after a thorough lookup, list the candidates with their paths and stop.
 
-## Chunk citations
+## Citations
 
-When the evidence for a claim came from a `read_file` response that included `<chunk id='…'>` blocks (i.e. a KB-indexed document under `/documents/`), append `[citation:<chunk_id>]` to the sentence stating that claim. The caller passes these markers through to the end user verbatim, and the UI resolves each id by exact match against the database, so a wrong id silently breaks the citation.
+`read_file` on a KB document under `/documents/` serves it in one of two forms; cite each claim using whichever form you actually see, in addition to giving the document's path. The caller passes these citation markers through to the end user verbatim, and the UI resolves each one by exact match, so a wrong id or line number silently breaks the citation.
 
-### Where chunk ids live in `read_file` output
-
-A KB document's XML has three numeric attributes — only **one** is a citation source:
-
-```
-<document>
-<document_metadata>
-  <document_id>42</document_id>          ← NOT a citation. Parent doc id; ignore for citations.
-  ...
-</document_metadata>
-<chunk_index>
-  <entry chunk_id="128" lines="14-22"/>  ← Index hint; the same id also appears below.
-  <entry chunk_id="129" lines="23-30" matched="true"/>
-</chunk_index>
-<document_content>
-  <chunk id='128'><![CDATA[…]]></chunk>  ← This is the citation source.
-  <chunk id='129'><![CDATA[…]]></chunk>
-</document_content>
-</document>
-```
+- **Numbered body (default).** A `<document_metadata>` header gives the `<document_id>`, and the body is shown with line numbers. Cite the lines a claim came from as `[citation:d<document_id>#L<start>-<end>]` (a single line is `#L<n>-<n>`).
+- **Legacy chunk blocks (older docs).** XML with `<chunk id='N'>` blocks. Cite the chunk a claim came from as `[citation:N]`.
 
 ### Rules
 
-- Use the **exact** id from a `<chunk id='…'>` tag whose content you actually quoted or paraphrased. Copy digit-for-digit; do **not** retype from memory.
-- Before emitting `[citation:N]`, confirm the literal substring `<chunk id='N'>` (or its index twin `chunk_id="N"`) appears in the tool result you are summarising this turn. If you can't see it, omit the citation.
-- Never cite `<document_id>` — that's the parent doc, not a chunk.
-- Never invent, normalise, shorten, or guess at adjacent ids. If unsure between two candidates, omit rather than pick.
-- Prefer **fewer accurate citations** over many speculative ones. One correct `[citation:128]` is more useful than a string of wrong ids.
-- Multiple chunks supporting the same point → comma-separated and copied individually: `[citation:128], [citation:129]`.
+- Copy document ids, line numbers, and chunk ids character-for-character; never retype from memory. If you cannot see the id/lines for a claim, omit the citation.
+- Never cite `<document_id>` on its own — in the numbered form it is only the `d<document_id>` prefix of a line citation.
+- Never invent, normalise, shorten, shift, or guess. Prefer **fewer accurate citations** over many speculative ones.
+- Multiple passages supporting the same point → comma-separated and copied individually.
 - Plain square brackets only — no markdown links, no parentheses, no footnote numbers.
-- If a claim came from a tool result that did **not** carry a chunk id (`ls`, `glob`, `grep` listings, error strings, or files without `<chunk id='…'>`), skip the citation.
-- The absolute path under `/documents/` is always required; chunk citations are additive, they do not replace the path reference.
+- Listings (`ls` / `glob` / `grep`), error strings, and files without either form carry nothing to cite.
+- The absolute path under `/documents/` is always required; citations are additive, they do not replace the path reference.
 
-Example: `The Q2 roadmap lists three milestones (/documents/planning/q2-roadmap.md) [citation:128], [citation:129].`
+Example: `The Q2 roadmap lists three milestones (/documents/planning/q2-roadmap.md) [citation:d42#L3-9].`

From fc17b9becdf5c5cf09da184f5de1c188b26ea56d Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Fri, 19 Jun 2026 17:32:45 +0200
Subject: [PATCH 41/47] docs: rename evidence.chunk_ids to citations in desktop
 kb prompt

---
 .../builtins/knowledge_base/system_prompt_desktop.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_desktop.md b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_desktop.md
index 25dafa3df..72a921c4f 100644
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_desktop.md
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_desktop.md
@@ -33,11 +33,11 @@ Map outcomes to your `status`:
 - Any other `"Error: …"` → `status=error` and relay the tool's message verbatim as `next_step`.
 - HITL rejection → `status=blocked` with `next_step="User declined this filesystem action. Do not retry."`.
 
-You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see. (`chunk_ids` is always `null` in desktop mode — see "Chunk citations in your prose" below.)
+You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see. (`citations` is always `null` in desktop mode — see "Citations in your prose" below.)
 
-## Chunk citations in your prose
+## Citations in your prose
 
-In desktop mode your filesystem tools read local files only, and local-file tool results do **not** carry `<chunk id='…'>` tags. Do not emit `[citation:…]` markers in `action_summary` or `evidence.content_excerpt`, and leave `evidence.chunk_ids` `null` — the absolute path is the only reference for local-file work.
+In desktop mode your filesystem tools read local files only, and local-file tool results do **not** carry chunk ids or numbered KB bodies. Do not emit `[citation:…]` markers in `action_summary` or `evidence.content_excerpt`, and leave `evidence.citations` `null` — the absolute path is the only reference for local-file work.
 
 ## Examples
 
@@ -56,7 +56,7 @@ In desktop mode your filesystem tools read local files only, and local-file tool
       "path": "/notes/meetings/2026-05-11-meeting.md",
       "matched_candidates": null,
       "content_excerpt": null,
-      "chunk_ids": null
+      "citations": null
     },
     "next_step": null,
     "missing_fields": null,
@@ -88,7 +88,7 @@ In desktop mode your filesystem tools read local files only, and local-file tool
         { "id": "/projects/web/design/auth-rework.md", "label": "Auth Rework" }
       ],
       "content_excerpt": null,
-      "chunk_ids": null
+      "citations": null
     },
     "next_step": "Ask the user which design doc to update.",
     "missing_fields": ["path"],
@@ -109,7 +109,7 @@ Return **only** one JSON object (no markdown or prose outside it):
     "path": string | null,
     "matched_candidates": [ { "id": string, "label": string } ] | null,
     "content_excerpt": string | null,
-    "chunk_ids": string[] | null
+    "citations": string[] | null
   },
   "next_step": string | null,
   "missing_fields": string[] | null,

From 188ae053aca5b5d79fa06e51999860697c336948 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Fri, 19 Jun 2026 17:37:41 +0200
Subject: [PATCH 42/47] feat: serve numbered source_markdown reads with
 citation preamble

---
 .../filesystem/backends/kb_postgres.py        | 110 +++++++++++-------
 .../filesystem/tools/edit_file/index.py       |   2 +-
 .../filesystem/tools/move_file/helpers.py     |   2 +-
 .../filesystem/tools/read_file/index.py       |   4 +-
 .../middleware/filesystem/tools/rm/helpers.py |   2 +-
 .../test_b_filesystem_rm_rmdir_cloud.py       |   2 +-
 6 files changed, 78 insertions(+), 44 deletions(-)

diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/kb_postgres.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/kb_postgres.py
index e13196537..e704d5599 100644
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/kb_postgres.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/kb_postgres.py
@@ -45,6 +45,10 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.document_xml import (
     build_document_xml,
 )
+from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.numbered_document import (
+    build_read_preamble,
+    compute_matched_line_ranges,
+)
 from app.agents.chat.runtime.path_resolver import (
     DOCUMENTS_ROOT,
     build_path_index,
@@ -64,6 +68,12 @@ def _basename(path: str) -> str:
     return path.rsplit("/", 1)[-1]
 
 
+def _metadata_url(metadata: dict[str, Any]) -> str:
+    return (
+        metadata.get("url") or metadata.get("source") or metadata.get("page_url") or ""
+    )
+
+
 def _is_under(child: str, parent: str) -> bool:
     """Return True iff ``child`` is at-or-under ``parent`` (directory semantics)."""
     if parent == "/":
@@ -460,8 +470,11 @@ class KBPostgresBackend(BackendProtocol):
         loaded = await self._load_file_data(file_path)
         if loaded is None:
             return f"Error: File '{file_path}' not found"
-        file_data, _ = loaded
-        return format_read_response(file_data, offset, limit)
+        file_data, _, preamble = loaded
+        body = format_read_response(file_data, offset, limit)
+        if preamble and offset == 0:
+            return preamble + body
+        return body
 
     def read(self, file_path: str, offset: int = 0, limit: int = 2000) -> str:  # type: ignore[override]
         return asyncio.run(self.aread(file_path, offset, limit))
@@ -469,12 +482,14 @@ class KBPostgresBackend(BackendProtocol):
     async def _load_file_data(
         self,
         path: str,
-    ) -> tuple[dict[str, Any], int | None] | None:
+    ) -> tuple[dict[str, Any], int | None, str | None] | None:
         """Lazy-load a virtual KB document into a deepagents ``FileData``.
 
-        Returns ``(file_data, doc_id)`` or ``None`` if the path doesn't map
-        to any known document. ``doc_id`` is ``None`` for the synthetic
-        anonymous document so the caller doesn't track it as a DB-backed file.
+        Returns ``(file_data, doc_id, preamble)`` or ``None`` if the path
+        doesn't map to any known document. ``doc_id`` is ``None`` for the
+        synthetic anonymous document. ``preamble`` is the metadata header to
+        show above a numbered ``source_markdown`` body (``None`` for XML reads: the
+        anonymous document and the legacy chunk-reconstructed no-body fallback).
         """
         anon = self._kb_anon_doc()
         if anon and str(anon.get("path") or "") == path:
@@ -492,7 +507,7 @@ class KBPostgresBackend(BackendProtocol):
             }
             xml = build_document_xml(doc_payload, matched_chunk_ids=set())
             file_data = create_file_data(xml)
-            return file_data, None
+            return file_data, None, None
 
         if not path.startswith(DOCUMENTS_ROOT):
             return None
@@ -505,41 +520,58 @@ class KBPostgresBackend(BackendProtocol):
             )
             if document is None:
                 return None
-            chunk_rows = await session.execute(
-                select(Chunk.id, Chunk.content)
-                .where(Chunk.document_id == document.id)
-                .order_by(Chunk.position, Chunk.id)
-            )
-            chunks = [
-                {"chunk_id": row.id, "content": row.content} for row in chunk_rows.all()
-            ]
-
-        doc_payload = {
-            "document_id": document.id,
-            "chunks": chunks,
-            "matched_chunk_ids": list(self._matched_chunk_ids(document.id)),
-            "document": {
-                "id": document.id,
-                "title": document.title,
-                "document_type": (
-                    document.document_type.value
-                    if getattr(document, "document_type", None) is not None
-                    else "UNKNOWN"
-                ),
-                "metadata": dict(document.document_metadata or {}),
-            },
-            "source": (
+            source_markdown = document.source_markdown or ""
+            document_type = (
                 document.document_type.value
                 if getattr(document, "document_type", None) is not None
                 else "UNKNOWN"
-            ),
+            )
+            metadata = dict(document.document_metadata or {})
+            chunk_rows = await session.execute(
+                select(Chunk.id, Chunk.content, Chunk.start_char, Chunk.end_char)
+                .where(Chunk.document_id == document.id)
+                .order_by(Chunk.position, Chunk.id)
+            )
+            chunk_records = chunk_rows.all()
+            document_id = document.id
+            document_title = document.title
+
+        matched = self._matched_chunk_ids(document_id)
+
+        # Canonical read: serve the verbatim body with cat -n line numbers that
+        # line up with chunk char spans, so the agent cites real source lines.
+        if source_markdown:
+            ranges = compute_matched_line_ranges(
+                source_markdown,
+                [(r.id, r.start_char, r.end_char) for r in chunk_records],
+                matched,
+            )
+            preamble = build_read_preamble(
+                document_id=document_id,
+                document_type=document_type,
+                title=document_title,
+                url=_metadata_url(metadata),
+                matched_line_ranges=ranges,
+            )
+            return create_file_data(source_markdown), document_id, preamble
+
+        # Legacy fallback: no canonical body, reconstruct from chunks as XML.
+        doc_payload = {
+            "document_id": document_id,
+            "chunks": [
+                {"chunk_id": r.id, "content": r.content} for r in chunk_records
+            ],
+            "matched_chunk_ids": list(matched),
+            "document": {
+                "id": document_id,
+                "title": document_title,
+                "document_type": document_type,
+                "metadata": metadata,
+            },
+            "source": document_type,
         }
-        xml = build_document_xml(
-            doc_payload,
-            matched_chunk_ids=self._matched_chunk_ids(document.id),
-        )
-        file_data = create_file_data(xml)
-        return file_data, document.id
+        xml = build_document_xml(doc_payload, matched_chunk_ids=matched)
+        return create_file_data(xml), document_id, None
 
     # ------------------------------------------------------------------ writes
 
@@ -571,7 +603,7 @@ class KBPostgresBackend(BackendProtocol):
             loaded = await self._load_file_data(file_path)
             if loaded is None:
                 return EditResult(error=f"Error: File '{file_path}' not found")
-            file_data, _ = loaded
+            file_data, _, _ = loaded
 
         content = file_data_to_string(file_data)
         result = perform_string_replacement(
diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/edit_file/index.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/edit_file/index.py
index 775469531..036617d8d 100644
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/edit_file/index.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/edit_file/index.py
@@ -73,7 +73,7 @@ def create_edit_file_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool:
             loaded = await backend._load_file_data(validated)
             if loaded is None:
                 return f"Error: File '{validated}' not found"
-            _, doc_id_to_attach = loaded
+            _, doc_id_to_attach, _ = loaded
 
         res: EditResult = await backend.aedit(
             validated, old_string, new_string, replace_all=replace_all
diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/move_file/helpers.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/move_file/helpers.py
index ded4701f9..be61ca94f 100644
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/move_file/helpers.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/move_file/helpers.py
@@ -75,7 +75,7 @@ async def cloud_move_file(
         loaded = await backend._load_file_data(source)
         if loaded is None:
             return f"Error: source '{source}' not found."
-        source_file_data, loaded_doc_id = loaded
+        source_file_data, loaded_doc_id, _ = loaded
         if source_doc_id is None:
             source_doc_id = loaded_doc_id
 
diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/read_file/index.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/read_file/index.py
index 5c20619d6..6cbbe6ae5 100644
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/read_file/index.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/read_file/index.py
@@ -58,8 +58,10 @@ def create_read_file_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool:
             loaded = await backend._load_file_data(validated)
             if loaded is None:
                 return f"Error: File '{validated}' not found"
-            file_data, doc_id = loaded
+            file_data, doc_id, preamble = loaded
             rendered = format_read_response(file_data, offset, limit)
+            if preamble and offset == 0:
+                rendered = preamble + rendered
             update: dict[str, Any] = {
                 "files": {validated: file_data},
                 "messages": [
diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/rm/helpers.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/rm/helpers.py
index e2e445d08..020200cbd 100644
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/rm/helpers.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/rm/helpers.py
@@ -74,7 +74,7 @@ async def cloud_rm(
         loaded = await backend._load_file_data(validated)
         if loaded is None:
             return f"Error: file '{validated}' not found."
-        _, resolved_doc_id = loaded
+        _, resolved_doc_id, _ = loaded
 
     files_update: dict[str, Any] = {validated: None}
     update: dict[str, Any] = {
diff --git a/surfsense_backend/tests/unit/middleware/test_b_filesystem_rm_rmdir_cloud.py b/surfsense_backend/tests/unit/middleware/test_b_filesystem_rm_rmdir_cloud.py
index 898ec3765..27653c544 100644
--- a/surfsense_backend/tests/unit/middleware/test_b_filesystem_rm_rmdir_cloud.py
+++ b/surfsense_backend/tests/unit/middleware/test_b_filesystem_rm_rmdir_cloud.py
@@ -71,7 +71,7 @@ class _KBBackendStub(KBPostgresBackend):
     def __init__(self, *, children=None, file_data=None) -> None:
         self.als_info = AsyncMock(return_value=children or [])
         self._load_file_data = AsyncMock(
-            return_value=(file_data, 17) if file_data is not None else None
+            return_value=(file_data, 17, None) if file_data is not None else None
         )
 
 

From 73dd4e8e3a3e8026880cf738cef6008d584ce5a3 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Fri, 19 Jun 2026 17:37:41 +0200
Subject: [PATCH 43/47] feat: embed line-citation tokens in search hits

---
 .../main_agent/tools/search_knowledge_base.py | 41 +++++++++++--------
 .../tools/test_search_knowledge_base.py       | 18 +++++---
 2 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/tools/search_knowledge_base.py b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/tools/search_knowledge_base.py
index ad47816f9..0696dc92e 100644
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/tools/search_knowledge_base.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/tools/search_knowledge_base.py
@@ -112,20 +112,25 @@ async def _resolve_doc_context(
     return paths, bodies
 
 
-def _line_label(chunk: dict[str, Any], body: str | None) -> str:
-    """``[lines X-Y]`` for a span-bearing chunk, or '' when spans are absent."""
+def _citation_token(chunk: dict[str, Any], body: str | None, doc_id: int | None) -> str:
+    """Ready-to-copy ``[citation:dID#Lstart-end]`` token, or '' without body, doc id, or spans."""
     start = chunk.get("start_char")
     end = chunk.get("end_char")
-    if not body or not isinstance(start, int) or not isinstance(end, int):
+    if (
+        not body
+        or not isinstance(doc_id, int)
+        or not isinstance(start, int)
+        or not isinstance(end, int)
+    ):
         return ""
     start_line, end_line = char_span_to_line_range(body, start, end)
-    if start_line == end_line:
-        return f"[line {start_line}]"
-    return f"[lines {start_line}-{end_line}]"
+    return f"[citation:d{doc_id}#L{start_line}-{end_line}]"
 
 
-def _render_passage(chunk: dict[str, Any], body: str | None) -> str | None:
-    """Render one matched chunk as an indented, line-annotated passage."""
+def _render_passage(
+    chunk: dict[str, Any], body: str | None, doc_id: int | None
+) -> str | None:
+    """Render one matched chunk as an indented passage tagged with its token."""
     content = (chunk.get("content") or "").strip()
     if not content:
         return None
@@ -133,12 +138,14 @@ def _render_passage(chunk: dict[str, Any], body: str | None) -> str | None:
     if len(content) > _PER_DOC_SNIPPET_CHARS:
         snippet += " ..."
     indented = snippet.replace("\n", "\n   ")
-    label = _line_label(chunk, body)
-    head = f"\n   {label}" if label else ""
+    token = _citation_token(chunk, body, doc_id)
+    head = f"\n   {token}" if token else ""
     return f"{head}\n   {indented}"
 
 
-def _matched_passages(doc: dict[str, Any], body: str | None) -> str:
+def _matched_passages(
+    doc: dict[str, Any], body: str | None, doc_id: int | None
+) -> str:
     """Render the RRF-matched chunks; '' when none can be rendered."""
     by_id = {
         c.get("chunk_id"): c
@@ -150,7 +157,7 @@ def _matched_passages(doc: dict[str, Any], body: str | None) -> str:
         chunk = by_id.get(chunk_id)
         if chunk is None:
             continue
-        passage = _render_passage(chunk, body)
+        passage = _render_passage(chunk, body, doc_id)
         if passage:
             rendered.append(passage)
     return "".join(rendered)
@@ -194,11 +201,12 @@ def _format_hits(
         path = paths.get(doc_id) if isinstance(doc_id, int) else None
         body = bodies.get(doc_id) if isinstance(doc_id, int) else None
 
-        header = f"\n{rank}. {title} (type={doc_type}, score={score_str})" + (
+        id_str = f"id={doc_id}, " if isinstance(doc_id, int) else ""
+        header = f"\n{rank}. {title} ({id_str}type={doc_type}, score={score_str})" + (
             f"\n   path: {path}" if path else ""
         )
 
-        passages = _matched_passages(doc, body)
+        passages = _matched_passages(doc, body, doc_id if isinstance(doc_id, int) else None)
         entry = header + (passages or _fallback_snippet(doc))
         if total + len(entry) > _MAX_TOTAL_CHARS:
             lines.append("\n<!-- additional matches truncated to fit context -->")
@@ -207,8 +215,9 @@ def _format_hits(
         total += len(entry)
 
     lines.append(
-        "\n\nTo read a full document, delegate to the knowledge_base specialist "
-        "with `task`, referencing the path above."
+        "\n\nTo cite a matched passage, copy its [citation:dID#Lstart-end] token "
+        "verbatim. To quote more context or read the full document, delegate to "
+        "the knowledge_base specialist with `task` using the path above."
     )
     lines.append("\n</knowledge_base_results>")
     return "".join(lines)
diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/tools/test_search_knowledge_base.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/tools/test_search_knowledge_base.py
index eadfcd30d..e068792b1 100644
--- a/surfsense_backend/tests/unit/agents/multi_agent_chat/tools/test_search_knowledge_base.py
+++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/tools/test_search_knowledge_base.py
@@ -51,20 +51,28 @@ def test_renders_matched_passage_not_top_of_doc() -> None:
     assert "Intro paragraph." not in out
 
 
-def test_includes_line_range_when_spans_present() -> None:
+def test_emits_copyable_line_citation_token_when_spans_present() -> None:
     out = _format_hits([_hit()], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q")
-    # "Matched passage here." sits on line 3 of the body.
-    assert "line 3" in out
+    # "Matched passage here." sits on line 3 of the body; the hit must surface
+    # a ready-to-copy token so the agent can cite without a separate read.
+    assert "[citation:d7#L3-3]" in out
 
 
-def test_omits_line_range_when_spans_absent() -> None:
+def test_header_includes_document_id() -> None:
+    out = _format_hits([_hit()], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q")
+    assert "id=7" in out
+
+
+def test_omits_citation_token_when_spans_absent() -> None:
     hit = _hit()
     for chunk in hit["chunks"]:
         chunk["start_char"] = None
         chunk["end_char"] = None
     out = _format_hits([hit], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q")
     assert "Matched passage here." in out
-    assert "[line" not in out
+    # No concrete, copyable token for this document without spans (the closing
+    # instruction's placeholder template doesn't count).
+    assert "[citation:d7#L" not in out
 
 
 def test_falls_back_to_content_when_no_matched_ids() -> None:

From 5f341bdd2fa35b0184f5522f6fc2d5543b945f28 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Fri, 19 Jun 2026 17:37:41 +0200
Subject: [PATCH 44/47] feat: parse and render kb line citations

---
 .../assistant-ui/inline-citation.tsx          | 46 +++++++++++++++++++
 .../citations/citation-renderer.tsx           | 12 ++++-
 .../editor/plugins/citation-kit.tsx           | 35 +++++++++++---
 .../lib/citations/citation-parser.ts          | 18 ++++++--
 4 files changed, 101 insertions(+), 10 deletions(-)

diff --git a/surfsense_web/components/assistant-ui/inline-citation.tsx b/surfsense_web/components/assistant-ui/inline-citation.tsx
index 59a10739c..28f5212ae 100644
--- a/surfsense_web/components/assistant-ui/inline-citation.tsx
+++ b/surfsense_web/components/assistant-ui/inline-citation.tsx
@@ -2,9 +2,11 @@
 
 import { useSetAtom } from "jotai";
 import { FileText } from "lucide-react";
+import { useParams } from "next/navigation";
 import type { FC } from "react";
 import { useId, useState } from "react";
 import { openCitationPanelAtom } from "@/atoms/citation/citation-panel.atom";
+import { openEditorPanelAtom } from "@/atoms/editor/editor-panel.atom";
 import { useCitationMetadata } from "@/components/assistant-ui/citation-metadata-context";
 import { CitationPanelContent } from "@/components/citation-panel/citation-panel";
 import { Citation } from "@/components/tool-ui/citation";
@@ -108,6 +110,50 @@ const NumericChunkCitation: FC<{ chunkId: number }> = ({ chunkId }) => {
 	);
 };
 
+interface LineCitationProps {
+	documentId: number;
+	startLine: number;
+	endLine: number;
+}
+
+/**
+ * Inline citation for a knowledge-base document line range
+ * (`[citation:d<documentId>#L<start>-<end>]`). Clicking opens the document in
+ * the editor's read-only source view, scrolled to and highlighting the cited
+ * lines — the same anchor the citation panel uses for chunk citations.
+ */
+export const LineCitation: FC<LineCitationProps> = ({ documentId, startLine, endLine }) => {
+	const openEditorPanel = useSetAtom(openEditorPanelAtom);
+	const params = useParams();
+	const searchSpaceId = Number(params?.search_space_id);
+
+	const label = startLine === endLine ? `L${startLine}` : `L${startLine}-${endLine}`;
+
+	const handleClick = () => {
+		if (!Number.isFinite(searchSpaceId)) return;
+		openEditorPanel({
+			documentId,
+			searchSpaceId,
+			highlightLines: { start: startLine, end: endLine },
+			forceSourceView: true,
+		});
+	};
+
+	return (
+		<Button
+			type="button"
+			variant="ghost"
+			onClick={handleClick}
+			className="ml-0.5 inline-flex h-5 min-w-5 items-center justify-center gap-0.5 rounded-md bg-popover px-1.5 text-[11px] font-medium text-popover-foreground/80 align-baseline"
+			title={`View cited lines ${startLine}–${endLine}`}
+			aria-label={`View cited document lines ${startLine} to ${endLine}`}
+		>
+			<FileText className="size-3" />
+			{label}
+		</Button>
+	);
+};
+
 import { tryGetHostname } from "@/lib/url";
 
 interface UrlCitationProps {
diff --git a/surfsense_web/components/citations/citation-renderer.tsx b/surfsense_web/components/citations/citation-renderer.tsx
index f2de4b27d..b0ab13f84 100644
--- a/surfsense_web/components/citations/citation-renderer.tsx
+++ b/surfsense_web/components/citations/citation-renderer.tsx
@@ -1,7 +1,7 @@
 "use client";
 
 import type { ReactNode } from "react";
-import { InlineCitation, UrlCitation } from "@/components/assistant-ui/inline-citation";
+import { InlineCitation, LineCitation, UrlCitation } from "@/components/assistant-ui/inline-citation";
 import {
 	type CitationToken,
 	type CitationUrlMap,
@@ -21,6 +21,16 @@ export function renderCitationToken(token: CitationToken, ordinalKey: number): R
 	if (token.kind === "url") {
 		return <UrlCitation key={`citation-url-${ordinalKey}`} url={token.url} />;
 	}
+	if (token.kind === "line") {
+		return (
+			<LineCitation
+				key={`citation-line-${token.documentId}-${token.startLine}-${ordinalKey}`}
+				documentId={token.documentId}
+				startLine={token.startLine}
+				endLine={token.endLine}
+			/>
+		);
+	}
 	return (
 		<InlineCitation
 			key={`citation-${token.isDocsChunk ? "doc-" : ""}${token.chunkId}-${ordinalKey}`}
diff --git a/surfsense_web/components/editor/plugins/citation-kit.tsx b/surfsense_web/components/editor/plugins/citation-kit.tsx
index 97e8ec723..edba9a19e 100644
--- a/surfsense_web/components/editor/plugins/citation-kit.tsx
+++ b/surfsense_web/components/editor/plugins/citation-kit.tsx
@@ -3,9 +3,10 @@
 import { type Descendant, KEYS } from "platejs";
 import { createPlatePlugin, type PlateElementProps } from "platejs/react";
 import type { FC } from "react";
-import { InlineCitation, UrlCitation } from "@/components/assistant-ui/inline-citation";
+import { InlineCitation, LineCitation, UrlCitation } from "@/components/assistant-ui/inline-citation";
 import {
 	CITATION_REGEX,
+	type CitationToken,
 	type CitationUrlMap,
 	parseTextWithCitations,
 } from "@/lib/citations/citation-parser";
@@ -17,9 +18,12 @@ import {
  */
 export type CitationElementNode = {
 	type: "citation";
-	kind: "chunk" | "doc" | "url";
+	kind: "chunk" | "doc" | "url" | "line";
 	chunkId?: number;
 	url?: string;
+	documentId?: number;
+	startLine?: number;
+	endLine?: number;
 	/** Original literal token that produced this citation node. */
 	rawText: string;
 	children: [{ text: "" }];
@@ -33,11 +37,22 @@ const CitationElement: FC<PlateElementProps<CitationElementNode>> = ({
 	element,
 }) => {
 	const isUrl = element.kind === "url";
+	const isLine =
+		element.kind === "line" &&
+		element.documentId !== undefined &&
+		element.startLine !== undefined &&
+		element.endLine !== undefined;
 	return (
 		<span {...attributes} className="inline-flex align-baseline">
 			<span contentEditable={false}>
 				{isUrl && element.url ? (
 					<UrlCitation url={element.url} />
+				) : isLine ? (
+					<LineCitation
+						documentId={element.documentId as number}
+						startLine={element.startLine as number}
+						endLine={element.endLine as number}
+					/>
 				) : element.chunkId !== undefined ? (
 					<InlineCitation chunkId={element.chunkId} isDocsChunk={element.kind === "doc"} />
 				) : null}
@@ -97,10 +112,7 @@ function copyMarks(textNode: SlateText): Record<string, unknown> {
 	return marks;
 }
 
-function makeCitationElement(
-	rawText: string,
-	segment: { kind: "url"; url: string } | { kind: "chunk"; chunkId: number; isDocsChunk: boolean }
-): CitationElementNode {
+function makeCitationElement(rawText: string, segment: CitationToken): CitationElementNode {
 	if (segment.kind === "url") {
 		return {
 			type: CITATION_TYPE,
@@ -110,6 +122,17 @@ function makeCitationElement(
 			children: [{ text: "" }],
 		};
 	}
+	if (segment.kind === "line") {
+		return {
+			type: CITATION_TYPE,
+			kind: "line",
+			documentId: segment.documentId,
+			startLine: segment.startLine,
+			endLine: segment.endLine,
+			rawText,
+			children: [{ text: "" }],
+		};
+	}
 	return {
 		type: CITATION_TYPE,
 		kind: segment.isDocsChunk ? "doc" : "chunk",
diff --git a/surfsense_web/lib/citations/citation-parser.ts b/surfsense_web/lib/citations/citation-parser.ts
index 533c644c2..0d320956f 100644
--- a/surfsense_web/lib/citations/citation-parser.ts
+++ b/surfsense_web/lib/citations/citation-parser.ts
@@ -18,12 +18,16 @@ import { FENCED_OR_INLINE_CODE } from "@/lib/markdown/code-regions";
  * sometimes emit.
  */
 export const CITATION_REGEX =
-	/[[【]\u200B?citation:\s*(https?:\/\/[^\]】\u200B]+|urlcite\d+|(?:doc-)?-?\d+(?:\s*,\s*(?:doc-)?-?\d+)*)\s*\u200B?[\]】]/g;
+	/[[【]\u200B?citation:\s*(https?:\/\/[^\]】\u200B]+|urlcite\d+|d\d+#L\d+-\d+|(?:doc-)?-?\d+(?:\s*,\s*(?:doc-)?-?\d+)*)\s*\u200B?[\]】]/g;
+
+/** Matches the knowledge-base line-citation form `d<documentId>#L<start>-<end>`. */
+const LINE_CITATION_REGEX = /^d(\d+)#L(\d+)-(\d+)$/;
 
 /** A single parsed citation reference. */
 export type CitationToken =
 	| { kind: "url"; url: string }
-	| { kind: "chunk"; chunkId: number; isDocsChunk: boolean };
+	| { kind: "chunk"; chunkId: number; isDocsChunk: boolean }
+	| { kind: "line"; documentId: number; startLine: number; endLine: number };
 
 /** Output of `parseTextWithCitations` — interleaved text + citation tokens. */
 export type ParsedSegment = string | CitationToken;
@@ -95,7 +99,15 @@ export function parseTextWithCitations(text: string, urlMap: CitationUrlMap): Pa
 
 		const captured = match[1];
 
-		if (captured.startsWith("http://") || captured.startsWith("https://")) {
+		const lineMatch = LINE_CITATION_REGEX.exec(captured);
+		if (lineMatch) {
+			segments.push({
+				kind: "line",
+				documentId: Number.parseInt(lineMatch[1], 10),
+				startLine: Number.parseInt(lineMatch[2], 10),
+				endLine: Number.parseInt(lineMatch[3], 10),
+			});
+		} else if (captured.startsWith("http://") || captured.startsWith("https://")) {
 			segments.push({ kind: "url", url: captured.trim() });
 		} else if (captured.startsWith("urlcite")) {
 			const url = urlMap.get(captured);

From cfafed09bc76c5bcb6427998091b2c120a2a2185 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Fri, 19 Jun 2026 19:12:14 +0200
Subject: [PATCH 45/47] fix: forward citation line anchor to editor panel and
 harden reveal

---
 .../components/editor/source-code-editor.tsx  | 33 ++++++++++++-------
 .../layout/ui/right-panel/RightPanel.tsx      |  2 ++
 2 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/surfsense_web/components/editor/source-code-editor.tsx b/surfsense_web/components/editor/source-code-editor.tsx
index 0277cde85..4af4f2125 100644
--- a/surfsense_web/components/editor/source-code-editor.tsx
+++ b/surfsense_web/components/editor/source-code-editor.tsx
@@ -49,15 +49,20 @@ export function SourceCodeEditor({
 		}
 		const range = highlightLinesRef.current;
 		if (!range) return;
-		const start = Math.max(1, Math.floor(range.start));
-		const end = Math.max(start, Math.floor(range.end));
-		decorationsRef.current = editor.createDecorationsCollection([
-			{
-				range: new monaco.Range(start, 1, end, 1),
-				options: { isWholeLine: true, className: "citation-line-highlight" },
-			},
-		]);
-		editor.revealLinesInCenter(start, end);
+		const lineCount = editor.getModel()?.getLineCount() ?? range.end;
+		const start = Math.min(Math.max(1, Math.floor(range.start)), lineCount);
+		const end = Math.min(Math.max(start, Math.floor(range.end)), lineCount);
+		try {
+			decorationsRef.current = editor.createDecorationsCollection([
+				{
+					range: new monaco.Range(start, 1, end, 1),
+					options: { isWholeLine: true, className: "citation-line-highlight" },
+				},
+			]);
+		} catch {
+			// Decoration failure must not block the reveal below.
+		}
+		editor.revealLinesInCenter(start, end, monaco.editor.ScrollType.Immediate);
 	}, []);
 
 	useEffect(() => {
@@ -138,8 +143,14 @@ export function SourceCodeEditor({
 					monacoRef.current = monaco;
 					editorRef.current = editor;
 					applySidebarTheme(monaco);
-					// Defer one frame so the model is laid out before revealing.
-					requestAnimationFrame(() => applyHighlight());
+					// Reveal now, then once more on the first layout change:
+					// the panel slide-in animation means the editor often has no
+					// usable viewport height on the initial frame.
+					applyHighlight();
+					const layoutSub = editor.onDidLayoutChange(() => {
+						applyHighlight();
+						layoutSub.dispose();
+					});
 					if (!isManualSaveEnabled) return;
 					editor.addCommand(monaco.KeyMod.CtrlCmd | monaco.KeyCode.KeyS, () => {
 						void onSaveRef.current?.();
diff --git a/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx b/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx
index 5a7588979..bfad44dd8 100644
--- a/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx
+++ b/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx
@@ -308,6 +308,8 @@ export function RightPanel({
 							searchSpaceId={editorState.searchSpaceId ?? undefined}
 							title={editorState.title}
 							onClose={closeEditor}
+							highlightLines={editorState.highlightLines}
+							forceSourceView={editorState.forceSourceView}
 						/>
 					</div>
 				)}

From cfc3be5b1fcc19d7e857c23ec21ad3337ec226a0 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Fri, 19 Jun 2026 19:22:19 +0200
Subject: [PATCH 46/47] fix: gate desktop right panel to prevent duplicate
 mobile editor

---
 .../components/layout/ui/right-panel/RightPanel.tsx         | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx b/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx
index bfad44dd8..6662d7830 100644
--- a/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx
+++ b/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx
@@ -12,6 +12,7 @@ import { rightPanelCollapsedAtom, rightPanelTabAtom } from "@/atoms/layout/right
 import { Button } from "@/components/ui/button";
 import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
 import { closeHitlEditPanelAtom, hitlEditPanelAtom } from "@/features/chat-messages/hitl";
+import { useMediaQuery } from "@/hooks/use-media-query";
 import { cn } from "@/lib/utils";
 import { DocumentsSidebar } from "../sidebar";
 
@@ -196,6 +197,9 @@ export function RightPanel({
 	const citationState = useAtomValue(citationPanelAtom);
 	const closeCitation = useSetAtom(closeCitationPanelAtom);
 	const [collapsed, setCollapsed] = useAtom(rightPanelCollapsedAtom);
+	// Desktop-only surface; mobile uses the dedicated Mobile* drawers. Without
+	// this guard both render together and two editors fight over one model.
+	const isDesktop = useMediaQuery("(min-width: 1024px)");
 
 	const documentsOpen = documentsPanel?.open ?? false;
 	const reportOpen = reportState.isOpen && !!reportState.reportId;
@@ -267,7 +271,7 @@ export function RightPanel({
 		<CollapseButton onClick={() => setCollapsed(true)} />
 	) : null;
 
-	if (!isVisible) return null;
+	if (!isVisible || !isDesktop) return null;
 
 	return (
 		<aside

From 31a14190e3547a4bc7f7db97b721fabbc62e9ff7 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Fri, 19 Jun 2026 19:36:26 +0200
Subject: [PATCH 47/47] fix: update upload conftest mock to span-aware chunker

---
 .../tests/integration/document_upload/conftest.py          | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/surfsense_backend/tests/integration/document_upload/conftest.py b/surfsense_backend/tests/integration/document_upload/conftest.py
index bd889360f..f73c4eaaf 100644
--- a/surfsense_backend/tests/integration/document_upload/conftest.py
+++ b/surfsense_backend/tests/integration/document_upload/conftest.py
@@ -286,9 +286,12 @@ def _mock_external_apis(monkeypatch):
         "app.indexing_pipeline.cache.cached_indexing.embed_texts",
         MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]),
     )
+    from app.indexing_pipeline.document_chunker import ChunkSlice
+
+    chunk = "Test chunk content."
     monkeypatch.setattr(
-        "app.indexing_pipeline.cache.cached_indexing.chunk_text",
-        MagicMock(return_value=["Test chunk content."]),
+        "app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
+        MagicMock(return_value=[ChunkSlice(chunk, 0, len(chunk))]),
     )