From f67c6607d60a27427fc715980e0a2b30c8cde37a Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Fri, 19 Jun 2026 15:31:44 +0200
Subject: [PATCH] feat: by-chunk resolve derives cited line range

---
 .../app/routes/documents_routes.py            | 23 ++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py
index 53f03a0ca..ea6b0d4fa 100644
--- a/surfsense_backend/app/routes/documents_routes.py
+++ b/surfsense_backend/app/routes/documents_routes.py
@@ -37,6 +37,7 @@ from app.schemas import (
 from app.services.task_dispatcher import TaskDispatcher, get_task_dispatcher
 from app.users import current_active_user
 from app.utils.rbac import check_permission
+from app.utils.text_spans import char_span_to_line_range
 
 try:
     asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
@@ -967,9 +968,12 @@ async def get_document_by_chunk_id(
     session: AsyncSession = Depends(get_async_session),
     user: User = Depends(current_active_user),
 ):
-    """
-    Retrieves a document based on a chunk ID, including a window of chunks around the cited one.
-    Uses SQL-level pagination to avoid loading all chunks into memory.
+    """Resolve a chunk id to its document plus a window of surrounding chunks.
+
+    Returns the cited chunk's 1-based line range (cited_start_line/
+    cited_end_line) so callers can anchor the citation to exact source lines;
+    both are None unless the chunk has char spans and the document has source
+    markdown. Uses SQL-level pagination to avoid loading all chunks into memory.
     """
     try:
         from sqlalchemy import and_, func, or_
@@ -1033,6 +1037,17 @@ async def get_document_by_chunk_id(
         )
         windowed_chunks = windowed_result.scalars().all()
 
+        cited_start_line: int | None = None
+        cited_end_line: int | None = None
+        if (
+            chunk.start_char is not None
+            and chunk.end_char is not None
+            and document.source_markdown
+        ):
+            cited_start_line, cited_end_line = char_span_to_line_range(
+                document.source_markdown, chunk.start_char, chunk.end_char
+            )
+
         return DocumentWithChunksRead(
             id=document.id,
             title=document.title,
@@ -1047,6 +1062,8 @@ async def get_document_by_chunk_id(
             chunks=windowed_chunks,
             total_chunks=total_chunks,
             chunk_start_index=start,
+            cited_start_line=cited_start_line,
+            cited_end_line=cited_end_line,
         )
     except HTTPException:
         raise