From f67c6607d60a27427fc715980e0a2b30c8cde37a Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 19 Jun 2026 15:31:44 +0200 Subject: [PATCH] feat: by-chunk resolve derives cited line range --- .../app/routes/documents_routes.py | 23 ++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py index 53f03a0ca..ea6b0d4fa 100644 --- a/surfsense_backend/app/routes/documents_routes.py +++ b/surfsense_backend/app/routes/documents_routes.py @@ -37,6 +37,7 @@ from app.schemas import ( from app.services.task_dispatcher import TaskDispatcher, get_task_dispatcher from app.users import current_active_user from app.utils.rbac import check_permission +from app.utils.text_spans import char_span_to_line_range try: asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy()) @@ -967,9 +968,12 @@ async def get_document_by_chunk_id( session: AsyncSession = Depends(get_async_session), user: User = Depends(current_active_user), ): - """ - Retrieves a document based on a chunk ID, including a window of chunks around the cited one. - Uses SQL-level pagination to avoid loading all chunks into memory. + """Resolve a chunk id to its document plus a window of surrounding chunks. + + Returns the cited chunk's 1-based line range (cited_start_line/ + cited_end_line) when char spans exist, so callers can anchor the citation + to exact source lines. Uses SQL-level pagination to avoid loading all + chunks into memory. """ try: from sqlalchemy import and_, func, or_ @@ -1033,6 +1037,17 @@ async def get_document_by_chunk_id( ) windowed_chunks = windowed_result.scalars().all() + cited_start_line: int | None = None + cited_end_line: int | None = None + if ( + chunk.start_char is not None + and chunk.end_char is not None + and document.source_markdown + ): + cited_start_line, cited_end_line = char_span_to_line_range( + document.source_markdown, chunk.start_char, chunk.end_char + ) + return DocumentWithChunksRead( id=document.id, title=document.title, @@ -1047,6 +1062,8 @@ async def get_document_by_chunk_id( chunks=windowed_chunks, total_chunks=total_chunks, chunk_start_index=start, + cited_start_line=cited_start_line, + cited_end_line=cited_end_line, ) except HTTPException: raise