Merge remote-tracking branch 'upstream/dev' into feat/api-key

2026-07-04 22:02:16 +02:00 · 2026-06-23 13:09:53 +05:30 · 2026-06-23 13:09:53 +05:30 · 3695e1d5c5
commit 3695e1d5c5
parent 96c1dd9d4f 1dc3fac81d
64 changed files with 1043 additions and 1852 deletions
--- a/surfsense_backend/app/routes/documents_routes.py
+++ b/surfsense_backend/app/routes/documents_routes.py
@@ -38,7 +38,6 @@ from app.schemas import (
 from app.services.task_dispatcher import TaskDispatcher, get_task_dispatcher
 from app.users import get_auth_context
 from app.utils.rbac import check_permission
-from app.utils.text_spans import char_span_to_line_range

 try:
    asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
@@ -977,12 +976,9 @@ async def get_document_by_chunk_id(
    session: AsyncSession = Depends(get_async_session),
    auth: AuthContext = Depends(get_auth_context),
 ):
-    """Resolve a chunk id to its document plus a window of surrounding chunks.
-
-    Returns the cited chunk's 1-based line range (cited_start_line/
-    cited_end_line) when char spans exist, so callers can anchor the citation
-    to exact source lines. Uses SQL-level pagination to avoid loading all
-    chunks into memory.
+    """
+    Retrieves a document based on a chunk ID, including a window of chunks around the cited one.
+    Uses SQL-level pagination to avoid loading all chunks into memory.
    """
    try:
        from sqlalchemy import and_, func, or_
@@ -1046,17 +1042,6 @@ async def get_document_by_chunk_id(
        )
        windowed_chunks = windowed_result.scalars().all()

-        cited_start_line: int | None = None
-        cited_end_line: int | None = None
-        if (
-            chunk.start_char is not None
-            and chunk.end_char is not None
-            and document.source_markdown
-        ):
-            cited_start_line, cited_end_line = char_span_to_line_range(
-                document.source_markdown, chunk.start_char, chunk.end_char
-            )
-
        return DocumentWithChunksRead(
            id=document.id,
            title=document.title,
@@ -1071,8 +1056,6 @@ async def get_document_by_chunk_id(
            chunks=windowed_chunks,
            total_chunks=total_chunks,
            chunk_start_index=start,
-            cited_start_line=cited_start_line,
-            cited_end_line=cited_end_line,
        )
    except HTTPException:
        raise
--- a/surfsense_backend/app/routes/editor_routes.py
+++ b/surfsense_backend/app/routes/editor_routes.py
@@ -43,34 +43,6 @@ EDITOR_PLATE_MAX_BYTES = 1 * 1024 * 1024
 EDITOR_PLATE_MAX_LINES = 5000


-def _raise_no_canonical_body(document: Document) -> None:
-    """Translate a missing source_markdown into a status-aware HTTP error."""
-    doc_status = document.status or {}
-    state = (
-        doc_status.get("state", "ready") if isinstance(doc_status, dict) else "ready"
-    )
-
-    if state in ("pending", "processing"):
-        raise HTTPException(
-            status_code=409,
-            detail="This document is still being processed. Please wait a moment and try again.",
-        )
-    if state == "failed":
-        reason = (
-            doc_status.get("reason", "Unknown error")
-            if isinstance(doc_status, dict)
-            else "Unknown error"
-        )
-        raise HTTPException(
-            status_code=422,
-            detail=f"Processing failed: {reason}. You can delete this document and re-upload it.",
-        )
-    raise HTTPException(
-        status_code=400,
-        detail="This document has no editable content. It may not have been processed correctly. Try re-indexing or re-uploading it.",
-    )
-
-
 @router.get("/search-spaces/{search_space_id}/documents/{document_id}/editor-content")
 async def get_editor_content(
    search_space_id: int,
@@ -82,9 +54,8 @@ async def get_editor_content(
    """
    Get document content for editing.

-    Returns source_markdown (the canonical body) for the Plate.js editor, with a
-    one-time migration from legacy blocknote_document. Never reconstructs the
-    body from chunks.
+    Returns source_markdown for the Plate.js editor.
+    Falls back to blocknote_document → markdown conversion, then chunk reconstruction.

    Requires DOCUMENTS_READ permission.
    """
@@ -154,9 +125,52 @@ async def get_editor_content(
        await session.commit()
        return _build_response(empty_markdown)

-    # No canonical body. Chunks are an index artifact, never the source of
-    # truth, so surface the processing state instead of rebuilding from them.
-    _raise_no_canonical_body(document)
+    chunk_contents_result = await session.execute(
+        select(Chunk.content)
+        .filter(Chunk.document_id == document_id)
+        .order_by(Chunk.position, Chunk.id)
+    )
+    chunk_contents = chunk_contents_result.scalars().all()
+
+    if not chunk_contents:
+        doc_status = document.status or {}
+        state = (
+            doc_status.get("state", "ready")
+            if isinstance(doc_status, dict)
+            else "ready"
+        )
+        if state in ("pending", "processing"):
+            raise HTTPException(
+                status_code=409,
+                detail="This document is still being processed. Please wait a moment and try again.",
+            )
+        if state == "failed":
+            reason = (
+                doc_status.get("reason", "Unknown error")
+                if isinstance(doc_status, dict)
+                else "Unknown error"
+            )
+            raise HTTPException(
+                status_code=422,
+                detail=f"Processing failed: {reason}. You can delete this document and re-upload it.",
+            )
+        raise HTTPException(
+            status_code=400,
+            detail="This document has no content. It may not have been processed correctly. Try deleting and re-uploading it.",
+        )
+
+    markdown_content = "\n\n".join(chunk_contents)
+
+    if not markdown_content.strip():
+        raise HTTPException(
+            status_code=400,
+            detail="This document appears to be empty. Try re-uploading or editing it to add content.",
+        )
+
+    document.source_markdown = markdown_content
+    await session.commit()
+
+    return _build_response(markdown_content)


 @router.get(
@@ -170,9 +184,8 @@ async def download_document_markdown(
 ):
    user = auth.user
    """
-    Download the canonical document body as a .md file.
-
-    Serves source_markdown, migrating legacy blocknote_document when present.
+    Download the full document content as a .md file.
+    Reconstructs markdown from source_markdown or chunks.
    """
    await check_permission(
        session,
@@ -198,6 +211,15 @@ async def download_document_markdown(
        from app.utils.blocknote_to_markdown import blocknote_to_markdown

        markdown = blocknote_to_markdown(document.blocknote_document)
+    if markdown is None:
+        chunk_contents_result = await session.execute(
+            select(Chunk.content)
+            .filter(Chunk.document_id == document_id)
+            .order_by(Chunk.position, Chunk.id)
+        )
+        chunk_contents = chunk_contents_result.scalars().all()
+        if chunk_contents:
+            markdown = "\n\n".join(chunk_contents)

    if not markdown or not markdown.strip():
        raise HTTPException(
@@ -340,6 +362,15 @@ async def export_document(
        from app.utils.blocknote_to_markdown import blocknote_to_markdown

        markdown_content = blocknote_to_markdown(document.blocknote_document)
+    if markdown_content is None:
+        chunk_contents_result = await session.execute(
+            select(Chunk.content)
+            .filter(Chunk.document_id == document_id)
+            .order_by(Chunk.position, Chunk.id)
+        )
+        chunk_contents = chunk_contents_result.scalars().all()
+        if chunk_contents:
+            markdown_content = "\n\n".join(chunk_contents)

    if not markdown_content or not markdown_content.strip():
        raise HTTPException(status_code=400, detail="Document has no content to export")
--- a/surfsense_backend/app/routes/image_generation_routes.py
+++ b/surfsense_backend/app/routes/image_generation_routes.py
@@ -214,7 +214,7 @@ async def _execute_image_generation(
        )

    # Store response
-    image_gen.response_data = (
+    response_dict = (
        response.model_dump() if hasattr(response, "model_dump") else dict(response)
    )
    if not image_gen.model and hasattr(response, "_hidden_params"):
@@ -222,6 +222,20 @@ async def _execute_image_generation(
        if isinstance(hidden, dict) and hidden.get("model"):
            image_gen.model = hidden["model"]

+    # Fix relative URLs in response data (for the serving endpoint)
+    from urllib.parse import urlparse
+    images = response_dict.get("data", [])
+    provider_base_url = resolved_kwargs.get("api_base")
+    for image in images:
+        if image.get("url"):
+            raw_url: str = image["url"]
+            if raw_url.startswith("/") and provider_base_url:
+                parsed = urlparse(provider_base_url)
+                origin = f"{parsed.scheme}://{parsed.netloc}"
+                image["url"] = f"{origin}{raw_url}"
+
+    image_gen.response_data = response_dict
+

 # =============================================================================
 # Image Generation Execution + Results CRUD