mirror of https://github.com/MODSetter/SurfSense.git
synced 2026-04-26 17:26:23 +02:00
refactor: streamline document upload limits and enhance handling of mentioned documents
- Raised the maximum file size limit to 500 MB per file.
- Removed the restrictions on the number of files per upload and on total upload size.
- Improved handling of user-mentioned documents in the knowledge base search middleware.
- Reworked document reading and processing logic to support the new features and optimizations.
parent 6727266107
commit 62e698d8aa

33 changed files with 2889 additions and 2443 deletions
```diff
@@ -1,7 +1,7 @@
 # Force asyncio to use standard event loop before unstructured imports
 import asyncio
 
-from fastapi import APIRouter, Depends, Form, HTTPException, UploadFile
+from fastapi import APIRouter, Depends, Form, HTTPException, Query, UploadFile
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.future import select
 from sqlalchemy.orm import selectinload
```
```diff
@@ -17,6 +17,7 @@ from app.db import (
     get_async_session,
 )
 from app.schemas import (
+    ChunkRead,
     DocumentRead,
     DocumentsCreate,
     DocumentStatusBatchResponse,
```
```diff
@@ -45,9 +46,7 @@ os.environ["UNSTRUCTURED_HAS_PATCHED_LOOP"] = "1"
 
 router = APIRouter()
 
-MAX_FILES_PER_UPLOAD = 10
-MAX_FILE_SIZE_BYTES = 50 * 1024 * 1024  # 50 MB per file
-MAX_TOTAL_SIZE_BYTES = 200 * 1024 * 1024  # 200 MB total
+MAX_FILE_SIZE_BYTES = 500 * 1024 * 1024  # 500 MB per file
 
 
 @router.post("/documents")
```
```diff
@@ -156,13 +155,6 @@ async def create_documents_file_upload(
     if not files:
         raise HTTPException(status_code=400, detail="No files provided")
 
-    if len(files) > MAX_FILES_PER_UPLOAD:
-        raise HTTPException(
-            status_code=413,
-            detail=f"Too many files. Maximum {MAX_FILES_PER_UPLOAD} files per upload.",
-        )
-
-    total_size = 0
     for file in files:
         file_size = file.size or 0
         if file_size > MAX_FILE_SIZE_BYTES:
```
```diff
@@ -171,14 +163,6 @@ async def create_documents_file_upload(
                 detail=f"File '{file.filename}' ({file_size / (1024 * 1024):.1f} MB) "
                 f"exceeds the {MAX_FILE_SIZE_BYTES // (1024 * 1024)} MB per-file limit.",
             )
-        total_size += file_size
-
-    if total_size > MAX_TOTAL_SIZE_BYTES:
-        raise HTTPException(
-            status_code=413,
-            detail=f"Total upload size ({total_size / (1024 * 1024):.1f} MB) "
-            f"exceeds the {MAX_TOTAL_SIZE_BYTES // (1024 * 1024)} MB limit.",
-        )
 
     # ===== Read all files concurrently to avoid blocking the event loop =====
     async def _read_and_save(file: UploadFile) -> tuple[str, str, int]:
```
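With both removals applied, upload validation reduces to the empty-check and the per-file cap. A minimal sketch of the surviving logic — the `validate_files` wrapper is hypothetical; `MAX_FILE_SIZE_BYTES` and the error wiring come straight from the diff:

```python
from fastapi import HTTPException, UploadFile

MAX_FILE_SIZE_BYTES = 500 * 1024 * 1024  # 500 MB per file

def validate_files(files: list[UploadFile]) -> None:
    # Hypothetical wrapper around the checks that remain after this commit.
    if not files:
        raise HTTPException(status_code=400, detail="No files provided")
    for file in files:
        file_size = file.size or 0  # UploadFile.size may be None
        if file_size > MAX_FILE_SIZE_BYTES:
            raise HTTPException(
                status_code=413,
                detail=f"File '{file.filename}' ({file_size / (1024 * 1024):.1f} MB) "
                f"exceeds the {MAX_FILE_SIZE_BYTES // (1024 * 1024)} MB per-file limit.",
            )
```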
```diff
@@ -206,16 +190,6 @@ async def create_documents_file_upload(
 
     saved_files = await asyncio.gather(*(_read_and_save(f) for f in files))
 
-    actual_total_size = sum(size for _, _, size in saved_files)
-    if actual_total_size > MAX_TOTAL_SIZE_BYTES:
-        for temp_path, _, _ in saved_files:
-            os.unlink(temp_path)
-        raise HTTPException(
-            status_code=413,
-            detail=f"Total upload size ({actual_total_size / (1024 * 1024):.1f} MB) "
-            f"exceeds the {MAX_TOTAL_SIZE_BYTES // (1024 * 1024)} MB limit.",
-        )
-
     # ===== PHASE 1: Create pending documents for all files =====
     created_documents: list[Document] = []
     files_to_process: list[tuple[Document, str, str]] = []
```
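The `asyncio.gather` call fans out one coroutine per file so slow reads do not serialize on the event loop; the deleted block was the only post-read total-size rollback, so after this commit the gathered results go straight to the pending-document phase. The `_read_and_save` body is elided from this diff; a stand-in sketch of the pattern (temp-file handling here is illustrative, not SurfSense's actual I/O):

```python
import asyncio
import tempfile

from fastapi import UploadFile

async def _read_and_save_sketch(file: UploadFile) -> tuple[str, str, int]:
    # Illustrative stand-in: the real _read_and_save body is not shown above.
    data = await file.read()  # UploadFile.read() is a coroutine
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        tmp.write(data)
    return tmp.name, file.filename or "unnamed", len(data)

async def read_all(files: list[UploadFile]) -> list[tuple[str, str, int]]:
    # One task per file; the awaits interleave instead of running sequentially.
    return await asyncio.gather(*(_read_and_save_sketch(f) for f in files))
```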
```diff
@@ -451,13 +425,15 @@ async def read_documents(
                 reason=doc.status.get("reason"),
             )
 
+        raw_content = doc.content or ""
         api_documents.append(
             DocumentRead(
                 id=doc.id,
                 title=doc.title,
                 document_type=doc.document_type,
                 document_metadata=doc.document_metadata,
-                content=doc.content,
+                content="",
+                content_preview=raw_content[:300],
                 content_hash=doc.content_hash,
                 unique_identifier_hash=doc.unique_identifier_hash,
                 created_at=doc.created_at,
```
```diff
@@ -609,13 +585,15 @@ async def search_documents(
                 reason=doc.status.get("reason"),
             )
 
+        raw_content = doc.content or ""
         api_documents.append(
            DocumentRead(
                 id=doc.id,
                 title=doc.title,
                 document_type=doc.document_type,
                 document_metadata=doc.document_metadata,
-                content=doc.content,
+                content="",
+                content_preview=raw_content[:300],
                 content_hash=doc.content_hash,
                 unique_identifier_hash=doc.unique_identifier_hash,
                 created_at=doc.created_at,
```
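Both list endpoints (`read_documents` and `search_documents`) now ship an empty `content` plus a 300-character `content_preview`, which keeps list payloads small; the full text remains available only from the single-document route (final hunk). A hypothetical client flow under that shape — `httpx`, the base URL, and the bare-array response shape are all assumptions, not part of the diff:

```python
import asyncio

import httpx  # assumed client library

async def main() -> None:
    async with httpx.AsyncClient(base_url="http://localhost:8000") as client:
        docs = (await client.get("/documents")).json()  # response shape assumed
        for d in docs:
            print(d["title"], "->", d["content_preview"][:60])  # content is now ""
        # Full text requires the per-document route:
        full = (await client.get(f"/documents/{docs[0]['id']}")).json()
        print("full length:", len(full["content"]))

asyncio.run(main())
```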
```diff
@@ -884,16 +862,19 @@ async def get_document_type_counts(
 @router.get("/documents/by-chunk/{chunk_id}", response_model=DocumentWithChunksRead)
 async def get_document_by_chunk_id(
     chunk_id: int,
+    chunk_window: int = Query(
+        5, ge=0, description="Number of chunks before/after the cited chunk to include"
+    ),
     session: AsyncSession = Depends(get_async_session),
     user: User = Depends(current_active_user),
 ):
     """
-    Retrieves a document based on a chunk ID, including all its chunks ordered by creation time.
+    Retrieves a document based on a chunk ID, including a window of chunks around the cited one.
     Requires DOCUMENTS_READ permission for the search space.
-    The document's embedding and chunk embeddings are excluded from the response.
+    Uses SQL-level pagination to avoid loading all chunks into memory.
     """
     try:
-        # First, get the chunk and verify it exists
+        from sqlalchemy import and_, func, or_
 
         chunk_result = await session.execute(select(Chunk).filter(Chunk.id == chunk_id))
         chunk = chunk_result.scalars().first()
```
```diff
@@ -902,11 +883,8 @@ async def get_document_by_chunk_id(
                 status_code=404, detail=f"Chunk with id {chunk_id} not found"
             )
 
-        # Get the associated document
         document_result = await session.execute(
-            select(Document)
-            .options(selectinload(Document.chunks))
-            .filter(Document.id == chunk.document_id)
+            select(Document).filter(Document.id == chunk.document_id)
         )
         document = document_result.scalars().first()
 
```
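Dropping `selectinload(Document.chunks)` means the document row is fetched alone instead of eagerly pulling every chunk; the windowing hunk below fetches only the rows it needs. That window relies on a deterministic `(created_at, id)` ordering. Expressed as a plain-Python predicate, "strictly before the cited chunk" is just a lexicographic tuple comparison:

```python
# "Strictly before" under the (created_at, id) order used by the new queries:
def is_before(a, b) -> bool:
    return (a.created_at, a.id) < (b.created_at, b.id)

# which expands to exactly the or_/and_ filter in the windowing hunk:
#   a.created_at < b.created_at
#   OR (a.created_at == b.created_at AND a.id < b.id)
```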
```diff
@@ -916,7 +894,6 @@ async def get_document_by_chunk_id(
                 detail="Document not found",
             )
 
-        # Check permission for the search space
         await check_permission(
             session,
             user,
```
```diff
@@ -925,10 +902,38 @@ async def get_document_by_chunk_id(
             "You don't have permission to read documents in this search space",
         )
 
-        # Sort chunks by creation time
-        sorted_chunks = sorted(document.chunks, key=lambda x: x.created_at)
+        total_result = await session.execute(
+            select(func.count())
+            .select_from(Chunk)
+            .filter(Chunk.document_id == document.id)
+        )
+        total_chunks = total_result.scalar() or 0
+
+        cited_idx_result = await session.execute(
+            select(func.count())
+            .select_from(Chunk)
+            .filter(
+                Chunk.document_id == document.id,
+                or_(
+                    Chunk.created_at < chunk.created_at,
+                    and_(Chunk.created_at == chunk.created_at, Chunk.id < chunk.id),
+                ),
+            )
+        )
+        cited_idx = cited_idx_result.scalar() or 0
+
+        start = max(0, cited_idx - chunk_window)
+        end = min(total_chunks, cited_idx + chunk_window + 1)
+
+        windowed_result = await session.execute(
+            select(Chunk)
+            .filter(Chunk.document_id == document.id)
+            .order_by(Chunk.created_at, Chunk.id)
+            .offset(start)
+            .limit(end - start)
+        )
+        windowed_chunks = windowed_result.scalars().all()
 
         # Return the document with its chunks
         return DocumentWithChunksRead(
             id=document.id,
             title=document.title,
```
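Counting the chunks that sort strictly before the cited one yields its zero-based index, so the window becomes plain offset/limit arithmetic. A worked example with the default `chunk_window=5` (values illustrative):

```python
# Illustrative values: cited chunk is index 2 in a 20-chunk document.
chunk_window, cited_idx, total_chunks = 5, 2, 20

start = max(0, cited_idx - chunk_window)               # 0 (clamped at the front)
end = min(total_chunks, cited_idx + chunk_window + 1)  # 8

assert (start, end) == (0, 8)
assert end - start == 8  # fewer than the full 2*5 + 1 = 11 chunks near an edge
```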
```diff
@@ -940,7 +945,9 @@ async def get_document_by_chunk_id(
             created_at=document.created_at,
             updated_at=document.updated_at,
             search_space_id=document.search_space_id,
-            chunks=sorted_chunks,
+            chunks=windowed_chunks,
+            total_chunks=total_chunks,
+            chunk_start_index=start,
         )
     except HTTPException:
         raise
```
```diff
@@ -950,6 +957,75 @@ async def get_document_by_chunk_id(
         ) from e
 
 
+@router.get(
+    "/documents/{document_id}/chunks",
+    response_model=PaginatedResponse[ChunkRead],
+)
+async def get_document_chunks_paginated(
+    document_id: int,
+    page: int = Query(0, ge=0),
+    page_size: int = Query(20, ge=1, le=100),
+    start_offset: int | None = Query(
+        None, ge=0, description="Direct offset; overrides page * page_size"
+    ),
+    session: AsyncSession = Depends(get_async_session),
+    user: User = Depends(current_active_user),
+):
+    """
+    Paginated chunk loading for a document.
+    Supports both page-based and offset-based access.
+    """
+    try:
+        from sqlalchemy import func
+
+        doc_result = await session.execute(
+            select(Document).filter(Document.id == document_id)
+        )
+        document = doc_result.scalars().first()
+
+        if not document:
+            raise HTTPException(status_code=404, detail="Document not found")
+
+        await check_permission(
+            session,
+            user,
+            document.search_space_id,
+            Permission.DOCUMENTS_READ.value,
+            "You don't have permission to read documents in this search space",
+        )
+
+        total_result = await session.execute(
+            select(func.count())
+            .select_from(Chunk)
+            .filter(Chunk.document_id == document_id)
+        )
+        total = total_result.scalar() or 0
+
+        offset = start_offset if start_offset is not None else page * page_size
+        chunks_result = await session.execute(
+            select(Chunk)
+            .filter(Chunk.document_id == document_id)
+            .order_by(Chunk.created_at, Chunk.id)
+            .offset(offset)
+            .limit(page_size)
+        )
+        chunks = chunks_result.scalars().all()
+
+        return PaginatedResponse(
+            items=chunks,
+            total=total,
+            page=offset // page_size if page_size else page,
+            page_size=page_size,
+            has_more=(offset + len(chunks)) < total,
+        )
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(
+            status_code=500, detail=f"Failed to fetch chunks: {e!s}"
+        ) from e
+
+
 @router.get("/documents/{document_id}", response_model=DocumentRead)
 async def read_document(
     document_id: int,
```
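The new endpoint accepts either `page`/`page_size` or a direct `start_offset`, with the offset taking precedence; `has_more` is simple arithmetic over the returned slice. A quick check of that logic in isolation (values illustrative):

```python
def page_math(total: int, page: int, page_size: int, start_offset: int | None):
    # Mirrors the offset / has_more arithmetic from the endpoint above.
    offset = start_offset if start_offset is not None else page * page_size
    fetched = max(0, min(page_size, total - offset))  # rows a LIMIT query returns
    return offset, (offset + fetched) < total

assert page_math(total=45, page=2, page_size=20, start_offset=None) == (40, False)
assert page_math(total=45, page=0, page_size=20, start_offset=7) == (7, True)
```

Presumably `start_offset` is meant to pair with the `chunk_start_index` returned by the by-chunk route, letting a client page outward from a citation.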
```diff
@@ -980,13 +1056,14 @@ async def read_document(
             "You don't have permission to read documents in this search space",
         )
 
-        # Convert database object to API-friendly format
+        raw_content = document.content or ""
         return DocumentRead(
             id=document.id,
             title=document.title,
             document_type=document.document_type,
             document_metadata=document.document_metadata,
-            content=document.content,
+            content=raw_content,
+            content_preview=raw_content[:300],
             content_hash=document.content_hash,
             unique_identifier_hash=document.unique_identifier_hash,
             created_at=document.created_at,
```