diff --git a/surfsense_backend/alembic/versions/116_create_zero_publication.py b/surfsense_backend/alembic/versions/116_create_zero_publication.py
index 8f0d7b5d3..ff74952a9 100644
--- a/surfsense_backend/alembic/versions/116_create_zero_publication.py
+++ b/surfsense_backend/alembic/versions/116_create_zero_publication.py
@@ -42,9 +42,7 @@ def upgrade() -> None:
     if not exists:
         table_list = ", ".join(TABLES)
         conn.execute(
-            sa.text(
-                f"CREATE PUBLICATION {PUBLICATION_NAME} FOR TABLE {table_list}"
-            )
+            sa.text(f"CREATE PUBLICATION {PUBLICATION_NAME} FOR TABLE {table_list}")
         )
 
 
diff --git a/surfsense_backend/alembic/versions/117_optimize_zero_publication_column_lists.py b/surfsense_backend/alembic/versions/117_optimize_zero_publication_column_lists.py
new file mode 100644
index 000000000..3c2d34c76
--- /dev/null
+++ b/surfsense_backend/alembic/versions/117_optimize_zero_publication_column_lists.py
@@ -0,0 +1,102 @@
+"""optimize zero_publication with column lists
+
+Recreates the zero_publication using column lists for the documents
+table so that large text columns (content, source_markdown,
+blocknote_document, etc.) are excluded from WAL replication.
+This prevents RangeError: Invalid string length in zero-cache's
+change-streamer when documents have very large content.
+
+Also resets REPLICA IDENTITY to DEFAULT on tables that had it set
+to FULL for the old Electric SQL setup (migration 66/75/76).
+With DEFAULT (primary-key) identity, column-list publications
+only need to include the PK — not every column.
+
+After running this migration you MUST:
+  1. Stop zero-cache
+  2. Delete / reset the zero-cache data volume
+  3. Restart zero-cache  (it will do a fresh initial sync)
+
+Revision ID: 117
+Revises: 116
+"""
+
+from collections.abc import Sequence
+
+import sqlalchemy as sa
+
+from alembic import op
+
+revision: str = "117"
+down_revision: str | None = "116"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+PUBLICATION_NAME = "zero_publication"
+# Tables that upgrade() sets to REPLICA IDENTITY DEFAULT (downgrade(): FULL).
+TABLES_WITH_FULL_IDENTITY = [
+    "documents",
+    "notifications",
+    "search_source_connectors",
+    "new_chat_messages",
+    "chat_comments",
+    "chat_session_state",
+]
+# Columns of "documents" that upgrade() puts in the publication column list.
+DOCUMENT_COLS = [
+    "id",
+    "title",
+    "document_type",
+    "search_space_id",
+    "folder_id",
+    "created_by_id",
+    "status",
+    "created_at",
+    "updated_at",
+]
+# All-column publication DDL that downgrade() executes to undo the column list.
+PUBLICATION_DDL_FULL = f"""\
+CREATE PUBLICATION {PUBLICATION_NAME} FOR TABLE
+  notifications, documents, folders,
+  search_source_connectors, new_chat_messages,
+  chat_comments, chat_session_state
+"""
+
+
+def upgrade() -> None:
+    conn = op.get_bind()
+    # Reset each listed table to REPLICA IDENTITY DEFAULT before republishing.
+    for tbl in TABLES_WITH_FULL_IDENTITY:
+        conn.execute(sa.text(f'ALTER TABLE "{tbl}" REPLICA IDENTITY DEFAULT'))
+    # Drop any existing publication; it is recreated below with a column list.
+    conn.execute(sa.text(f"DROP PUBLICATION IF EXISTS {PUBLICATION_NAME}"))
+    # Look for a "_0_version" column on a table named documents (any schema).
+    has_zero_ver = conn.execute(
+        sa.text(
+            "SELECT 1 FROM information_schema.columns "
+            "WHERE table_name = 'documents' AND column_name = '_0_version'"
+        )
+    ).fetchone()
+    # Publish "_0_version" as well, but only when that column exists.
+    cols = DOCUMENT_COLS + (['"_0_version"'] if has_zero_ver else [])
+    col_list = ", ".join(cols)
+    # Only documents gets a column list; the other tables publish every column.
+    conn.execute(
+        sa.text(
+            f"CREATE PUBLICATION {PUBLICATION_NAME} FOR TABLE "
+            f"notifications, "
+            f"documents ({col_list}), "
+            f"folders, "
+            f"search_source_connectors, "
+            f"new_chat_messages, "
+            f"chat_comments, "
+            f"chat_session_state"
+        )
+    )
+
+
+def downgrade() -> None:
+    bind = op.get_bind()
+    for ddl in (f"DROP PUBLICATION IF EXISTS {PUBLICATION_NAME}", PUBLICATION_DDL_FULL):
+        bind.execute(sa.text(ddl))
+    for table_name in TABLES_WITH_FULL_IDENTITY:
+        bind.execute(sa.text(f'ALTER TABLE "{table_name}" REPLICA IDENTITY FULL'))
diff --git a/surfsense_backend/app/agents/new_chat/chat_deepagent.py b/surfsense_backend/app/agents/new_chat/chat_deepagent.py
index ccc06f272..fc1e80d28 100644
--- a/surfsense_backend/app/agents/new_chat/chat_deepagent.py
+++ b/surfsense_backend/app/agents/new_chat/chat_deepagent.py
@@ -159,6 +159,7 @@ async def create_surfsense_deep_agent(
     additional_tools: Sequence[BaseTool] | None = None,
     firecrawl_api_key: str | None = None,
     thread_visibility: ChatVisibility | None = None,
+    mentioned_document_ids: list[int] | None = None,
 ):
     """
     Create a SurfSense deep agent with configurable tools and prompts.
@@ -451,6 +452,7 @@ async def create_surfsense_deep_agent(
             search_space_id=search_space_id,
             available_connectors=available_connectors,
             available_document_types=available_document_types,
+            mentioned_document_ids=mentioned_document_ids,
         ),
         SurfSenseFilesystemMiddleware(
             search_space_id=search_space_id,
diff --git a/surfsense_backend/app/agents/new_chat/middleware/filesystem.py b/surfsense_backend/app/agents/new_chat/middleware/filesystem.py
index 41b24f88b..d7697ef15 100644
--- a/surfsense_backend/app/agents/new_chat/middleware/filesystem.py
+++ b/surfsense_backend/app/agents/new_chat/middleware/filesystem.py
@@ -66,6 +66,16 @@ the `<chunk_index>`, identify chunks marked `matched="true"`, then use
 those sections instead of reading the entire file sequentially.
 
 Use `<chunk id='...'>` values as citation IDs in your answers.
+
+## User-Mentioned Documents
+
+When the `ls` output starts with a `USER-MENTIONED documents` header, the
+user **explicitly selected** the files listed under it. These files are your
+highest-priority sources:
+1. **Always read them thoroughly** — scan the full `<chunk_index>`, then read
+   all major sections, not just matched chunks.
+2. **Prefer their content** over other search results when answering.
+3. **Cite from them first** whenever applicable.
 """
 
 # =============================================================================
diff --git a/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py b/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py
index 3728f229c..7b0dd2f71 100644
--- a/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py
+++ b/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py
@@ -28,7 +28,13 @@ from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
 
 from app.agents.new_chat.utils import parse_date_or_datetime, resolve_date_range
-from app.db import NATIVE_TO_LEGACY_DOCTYPE, Document, Folder, shielded_async_session
+from app.db import (
+    NATIVE_TO_LEGACY_DOCTYPE,
+    Chunk,
+    Document,
+    Folder,
+    shielded_async_session,
+)
 from app.retriever.chunks_hybrid_search import ChucksHybridSearchRetriever
 from app.utils.document_converters import embed_texts
 from app.utils.perf import get_perf_logger
@@ -430,21 +436,36 @@ async def _get_folder_paths(
 def _build_synthetic_ls(
     existing_files: dict[str, Any] | None,
     new_files: dict[str, Any],
+    *,
+    mentioned_paths: set[str] | None = None,
 ) -> tuple[AIMessage, ToolMessage]:
     """Build a synthetic ls("/documents") tool-call + result for the LLM context.
 
-    Paths are listed with *new* (rank-ordered) files first, then existing files
-    that were already in state from prior turns.
+    Mentioned files are listed first.  A separate header tells the LLM which
+    files the user explicitly selected; the path list itself stays clean so
+    paths can be passed directly to ``read_file`` without stripping tags.
     """
+    _mentioned = mentioned_paths or set()
     merged: dict[str, Any] = {**(existing_files or {}), **new_files}
     doc_paths = [
         p for p, v in merged.items() if p.startswith("/documents/") and v is not None
     ]
 
     new_set = set(new_files)
-    new_paths = [p for p in doc_paths if p in new_set]
+    mentioned_list = [p for p in doc_paths if p in _mentioned]
+    new_non_mentioned = [p for p in doc_paths if p in new_set and p not in _mentioned]
     old_paths = [p for p in doc_paths if p not in new_set]
-    ordered = new_paths + old_paths
+    ordered = mentioned_list + new_non_mentioned + old_paths
+
+    parts: list[str] = []
+    if mentioned_list:
+        parts.append(
+            "USER-MENTIONED documents (read these thoroughly before answering):"
+        )
+        for p in mentioned_list:
+            parts.append(f"  {p}")
+        parts.append("")
+    parts.append(str(ordered) if ordered else "No documents found.")
 
     tool_call_id = f"auto_ls_{uuid.uuid4().hex[:12]}"
     ai_msg = AIMessage(
@@ -452,7 +473,7 @@ def _build_synthetic_ls(
         tool_calls=[{"name": "ls", "args": {"path": "/documents"}, "id": tool_call_id}],
     )
     tool_msg = ToolMessage(
-        content=str(ordered) if ordered else "No documents found.",
+        content="\n".join(parts),
         tool_call_id=tool_call_id,
     )
     return ai_msg, tool_msg
@@ -524,12 +545,92 @@ async def search_knowledge_base(
     return results[:top_k]
 
 
+async def fetch_mentioned_documents(
+    *,
+    document_ids: list[int],
+    search_space_id: int,
+) -> list[dict[str, Any]]:
+    """Fetch explicitly mentioned documents with *all* their chunks.
+
+    Returns the same dict structure as ``search_knowledge_base`` so results
+    can be merged directly into ``build_scoped_filesystem``.  Unlike search
+    results, every chunk is included (no top-K limiting) and none are marked
+    as ``matched`` since the entire document is relevant by virtue of the
+    user's explicit mention.
+
+    Args:
+        document_ids: Ids the user mentioned; repeated ids are collapsed.
+        search_space_id: Only documents in this search space are returned.
+
+    Returns:
+        One dict per document found, in first-mention order.
+    """
+    if not document_ids:
+        return []
+
+    async with shielded_async_session() as session:
+        doc_result = await session.execute(
+            select(Document).where(
+                Document.id.in_(document_ids),
+                Document.search_space_id == search_space_id,
+            )
+        )
+        docs = {doc.id: doc for doc in doc_result.scalars().all()}
+        if not docs:
+            return []
+
+        chunk_result = await session.execute(
+            select(Chunk.id, Chunk.content, Chunk.document_id)
+            .where(Chunk.document_id.in_(list(docs.keys())))
+            .order_by(Chunk.document_id, Chunk.id)
+        )
+        chunks_by_doc: dict[int, list[dict[str, Any]]] = {doc_id: [] for doc_id in docs}
+        for row in chunk_result.all():
+            if row.document_id in chunks_by_doc:
+                chunks_by_doc[row.document_id].append(
+                    {"chunk_id": row.id, "content": row.content}
+                )
+
+    results: list[dict[str, Any]] = []
+    # Iterate unique ids in first-mention order so no document is returned twice.
+    for doc_id in dict.fromkeys(document_ids):
+        doc = docs.get(doc_id)
+        if doc is None:
+            continue
+        doc_type = getattr(doc, "document_type", None)
+        doc_type_value = doc_type.value if doc_type else None
+        results.append(
+            {
+                "document_id": doc.id,
+                "content": "",
+                "score": 1.0,
+                "chunks": chunks_by_doc.get(doc.id, []),
+                "matched_chunk_ids": [],
+                "document": {
+                    "id": doc.id,
+                    "title": doc.title,
+                    "document_type": doc_type_value,
+                    "metadata": doc.document_metadata or {},
+                },
+                "source": doc_type_value,
+                "_user_mentioned": True,
+            }
+        )
+    return results
+
+
 async def build_scoped_filesystem(
     *,
     documents: Sequence[dict[str, Any]],
     search_space_id: int,
-) -> dict[str, dict[str, str]]:
-    """Build a StateBackend-compatible files dict from search results."""
+) -> tuple[dict[str, dict[str, str]], dict[int, str]]:
+    """Build a StateBackend-compatible files dict from search results.
+
+    Returns ``(files, doc_id_to_path)`` so callers can reliably map a
+    document id back to its filesystem path without guessing by title.
+    Paths are collision-proof: when two documents resolve to the same
+    path the doc-id is appended to disambiguate.
+    """
     async with shielded_async_session() as session:
         folder_paths = await _get_folder_paths(session, search_space_id)
         doc_ids = [
@@ -551,6 +652,7 @@ async def build_scoped_filesystem(
             }
 
     files: dict[str, dict[str, str]] = {}
+    doc_id_to_path: dict[int, str] = {}
     for document in documents:
         doc_meta = document.get("document") or {}
         title = str(doc_meta.get("title") or "untitled")
@@ -559,6 +661,9 @@ async def build_scoped_filesystem(
         base_folder = folder_paths.get(folder_id, "/documents")
         file_name = _safe_filename(title)
         path = f"{base_folder}/{file_name}"
+        if path in files:
+            stem = file_name.removesuffix(".xml")
+            path = f"{base_folder}/{stem} ({doc_id}).xml"
         matched_ids = set(document.get("matched_chunk_ids") or [])
         xml_content = _build_document_xml(document, matched_chunk_ids=matched_ids)
         files[path] = {
@@ -567,7 +672,9 @@ async def build_scoped_filesystem(
             "created_at": "",
             "modified_at": "",
         }
-    return files
+        if isinstance(doc_id, int):
+            doc_id_to_path[doc_id] = path
+    return files, doc_id_to_path
 
 
 class KnowledgeBaseSearchMiddleware(AgentMiddleware):  # type: ignore[type-arg]
@@ -583,12 +690,14 @@ class KnowledgeBaseSearchMiddleware(AgentMiddleware):  # type: ignore[type-arg]
         available_connectors: list[str] | None = None,
         available_document_types: list[str] | None = None,
         top_k: int = 10,
+        mentioned_document_ids: list[int] | None = None,
     ) -> None:
         self.llm = llm
         self.search_space_id = search_space_id
         self.available_connectors = available_connectors
         self.available_document_types = available_document_types
         self.top_k = top_k
+        self.mentioned_document_ids = mentioned_document_ids or []
 
     async def _plan_search_inputs(
         self,
@@ -680,6 +789,18 @@ class KnowledgeBaseSearchMiddleware(AgentMiddleware):  # type: ignore[type-arg]
             user_text=user_text,
         )
 
+        # --- 1. Fetch mentioned documents (user-selected, all chunks) ---
+        mentioned_results: list[dict[str, Any]] = []
+        if self.mentioned_document_ids:
+            mentioned_results = await fetch_mentioned_documents(
+                document_ids=self.mentioned_document_ids,
+                search_space_id=self.search_space_id,
+            )
+            # Clear after first turn so they are not re-fetched on subsequent
+            # messages within the same agent instance.
+            self.mentioned_document_ids = []
+
+        # --- 2. Run KB hybrid search ---
         search_results = await search_knowledge_base(
             query=planned_query,
             search_space_id=self.search_space_id,
@@ -689,19 +810,50 @@ class KnowledgeBaseSearchMiddleware(AgentMiddleware):  # type: ignore[type-arg]
             start_date=start_date,
             end_date=end_date,
         )
-        new_files = await build_scoped_filesystem(
-            documents=search_results,
+
+        # --- 3. Merge: mentioned first, then search (dedup by doc id) ---
+        seen_doc_ids: set[int] = set()
+        merged: list[dict[str, Any]] = []
+        for doc in mentioned_results:
+            doc_id = (doc.get("document") or {}).get("id")
+            if doc_id is not None:
+                seen_doc_ids.add(doc_id)
+            merged.append(doc)
+        for doc in search_results:
+            doc_id = (doc.get("document") or {}).get("id")
+            if doc_id is not None and doc_id in seen_doc_ids:
+                continue
+            merged.append(doc)
+
+        # --- 4. Build scoped filesystem ---
+        new_files, doc_id_to_path = await build_scoped_filesystem(
+            documents=merged,
             search_space_id=self.search_space_id,
         )
 
-        ai_msg, tool_msg = _build_synthetic_ls(existing_files, new_files)
+        # Identify which paths belong to user-mentioned documents using
+        # the authoritative doc_id -> path mapping (no title guessing).
+        mentioned_doc_ids = {
+            (d.get("document") or {}).get("id") for d in mentioned_results
+        }
+        mentioned_paths = {
+            doc_id_to_path[did] for did in mentioned_doc_ids if did in doc_id_to_path
+        }
+
+        ai_msg, tool_msg = _build_synthetic_ls(
+            existing_files,
+            new_files,
+            mentioned_paths=mentioned_paths,
+        )
 
         if t0 is not None:
             _perf_log.info(
-                "[kb_fs_middleware] completed in %.3fs query=%r optimized=%r new_files=%d total=%d",
+                "[kb_fs_middleware] completed in %.3fs query=%r optimized=%r "
+                "mentioned=%d new_files=%d total=%d",
                 asyncio.get_event_loop().time() - t0,
                 user_text[:80],
                 planned_query[:120],
+                len(mentioned_results),
                 len(new_files),
                 len(new_files) + len(existing_files or {}),
             )
diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py
index 6e69218f1..f53c81bb6 100644
--- a/surfsense_backend/app/routes/documents_routes.py
+++ b/surfsense_backend/app/routes/documents_routes.py
@@ -1,7 +1,7 @@
 # Force asyncio to use standard event loop before unstructured imports
 import asyncio
 
-from fastapi import APIRouter, Depends, Form, HTTPException, UploadFile
+from fastapi import APIRouter, Depends, Form, HTTPException, Query, UploadFile
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.future import select
 from sqlalchemy.orm import selectinload
@@ -17,6 +17,7 @@ from app.db import (
     get_async_session,
 )
 from app.schemas import (
+    ChunkRead,
     DocumentRead,
     DocumentsCreate,
     DocumentStatusBatchResponse,
@@ -45,9 +46,7 @@ os.environ["UNSTRUCTURED_HAS_PATCHED_LOOP"] = "1"
 
 router = APIRouter()
 
-MAX_FILES_PER_UPLOAD = 10
-MAX_FILE_SIZE_BYTES = 50 * 1024 * 1024  # 50 MB per file
-MAX_TOTAL_SIZE_BYTES = 200 * 1024 * 1024  # 200 MB total
+MAX_FILE_SIZE_BYTES = 500 * 1024 * 1024  # 500 MB per file
 
 
 @router.post("/documents")
@@ -156,13 +155,6 @@ async def create_documents_file_upload(
         if not files:
             raise HTTPException(status_code=400, detail="No files provided")
 
-        if len(files) > MAX_FILES_PER_UPLOAD:
-            raise HTTPException(
-                status_code=413,
-                detail=f"Too many files. Maximum {MAX_FILES_PER_UPLOAD} files per upload.",
-            )
-
-        total_size = 0
         for file in files:
             file_size = file.size or 0
             if file_size > MAX_FILE_SIZE_BYTES:
@@ -171,14 +163,6 @@ async def create_documents_file_upload(
                     detail=f"File '{file.filename}' ({file_size / (1024 * 1024):.1f} MB) "
                     f"exceeds the {MAX_FILE_SIZE_BYTES // (1024 * 1024)} MB per-file limit.",
                 )
-            total_size += file_size
-
-        if total_size > MAX_TOTAL_SIZE_BYTES:
-            raise HTTPException(
-                status_code=413,
-                detail=f"Total upload size ({total_size / (1024 * 1024):.1f} MB) "
-                f"exceeds the {MAX_TOTAL_SIZE_BYTES // (1024 * 1024)} MB limit.",
-            )
 
         # ===== Read all files concurrently to avoid blocking the event loop =====
         async def _read_and_save(file: UploadFile) -> tuple[str, str, int]:
@@ -206,16 +190,6 @@ async def create_documents_file_upload(
 
         saved_files = await asyncio.gather(*(_read_and_save(f) for f in files))
 
-        actual_total_size = sum(size for _, _, size in saved_files)
-        if actual_total_size > MAX_TOTAL_SIZE_BYTES:
-            for temp_path, _, _ in saved_files:
-                os.unlink(temp_path)
-            raise HTTPException(
-                status_code=413,
-                detail=f"Total upload size ({actual_total_size / (1024 * 1024):.1f} MB) "
-                f"exceeds the {MAX_TOTAL_SIZE_BYTES // (1024 * 1024)} MB limit.",
-            )
-
         # ===== PHASE 1: Create pending documents for all files =====
         created_documents: list[Document] = []
         files_to_process: list[tuple[Document, str, str]] = []
@@ -451,13 +425,15 @@ async def read_documents(
                     reason=doc.status.get("reason"),
                 )
 
+            raw_content = doc.content or ""
             api_documents.append(
                 DocumentRead(
                     id=doc.id,
                     title=doc.title,
                     document_type=doc.document_type,
                     document_metadata=doc.document_metadata,
-                    content=doc.content,
+                    content="",
+                    content_preview=raw_content[:300],
                     content_hash=doc.content_hash,
                     unique_identifier_hash=doc.unique_identifier_hash,
                     created_at=doc.created_at,
@@ -609,13 +585,15 @@ async def search_documents(
                     reason=doc.status.get("reason"),
                 )
 
+            raw_content = doc.content or ""
             api_documents.append(
                 DocumentRead(
                     id=doc.id,
                     title=doc.title,
                     document_type=doc.document_type,
                     document_metadata=doc.document_metadata,
-                    content=doc.content,
+                    content="",
+                    content_preview=raw_content[:300],
                     content_hash=doc.content_hash,
                     unique_identifier_hash=doc.unique_identifier_hash,
                     created_at=doc.created_at,
@@ -884,16 +862,19 @@ async def get_document_type_counts(
 @router.get("/documents/by-chunk/{chunk_id}", response_model=DocumentWithChunksRead)
 async def get_document_by_chunk_id(
     chunk_id: int,
+    chunk_window: int = Query(
+        5, ge=0, description="Number of chunks before/after the cited chunk to include"
+    ),
     session: AsyncSession = Depends(get_async_session),
     user: User = Depends(current_active_user),
 ):
     """
-    Retrieves a document based on a chunk ID, including all its chunks ordered by creation time.
-    Requires DOCUMENTS_READ permission for the search space.
-    The document's embedding and chunk embeddings are excluded from the response.
+    Retrieves a document based on a chunk ID, including a window of chunks around the cited one.
+    Uses SQL-level pagination to avoid loading all chunks into memory.
     """
     try:
-        # First, get the chunk and verify it exists
+        from sqlalchemy import and_, func, or_
+
         chunk_result = await session.execute(select(Chunk).filter(Chunk.id == chunk_id))
         chunk = chunk_result.scalars().first()
 
@@ -902,11 +883,8 @@ async def get_document_by_chunk_id(
                 status_code=404, detail=f"Chunk with id {chunk_id} not found"
             )
 
-        # Get the associated document
         document_result = await session.execute(
-            select(Document)
-            .options(selectinload(Document.chunks))
-            .filter(Document.id == chunk.document_id)
+            select(Document).filter(Document.id == chunk.document_id)
         )
         document = document_result.scalars().first()
 
@@ -916,7 +894,6 @@ async def get_document_by_chunk_id(
                 detail="Document not found",
             )
 
-        # Check permission for the search space
         await check_permission(
             session,
             user,
@@ -925,10 +902,38 @@ async def get_document_by_chunk_id(
             "You don't have permission to read documents in this search space",
         )
 
-        # Sort chunks by creation time
-        sorted_chunks = sorted(document.chunks, key=lambda x: x.created_at)
+        total_result = await session.execute(
+            select(func.count())
+            .select_from(Chunk)
+            .filter(Chunk.document_id == document.id)
+        )
+        total_chunks = total_result.scalar() or 0
+
+        cited_idx_result = await session.execute(
+            select(func.count())
+            .select_from(Chunk)
+            .filter(
+                Chunk.document_id == document.id,
+                or_(
+                    Chunk.created_at < chunk.created_at,
+                    and_(Chunk.created_at == chunk.created_at, Chunk.id < chunk.id),
+                ),
+            )
+        )
+        cited_idx = cited_idx_result.scalar() or 0
+
+        start = max(0, cited_idx - chunk_window)
+        end = min(total_chunks, cited_idx + chunk_window + 1)
+
+        windowed_result = await session.execute(
+            select(Chunk)
+            .filter(Chunk.document_id == document.id)
+            .order_by(Chunk.created_at, Chunk.id)
+            .offset(start)
+            .limit(end - start)
+        )
+        windowed_chunks = windowed_result.scalars().all()
 
-        # Return the document with its chunks
         return DocumentWithChunksRead(
             id=document.id,
             title=document.title,
@@ -940,7 +945,9 @@ async def get_document_by_chunk_id(
             created_at=document.created_at,
             updated_at=document.updated_at,
             search_space_id=document.search_space_id,
-            chunks=sorted_chunks,
+            chunks=windowed_chunks,
+            total_chunks=total_chunks,
+            chunk_start_index=start,
         )
     except HTTPException:
         raise
@@ -950,6 +957,75 @@ async def get_document_by_chunk_id(
         ) from e
 
 
+@router.get(
+    "/documents/{document_id}/chunks",
+    response_model=PaginatedResponse[ChunkRead],
+)
+async def get_document_chunks_paginated(
+    document_id: int,
+    page: int = Query(0, ge=0),
+    page_size: int = Query(20, ge=1, le=100),
+    start_offset: int | None = Query(
+        None, ge=0, description="Direct offset; overrides page * page_size"
+    ),
+    session: AsyncSession = Depends(get_async_session),
+    user: User = Depends(current_active_user),
+):
+    """
+    Return one page of a document's chunks ordered by (created_at, id).
+    ``start_offset`` overrides ``page * page_size``; a missing document is a 404.
+    """
+    try:
+        from sqlalchemy import func
+
+        doc_result = await session.execute(
+            select(Document).filter(Document.id == document_id)
+        )
+        document = doc_result.scalars().first()
+
+        if not document:
+            raise HTTPException(status_code=404, detail="Document not found")
+
+        await check_permission(
+            session,
+            user,
+            document.search_space_id,
+            Permission.DOCUMENTS_READ.value,
+            "You don't have permission to read documents in this search space",
+        )
+
+        total_result = await session.execute(
+            select(func.count())
+            .select_from(Chunk)
+            .filter(Chunk.document_id == document_id)
+        )
+        total = total_result.scalar() or 0
+        # An explicit start_offset wins over page-based addressing.
+        offset = start_offset if start_offset is not None else page * page_size
+        chunks_result = await session.execute(
+            select(Chunk)
+            .filter(Chunk.document_id == document_id)
+            .order_by(Chunk.created_at, Chunk.id)
+            .offset(offset)
+            .limit(page_size)
+        )
+        chunks = chunks_result.scalars().all()
+        # page_size is validated >= 1, so the division needs no zero guard.
+        return PaginatedResponse(
+            items=chunks,
+            total=total,
+            page=offset // page_size,
+            page_size=page_size,
+            has_more=(offset + len(chunks)) < total,
+        )
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(
+            status_code=500, detail=f"Failed to fetch chunks: {e!s}"
+        ) from e
+
+
 @router.get("/documents/{document_id}", response_model=DocumentRead)
 async def read_document(
     document_id: int,
@@ -980,13 +1056,14 @@ async def read_document(
             "You don't have permission to read documents in this search space",
         )
 
-        # Convert database object to API-friendly format
+        raw_content = document.content or ""
         return DocumentRead(
             id=document.id,
             title=document.title,
             document_type=document.document_type,
             document_metadata=document.document_metadata,
-            content=document.content,
+            content=raw_content,
+            content_preview=raw_content[:300],
             content_hash=document.content_hash,
             unique_identifier_hash=document.unique_identifier_hash,
             created_at=document.created_at,
diff --git a/surfsense_backend/app/routes/editor_routes.py b/surfsense_backend/app/routes/editor_routes.py
index f54f18def..09a35c619 100644
--- a/surfsense_backend/app/routes/editor_routes.py
+++ b/surfsense_backend/app/routes/editor_routes.py
@@ -15,11 +15,10 @@ import pypandoc
 import typst
 from fastapi import APIRouter, Depends, HTTPException, Query
 from fastapi.responses import StreamingResponse
-from sqlalchemy import select
+from sqlalchemy import func, select
 from sqlalchemy.ext.asyncio import AsyncSession
-from sqlalchemy.orm import selectinload
 
-from app.db import Document, DocumentType, Permission, User, get_async_session
+from app.db import Chunk, Document, DocumentType, Permission, User, get_async_session
 from app.routes.reports_routes import (
     _FILE_EXTENSIONS,
     _MEDIA_TYPES,
@@ -44,6 +43,9 @@ router = APIRouter()
 async def get_editor_content(
     search_space_id: int,
     document_id: int,
+    max_length: int | None = Query(
+        None, description="Truncate source_markdown to this many characters"
+    ),
     session: AsyncSession = Depends(get_async_session),
     user: User = Depends(current_active_user),
 ):
@@ -65,9 +67,7 @@ async def get_editor_content(
     )
 
     result = await session.execute(
-        select(Document)
-        .options(selectinload(Document.chunks))
-        .filter(
+        select(Document).filter(
             Document.id == document_id,
             Document.search_space_id == search_space_id,
         )
@@ -77,62 +77,63 @@ async def get_editor_content(
     if not document:
         raise HTTPException(status_code=404, detail="Document not found")
 
-    # Priority 1: Return source_markdown if it exists (check `is not None` to allow empty strings)
-    if document.source_markdown is not None:
+    count_result = await session.execute(
+        select(func.count()).select_from(Chunk).filter(Chunk.document_id == document_id)
+    )
+    chunk_count = count_result.scalar() or 0
+
+    def _build_response(md: str) -> dict:
+        """Build the editor payload, truncating ``md`` to ``max_length`` chars."""
+        size_bytes = len(md.encode("utf-8"))  # size of the full, untruncated text
+        # max_length is a character limit, so compare len(md), not the byte size;
+        # otherwise multi-byte text is flagged truncated without being shortened.
+        truncated = max_length is not None and len(md) > max_length
+        output_md = md[:max_length] if truncated else md
         return {
             "document_id": document.id,
             "title": document.title,
             "document_type": document.document_type.value,
-            "source_markdown": document.source_markdown,
+            "source_markdown": output_md,
+            "content_size_bytes": size_bytes,
+            "chunk_count": chunk_count,
+            "truncated": truncated,
             "updated_at": document.updated_at.isoformat()
             if document.updated_at
             else None,
         }
 
-    # Priority 2: Lazy-migrate from blocknote_document (pure Python, no external deps)
+    if document.source_markdown is not None:
+        return _build_response(document.source_markdown)
+
     if document.blocknote_document:
         from app.utils.blocknote_to_markdown import blocknote_to_markdown
 
         markdown = blocknote_to_markdown(document.blocknote_document)
         if markdown:
-            # Persist the migration so we don't repeat it
             document.source_markdown = markdown
             await session.commit()
-            return {
-                "document_id": document.id,
-                "title": document.title,
-                "document_type": document.document_type.value,
-                "source_markdown": markdown,
-                "updated_at": document.updated_at.isoformat()
-                if document.updated_at
-                else None,
-            }
+            return _build_response(markdown)
 
-    # Priority 3: For NOTE type with no content, return empty markdown
     if document.document_type == DocumentType.NOTE:
         empty_markdown = ""
         document.source_markdown = empty_markdown
         await session.commit()
-        return {
-            "document_id": document.id,
-            "title": document.title,
-            "document_type": document.document_type.value,
-            "source_markdown": empty_markdown,
-            "updated_at": document.updated_at.isoformat()
-            if document.updated_at
-            else None,
-        }
+        return _build_response(empty_markdown)
 
-    # Priority 4: Reconstruct from chunks
-    chunks = sorted(document.chunks, key=lambda c: c.id)
+    chunk_contents_result = await session.execute(
+        select(Chunk.content)
+        .filter(Chunk.document_id == document_id)
+        .order_by(Chunk.id)
+    )
+    chunk_contents = chunk_contents_result.scalars().all()
 
-    if not chunks:
+    if not chunk_contents:
         raise HTTPException(
             status_code=400,
             detail="This document has no content and cannot be edited. Please re-upload to enable editing.",
         )
 
-    markdown_content = "\n\n".join(chunk.content for chunk in chunks)
+    markdown_content = "\n\n".join(chunk_contents)
 
     if not markdown_content.strip():
         raise HTTPException(
@@ -140,17 +141,77 @@ async def get_editor_content(
             detail="This document has empty content and cannot be edited.",
         )
 
-    # Persist the lazy migration
     document.source_markdown = markdown_content
     await session.commit()
 
-    return {
-        "document_id": document.id,
-        "title": document.title,
-        "document_type": document.document_type.value,
-        "source_markdown": markdown_content,
-        "updated_at": document.updated_at.isoformat() if document.updated_at else None,
-    }
+    return _build_response(markdown_content)
+
+
+@router.get(
+    "/search-spaces/{search_space_id}/documents/{document_id}/download-markdown"
+)
+async def download_document_markdown(
+    search_space_id: int,
+    document_id: int,
+    session: AsyncSession = Depends(get_async_session),
+    user: User = Depends(current_active_user),
+):
+    """Download the document's full markdown as a ``.md`` attachment.
+
+    Uses source_markdown, else the BlockNote document, else the joined chunks.
+    """
+    await check_permission(
+        session,
+        user,
+        search_space_id,
+        Permission.DOCUMENTS_READ.value,
+        "You don't have permission to read documents in this search space",
+    )
+
+    result = await session.execute(
+        select(Document).filter(
+            Document.id == document_id,
+            Document.search_space_id == search_space_id,
+        )
+    )
+    document = result.scalars().first()
+
+    if not document:
+        raise HTTPException(status_code=404, detail="Document not found")
+
+    markdown: str | None = document.source_markdown
+    if markdown is None and document.blocknote_document:
+        from app.utils.blocknote_to_markdown import blocknote_to_markdown
+
+        markdown = blocknote_to_markdown(document.blocknote_document)
+    if markdown is None:
+        chunk_contents_result = await session.execute(
+            select(Chunk.content)
+            .filter(Chunk.document_id == document_id)
+            .order_by(Chunk.id)
+        )
+        chunk_contents = chunk_contents_result.scalars().all()
+        if chunk_contents:
+            markdown = "\n\n".join(chunk_contents)
+
+    if not markdown or not markdown.strip():
+        raise HTTPException(
+            status_code=400, detail="Document has no content to download"
+        )
+
+    # ASCII-only filename: response headers are latin-1 encoded, so a title
+    # character outside latin-1 would otherwise fail while building headers.
+    cleaned = "".join(
+        c if (c.isascii() and c.isalnum()) or c in " -_" else "_"
+        for c in (document.title or "document")
+    )
+    safe_title = cleaned.strip()[:80] or "document"
+
+    return StreamingResponse(
+        io.BytesIO(markdown.encode("utf-8")),
+        media_type="text/markdown; charset=utf-8",
+        headers={"Content-Disposition": f'attachment; filename="{safe_title}.md"'},
+    )
 
 
 @router.post("/search-spaces/{search_space_id}/documents/{document_id}/save")
@@ -258,9 +319,7 @@ async def export_document(
     )
 
     result = await session.execute(
-        select(Document)
-        .options(selectinload(Document.chunks))
-        .filter(
+        select(Document).filter(
             Document.id == document_id,
             Document.search_space_id == search_space_id,
         )
@@ -269,16 +328,20 @@ async def export_document(
     if not document:
         raise HTTPException(status_code=404, detail="Document not found")
 
-    # Resolve markdown content (same priority as editor-content endpoint)
     markdown_content: str | None = document.source_markdown
     if markdown_content is None and document.blocknote_document:
         from app.utils.blocknote_to_markdown import blocknote_to_markdown
 
         markdown_content = blocknote_to_markdown(document.blocknote_document)
     if markdown_content is None:
-        chunks = sorted(document.chunks, key=lambda c: c.id)
-        if chunks:
-            markdown_content = "\n\n".join(chunk.content for chunk in chunks)
+        chunk_contents_result = await session.execute(
+            select(Chunk.content)
+            .filter(Chunk.document_id == document_id)
+            .order_by(Chunk.id)
+        )
+        chunk_contents = chunk_contents_result.scalars().all()
+        if chunk_contents:
+            markdown_content = "\n\n".join(chunk_contents)
 
     if not markdown_content or not markdown_content.strip():
         raise HTTPException(status_code=400, detail="Document has no content to export")
diff --git a/surfsense_backend/app/schemas/documents.py b/surfsense_backend/app/schemas/documents.py
index c022a09d2..49d2836b2 100644
--- a/surfsense_backend/app/schemas/documents.py
+++ b/surfsense_backend/app/schemas/documents.py
@@ -53,25 +53,26 @@ class DocumentRead(BaseModel):
     title: str
     document_type: DocumentType
     document_metadata: dict
-    content: str  # Changed to string to match frontend
+    content: str = ""
+    content_preview: str = ""
     content_hash: str
     unique_identifier_hash: str | None
     created_at: datetime
     updated_at: datetime | None
     search_space_id: int
     folder_id: int | None = None
-    created_by_id: UUID | None = None  # User who created/uploaded this document
+    created_by_id: UUID | None = None
     created_by_name: str | None = None
     created_by_email: str | None = None
-    status: DocumentStatusSchema | None = (
-        None  # Processing status (ready, processing, failed)
-    )
+    status: DocumentStatusSchema | None = None
 
     model_config = ConfigDict(from_attributes=True)
 
 
 class DocumentWithChunksRead(DocumentRead):
     chunks: list[ChunkRead] = []
+    total_chunks: int = 0
+    chunk_start_index: int = 0
 
     model_config = ConfigDict(from_attributes=True)
 
diff --git a/surfsense_backend/app/tasks/chat/stream_new_chat.py b/surfsense_backend/app/tasks/chat/stream_new_chat.py
index 7c1e3b7ea..5ff907459 100644
--- a/surfsense_backend/app/tasks/chat/stream_new_chat.py
+++ b/surfsense_backend/app/tasks/chat/stream_new_chat.py
@@ -39,7 +39,6 @@ from app.agents.new_chat.llm_config import (
 )
 from app.db import (
     ChatVisibility,
-    Document,
     NewChatMessage,
     NewChatThread,
     Report,
@@ -63,74 +62,6 @@ _perf_log = get_perf_logger()
 _background_tasks: set[asyncio.Task] = set()
 
 
-def format_mentioned_documents_as_context(documents: list[Document]) -> str:
-    """
-    Format mentioned documents as context for the agent.
-
-    Uses the same XML structure as knowledge_base.format_documents_for_context
-    to ensure citations work properly with chunk IDs.
-    """
-    if not documents:
-        return ""
-
-    context_parts = ["<mentioned_documents>"]
-    context_parts.append(
-        "The user has explicitly mentioned the following documents from their knowledge base. "
-        "These documents are directly relevant to the query and should be prioritized as primary sources. "
-        "Use [citation:CHUNK_ID] format for citations (e.g., [citation:123])."
-    )
-    context_parts.append("")
-
-    for doc in documents:
-        # Build metadata JSON
-        metadata = doc.document_metadata or {}
-        metadata_json = json.dumps(metadata, ensure_ascii=False)
-
-        # Get URL from metadata
-        url = (
-            metadata.get("url")
-            or metadata.get("source")
-            or metadata.get("page_url")
-            or ""
-        )
-
-        context_parts.append("<document>")
-        context_parts.append("<document_metadata>")
-        context_parts.append(f"  <document_id>{doc.id}</document_id>")
-        context_parts.append(
-            f"  <document_type>{doc.document_type.value}</document_type>"
-        )
-        context_parts.append(f"  <title><![CDATA[{doc.title}]]></title>")
-        context_parts.append(f"  <url><![CDATA[{url}]]></url>")
-        context_parts.append(
-            f"  <metadata_json><![CDATA[{metadata_json}]]></metadata_json>"
-        )
-        context_parts.append("</document_metadata>")
-        context_parts.append("")
-        context_parts.append("<document_content>")
-
-        # Use chunks if available (preferred for proper citations)
-        if hasattr(doc, "chunks") and doc.chunks:
-            for chunk in doc.chunks:
-                context_parts.append(
-                    f"  <chunk id='{chunk.id}'><![CDATA[{chunk.content}]]></chunk>"
-                )
-        else:
-            # Fallback to document content if chunks not loaded
-            # Use document ID as chunk ID prefix for consistency
-            context_parts.append(
-                f"  <chunk id='{doc.id}'><![CDATA[{doc.content}]]></chunk>"
-            )
-
-        context_parts.append("</document_content>")
-        context_parts.append("</document>")
-        context_parts.append("")
-
-    context_parts.append("</mentioned_documents>")
-
-    return "\n".join(context_parts)
-
-
 def format_mentioned_surfsense_docs_as_context(
     documents: list[SurfsenseDocsDocument],
 ) -> str:
@@ -1317,6 +1248,7 @@ async def stream_new_chat(
             firecrawl_api_key=firecrawl_api_key,
             thread_visibility=visibility,
             disabled_tools=disabled_tools,
+            mentioned_document_ids=mentioned_document_ids,
         )
         _perf_log.info(
             "[stream_new_chat] Agent created in %.3fs", time.perf_counter() - _t0
@@ -1340,18 +1272,9 @@ async def stream_new_chat(
                 thread.needs_history_bootstrap = False
                 await session.commit()
 
-        # Fetch mentioned documents if any (with chunks for proper citations)
-        mentioned_documents: list[Document] = []
-        if mentioned_document_ids:
-            result = await session.execute(
-                select(Document)
-                .options(selectinload(Document.chunks))
-                .filter(
-                    Document.id.in_(mentioned_document_ids),
-                    Document.search_space_id == search_space_id,
-                )
-            )
-            mentioned_documents = list(result.scalars().all())
+        # Mentioned KB documents are now handled by KnowledgeBaseSearchMiddleware
+        # which merges them into the scoped filesystem with full document
+        # structure. Only SurfSense docs and report context are inlined here.
 
         # Fetch mentioned SurfSense docs if any
         mentioned_surfsense_docs: list[SurfsenseDocsDocument] = []
@@ -1379,15 +1302,10 @@ async def stream_new_chat(
         )
         recent_reports = list(recent_reports_result.scalars().all())
 
-        # Format the user query with context (mentioned documents + SurfSense docs)
+        # Format the user query with context (SurfSense docs + reports only)
         final_query = user_query
         context_parts = []
 
-        if mentioned_documents:
-            context_parts.append(
-                format_mentioned_documents_as_context(mentioned_documents)
-            )
-
         if mentioned_surfsense_docs:
             context_parts.append(
                 format_mentioned_surfsense_docs_as_context(mentioned_surfsense_docs)
@@ -1479,7 +1397,7 @@ async def stream_new_chat(
         yield streaming_service.format_start_step()
 
         # Initial thinking step - analyzing the request
-        if mentioned_documents or mentioned_surfsense_docs:
+        if mentioned_surfsense_docs:
             initial_title = "Analyzing referenced content"
             action_verb = "Analyzing"
         else:
@@ -1490,18 +1408,6 @@ async def stream_new_chat(
         query_text = user_query[:80] + ("..." if len(user_query) > 80 else "")
         processing_parts.append(query_text)
 
-        if mentioned_documents:
-            doc_names = []
-            for doc in mentioned_documents:
-                title = doc.title
-                if len(title) > 30:
-                    title = title[:27] + "..."
-                doc_names.append(title)
-            if len(doc_names) == 1:
-                processing_parts.append(f"[{doc_names[0]}]")
-            else:
-                processing_parts.append(f"[{len(doc_names)} documents]")
-
         if mentioned_surfsense_docs:
             doc_names = []
             for doc in mentioned_surfsense_docs:
@@ -1527,7 +1433,7 @@ async def stream_new_chat(
         # These ORM objects (with eagerly-loaded chunks) can be very large.
         # They're only needed to build context strings already copied into
         # final_query / langchain_messages — release them before streaming.
-        del mentioned_documents, mentioned_surfsense_docs, recent_reports
+        del mentioned_surfsense_docs, recent_reports
         del langchain_messages, final_query
 
         # Check if this is the first assistant response so we can generate
diff --git a/surfsense_backend/app/tasks/document_processors/__init__.py b/surfsense_backend/app/tasks/document_processors/__init__.py
index e70c41cb4..2b5690d02 100644
--- a/surfsense_backend/app/tasks/document_processors/__init__.py
+++ b/surfsense_backend/app/tasks/document_processors/__init__.py
@@ -12,16 +12,14 @@ Available processors:
 - YouTube processor: Process YouTube videos and extract transcripts
 """
 
-# URL crawler
 # Extension processor
-from .extension_processor import add_extension_received_document
-
-# File processors
-from .file_processors import (
+# File processors (backward-compatible re-exports from _save)
+from ._save import (
     add_received_file_document_using_docling,
     add_received_file_document_using_llamacloud,
     add_received_file_document_using_unstructured,
 )
+from .extension_processor import add_extension_received_document
 
 # Markdown processor
 from .markdown_processor import add_received_markdown_file_document
@@ -32,9 +30,9 @@ from .youtube_processor import add_youtube_video_document
 __all__ = [
     # Extension processing
     "add_extension_received_document",
+    # File processing with different ETL services
     "add_received_file_document_using_docling",
     "add_received_file_document_using_llamacloud",
-    # File processing with different ETL services
     "add_received_file_document_using_unstructured",
     # Markdown file processing
     "add_received_markdown_file_document",
diff --git a/surfsense_backend/app/tasks/document_processors/_constants.py b/surfsense_backend/app/tasks/document_processors/_constants.py
new file mode 100644
index 000000000..f74d7acce
--- /dev/null
+++ b/surfsense_backend/app/tasks/document_processors/_constants.py
@@ -0,0 +1,74 @@
+"""
+Constants for file document processing.
+
+Centralizes file type classification, LlamaCloud retry configuration,
+and timeout calculation parameters.
+"""
+
+import ssl
+from enum import Enum
+
+import httpx
+
+# ---------------------------------------------------------------------------
+# File type classification
+# ---------------------------------------------------------------------------
+
+MARKDOWN_EXTENSIONS = (".md", ".markdown", ".txt")
+AUDIO_EXTENSIONS = (".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm")
+DIRECT_CONVERT_EXTENSIONS = (".csv", ".tsv", ".html", ".htm")
+
+
+class FileCategory(Enum):  # processing category returned by classify_file
+    MARKDOWN = "markdown"  # suffix in MARKDOWN_EXTENSIONS
+    AUDIO = "audio"  # suffix in AUDIO_EXTENSIONS
+    DIRECT_CONVERT = "direct_convert"  # suffix in DIRECT_CONVERT_EXTENSIONS
+    DOCUMENT = "document"  # fallback: no extension list matched
+
+
+def classify_file(filename: str) -> FileCategory:
+    """Return the processing category matching the filename's extension."""
+    name = filename.lower()
+    table = (
+        (MARKDOWN_EXTENSIONS, FileCategory.MARKDOWN),
+        (AUDIO_EXTENSIONS, FileCategory.AUDIO),
+        (DIRECT_CONVERT_EXTENSIONS, FileCategory.DIRECT_CONVERT),
+    )
+    hits = (category for exts, category in table if name.endswith(exts))
+    return next(hits, FileCategory.DOCUMENT)  # no list matched: generic document
+
+
+# ---------------------------------------------------------------------------
+# LlamaCloud retry configuration
+# ---------------------------------------------------------------------------
+
+LLAMACLOUD_MAX_RETRIES = 5  # total attempts, including the first one
+LLAMACLOUD_BASE_DELAY = 10  # seconds (exponential backoff base)
+LLAMACLOUD_MAX_DELAY = 120  # cap on the pre-jitter retry delay (2 minutes)
+LLAMACLOUD_RETRYABLE_EXCEPTIONS = (
+    ssl.SSLError,
+    httpx.ConnectError,
+    httpx.ConnectTimeout,
+    httpx.ReadError,
+    httpx.ReadTimeout,
+    httpx.WriteError,
+    httpx.WriteTimeout,
+    httpx.RemoteProtocolError,
+    httpx.LocalProtocolError,
+    ConnectionError,
+    ConnectionResetError,
+    TimeoutError,
+    OSError,  # base class of ssl.SSLError/ConnectionError/TimeoutError above
+)
+
+# ---------------------------------------------------------------------------
+# Timeout calculation constants
+# ---------------------------------------------------------------------------
+
+UPLOAD_BYTES_PER_SECOND_SLOW = (
+    100 * 1024
+)  # 100 KB/s (conservative for slow connections)
+MIN_UPLOAD_TIMEOUT = 120  # Minimum 2 minutes for any file
+MAX_UPLOAD_TIMEOUT = 1800  # Maximum 30 minutes for very large files
+BASE_JOB_TIMEOUT = 600  # 10 minutes base for job processing
+PER_PAGE_JOB_TIMEOUT = 60  # 1 minute per page for processing
diff --git a/surfsense_backend/app/tasks/document_processors/_direct_converters.py b/surfsense_backend/app/tasks/document_processors/_direct_converters.py
new file mode 100644
index 000000000..b1a69ef4f
--- /dev/null
+++ b/surfsense_backend/app/tasks/document_processors/_direct_converters.py
@@ -0,0 +1,90 @@
+"""
+Lossless file-to-markdown converters for text-based formats.
+
+These converters handle file types that can be faithfully represented as
+markdown without any external ETL/OCR service:
+
+- CSV / TSV  → markdown table  (stdlib ``csv``)
+- HTML / HTM → markdown        (``markdownify``)
+"""
+
+from __future__ import annotations
+
+import csv
+from collections.abc import Callable
+from pathlib import Path
+
+from markdownify import markdownify
+
+# The stdlib csv module defaults to a 128 KB field-size limit which is too
+# small for real-world exports (e.g. chat logs, CRM dumps).  We raise it once
+# at import time so every csv.reader call in this module can handle large fields.
+csv.field_size_limit(2**31 - 1)
+
+
+def _escape_pipe(cell: str) -> str:
+    """Escape pipes and turn line breaks into ``<br>`` so a cell stays on one row."""
+    return "<br>".join(cell.replace("|", "\\|").splitlines())
+
+
+def csv_to_markdown(file_path: str, *, delimiter: str = ",") -> str:
+    """Convert a CSV (or TSV) file to a markdown table.
+
+    The first row is the header; an empty file returns "". The file is read
+    as ``utf-8-sig`` so a leading BOM never ends up in the first header cell.
+    """
+    with open(file_path, encoding="utf-8-sig", newline="") as fh:
+        reader = csv.reader(fh, delimiter=delimiter)
+        rows = list(reader)
+
+    if not rows:
+        return ""
+
+    header, *body = rows
+    col_count = len(header)
+
+    lines: list[str] = []
+    # Header row, then the separator row markdown needs to recognise a table.
+    header_cells = [_escape_pipe(c.strip()) for c in header]
+    lines.append("| " + " | ".join(header_cells) + " |")
+    lines.append("| " + " | ".join(["---"] * col_count) + " |")
+
+    for row in body:
+        # Pad short rows and drop surplus cells so each row has col_count cells.
+        padded = row + [""] * (col_count - len(row))
+        cells = [_escape_pipe(c.strip()) for c in padded[:col_count]]
+        lines.append("| " + " | ".join(cells) + " |")
+    return "\n".join(lines) + "\n"
+
+
+def tsv_to_markdown(file_path: str) -> str:
+    """Render a tab-separated file as a markdown table via ``csv_to_markdown``."""
+    return csv_to_markdown(delimiter="\t", file_path=file_path)
+
+
+def html_to_markdown(file_path: str) -> str:
+    """Convert an HTML file to markdown via ``markdownify``, ignoring a UTF-8 BOM."""
+    html = Path(file_path).read_text(encoding="utf-8-sig")  # drops a leading BOM
+    return markdownify(html).strip()
+
+
+_CONVERTER_MAP: dict[str, Callable[..., str]] = {
+    ".csv": csv_to_markdown,
+    ".tsv": tsv_to_markdown,
+    ".html": html_to_markdown,
+    ".htm": html_to_markdown,
+}
+
+
+def convert_file_directly(file_path: str, filename: str) -> str:
+    """Run the lossless converter registered for ``filename``'s extension.
+
+    The lookup key is the lowercased suffix of ``filename``; ``file_path`` is
+    the file that is read. Raises ``ValueError`` when no converter is
+    registered for that suffix.
+    """
+    ext = Path(filename).suffix.lower()
+    if ext in _CONVERTER_MAP:
+        return _CONVERTER_MAP[ext](file_path)
+    # Unknown extension: surface it to the caller rather than guessing.
+    raise ValueError(f"No direct converter for extension '{ext}' (file: {filename})")
diff --git a/surfsense_backend/app/tasks/document_processors/_etl.py b/surfsense_backend/app/tasks/document_processors/_etl.py
new file mode 100644
index 000000000..cc3a8b1ac
--- /dev/null
+++ b/surfsense_backend/app/tasks/document_processors/_etl.py
@@ -0,0 +1,209 @@
+"""
+ETL parsing strategies for different document processing services.
+
+Provides parse functions for Unstructured, LlamaCloud, and Docling, along with
+LlamaCloud retry logic and dynamic timeout calculations.
+"""
+
+import asyncio
+import logging
+import os
+import random
+import warnings
+from logging import ERROR, getLogger
+
+import httpx
+
+from app.config import config as app_config
+from app.db import Log
+from app.services.task_logging_service import TaskLoggingService
+
+from ._constants import (
+    LLAMACLOUD_BASE_DELAY,
+    LLAMACLOUD_MAX_DELAY,
+    LLAMACLOUD_MAX_RETRIES,
+    LLAMACLOUD_RETRYABLE_EXCEPTIONS,
+    PER_PAGE_JOB_TIMEOUT,
+)
+from ._helpers import calculate_job_timeout, calculate_upload_timeout
+
+# ---------------------------------------------------------------------------
+# LlamaCloud parsing with retry
+# ---------------------------------------------------------------------------
+
+
+async def parse_with_llamacloud_retry(
+    file_path: str,
+    estimated_pages: int,
+    task_logger: TaskLoggingService | None = None,
+    log_entry: Log | None = None,
+):
+    """
+    Parse a file with LlamaCloud, retrying on transient SSL/connection errors.
+
+    Timeouts are derived from the file size and estimated page count; retries
+    use capped exponential backoff with +/-25% jitter between attempts.
+
+    Returns:
+        The value returned by ``LlamaParse.aparse``.
+
+    Raises:
+        Exception: Last retryable error after the final attempt; others at once.
+    """
+    from llama_cloud_services import LlamaParse
+    from llama_cloud_services.parse.utils import ResultType
+
+    file_size_bytes = os.path.getsize(file_path)
+    file_size_mb = file_size_bytes / (1024 * 1024)
+
+    upload_timeout = calculate_upload_timeout(file_size_bytes)
+    job_timeout = calculate_job_timeout(estimated_pages, file_size_bytes)
+
+    custom_timeout = httpx.Timeout(
+        connect=120.0,
+        read=upload_timeout,
+        write=upload_timeout,
+        pool=120.0,
+    )
+
+    logging.info(
+        f"LlamaCloud upload configured: file_size={file_size_mb:.1f}MB, "
+        f"pages={estimated_pages}, upload_timeout={upload_timeout:.0f}s, "
+        f"job_timeout={job_timeout:.0f}s"
+    )
+
+    last_exception = None
+    attempt_errors: list[str] = []
+
+    for attempt in range(1, LLAMACLOUD_MAX_RETRIES + 1):  # 1-based attempts
+        try:
+            async with httpx.AsyncClient(timeout=custom_timeout) as custom_client:
+                parser = LlamaParse(
+                    api_key=app_config.LLAMA_CLOUD_API_KEY,
+                    num_workers=1,
+                    verbose=True,
+                    language="en",
+                    result_type=ResultType.MD,
+                    max_timeout=int(max(2000, job_timeout + upload_timeout)),
+                    job_timeout_in_seconds=job_timeout,
+                    job_timeout_extra_time_per_page_in_seconds=PER_PAGE_JOB_TIMEOUT,
+                    custom_client=custom_client,
+                )
+                result = await parser.aparse(file_path)
+
+                if attempt > 1:
+                    logging.info(
+                        f"LlamaCloud upload succeeded on attempt {attempt} after "
+                        f"{len(attempt_errors)} failures"
+                    )
+                return result
+
+        except LLAMACLOUD_RETRYABLE_EXCEPTIONS as e:
+            last_exception = e
+            error_type = type(e).__name__
+            error_msg = str(e)[:200]  # keep logged messages bounded
+            attempt_errors.append(f"Attempt {attempt}: {error_type} - {error_msg}")
+
+            if attempt < LLAMACLOUD_MAX_RETRIES:
+                base_delay = min(
+                    LLAMACLOUD_BASE_DELAY * (2 ** (attempt - 1)),
+                    LLAMACLOUD_MAX_DELAY,
+                )
+                jitter = base_delay * 0.25 * (2 * random.random() - 1)  # +/-25%
+                delay = base_delay + jitter
+
+                if task_logger and log_entry:
+                    await task_logger.log_task_progress(
+                        log_entry,
+                        f"LlamaCloud upload failed "
+                        f"(attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}), "
+                        f"retrying in {delay:.0f}s",
+                        {
+                            "error_type": error_type,
+                            "error_message": error_msg,
+                            "attempt": attempt,
+                            "retry_delay": delay,
+                            "file_size_mb": round(file_size_mb, 1),
+                            "upload_timeout": upload_timeout,
+                        },
+                    )
+                else:
+                    logging.warning(
+                        f"LlamaCloud upload failed "
+                        f"(attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}): "
+                        f"{error_type}. File: {file_size_mb:.1f}MB. "
+                        f"Retrying in {delay:.0f}s..."
+                    )
+
+                await asyncio.sleep(delay)
+            else:
+                logging.error(
+                    f"LlamaCloud upload failed after {LLAMACLOUD_MAX_RETRIES} "
+                    f"attempts. File size: {file_size_mb:.1f}MB, "
+                    f"Pages: {estimated_pages}. "
+                    f"Errors: {'; '.join(attempt_errors)}"
+                )
+
+        except Exception:  # not retryable: propagate unchanged
+            raise
+
+    raise last_exception or RuntimeError(  # every attempt failed
+        f"LlamaCloud parsing failed after {LLAMACLOUD_MAX_RETRIES} retries. "
+        f"File size: {file_size_mb:.1f}MB"
+    )
+
+
+# ---------------------------------------------------------------------------
+# Per-service parse functions
+# ---------------------------------------------------------------------------
+
+
+async def parse_with_unstructured(file_path: str):
+    """
+    Load ``file_path`` through ``UnstructuredLoader`` and return its result.
+
+    Returns:
+        Whatever ``UnstructuredLoader.aload()`` produces for the file.
+    """
+    from langchain_unstructured import UnstructuredLoader
+
+    loader_options = {
+        "mode": "elements",
+        "post_processors": [],
+        "languages": ["eng"],
+        "include_orig_elements": False,
+        "include_metadata": False,
+        "strategy": "auto",
+    }
+    elements = await UnstructuredLoader(file_path, **loader_options).aload()
+    return elements
+
+
+async def parse_with_docling(file_path: str, filename: str) -> str:
+    """
+    Run the Docling service on a file and return the ``content`` of its result.
+
+    pdfminer warnings are filtered and its logger is raised to ERROR for the
+    duration of the call; the logger's previous level is restored afterwards.
+    """
+    from app.services.docling_service import create_docling_service
+
+    service = create_docling_service()
+    noisy_logger = getLogger("pdfminer")
+    previous_level = noisy_logger.level
+    ignored_messages = (
+        ".*Cannot set gray non-stroke color.*",
+        ".*invalid float value.*",
+    )
+
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", category=UserWarning, module="pdfminer")
+        for pattern in ignored_messages:
+            warnings.filterwarnings("ignore", message=pattern)
+        noisy_logger.setLevel(ERROR)
+        try:
+            parsed = await service.process_document(file_path, filename)
+        finally:
+            noisy_logger.setLevel(previous_level)
+
+    return parsed["content"]
diff --git a/surfsense_backend/app/tasks/document_processors/_helpers.py b/surfsense_backend/app/tasks/document_processors/_helpers.py
new file mode 100644
index 000000000..7ac05932c
--- /dev/null
+++ b/surfsense_backend/app/tasks/document_processors/_helpers.py
@@ -0,0 +1,218 @@
+"""
+Document helper functions for deduplication, migration, connector updates, and timeouts.
+
+Provides reusable logic shared across file processors and ETL strategies.
+"""
+
+import logging
+
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db import Document, DocumentStatus, DocumentType
+from app.utils.document_converters import generate_unique_identifier_hash
+
+from ._constants import (
+    BASE_JOB_TIMEOUT,
+    MAX_UPLOAD_TIMEOUT,
+    MIN_UPLOAD_TIMEOUT,
+    PER_PAGE_JOB_TIMEOUT,
+    UPLOAD_BYTES_PER_SECOND_SLOW,
+)
+from .base import (
+    check_document_by_unique_identifier,
+    check_duplicate_document,
+)
+
+# ---------------------------------------------------------------------------
+# Unique identifier helpers
+# ---------------------------------------------------------------------------
+
+
+def get_google_drive_unique_identifier(
+    connector: dict | None,
+    filename: str,
+    search_space_id: int,
+) -> tuple[str, str | None]:
+    """
+    Build the unique-identifier hash(es) used to look up a file's document.
+
+    Returns:
+        Tuple of (primary_hash, legacy_hash or None).
+        Google Drive connector with a ``google_drive_file_id``: (file_id-based
+        hash, filename-based hash kept for migration).
+        Anything else: (filename-based ``DocumentType.FILE`` hash, None).
+    """
+    file_id = None
+    if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
+        file_id = connector.get("metadata", {}).get("google_drive_file_id")
+
+    if not file_id:
+        # Also the fallback for Drive connectors that carry no file id.
+        fallback_hash = generate_unique_identifier_hash(
+            DocumentType.FILE, filename, search_space_id
+        )
+        return fallback_hash, None
+
+    # Drive file with an id: the filename hash is returned for legacy lookups.
+    drive_type = DocumentType.GOOGLE_DRIVE_FILE
+    id_hash = generate_unique_identifier_hash(drive_type, file_id, search_space_id)
+    name_hash = generate_unique_identifier_hash(drive_type, filename, search_space_id)
+    return id_hash, name_hash
+
+
+# ---------------------------------------------------------------------------
+# Document deduplication and migration
+# ---------------------------------------------------------------------------
+
+
+async def handle_existing_document_update(
+    session: AsyncSession,
+    existing_document: Document,
+    content_hash: str,
+    connector: dict | None,
+    filename: str,
+    primary_hash: str,
+) -> tuple[bool, Document | None]:
+    """
+    Decide whether an already-stored document must be re-processed.
+
+    Returns a (should_skip_processing, document_to_return) tuple:
+        - (True, document): content unchanged, or content-hash collision
+        - (True, None): collision, and the pending/processing row was deleted
+        - (False, None): content changed, needs re-processing
+    """
+    # NOTE(review): this also fires for documents matched only by content_hash;
+    # confirm that rewriting their identifier is intended.
+    if existing_document.unique_identifier_hash != primary_hash:
+        existing_document.unique_identifier_hash = primary_hash
+        logging.info(f"Migrated document to file_id-based identifier: {filename}")
+
+    if existing_document.content_hash == content_hash:
+        if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
+            connector_metadata = connector.get("metadata", {})
+            new_name = connector_metadata.get("google_drive_file_name")
+            doc_metadata = existing_document.document_metadata or {}
+            old_name = doc_metadata.get("FILE_NAME") or doc_metadata.get(
+                "google_drive_file_name"
+            )
+
+            if new_name and old_name and old_name != new_name:
+                from sqlalchemy.orm.attributes import flag_modified
+                # old_name is truthy, so document_metadata is non-empty here.
+                existing_document.title = new_name
+                existing_document.document_metadata["FILE_NAME"] = new_name
+                existing_document.document_metadata["google_drive_file_name"] = new_name
+                flag_modified(existing_document, "document_metadata")
+                await session.commit()
+                logging.info(
+                    f"File renamed in Google Drive: '{old_name}' → '{new_name}' "
+                    f"(no re-processing needed)"
+                )
+
+        logging.info(f"Document for file {filename} unchanged. Skipping.")
+        return True, existing_document
+
+    # Content has changed — guard against content_hash collision before
+    # expensive ETL processing.
+    collision_doc = await check_duplicate_document(session, content_hash)
+    if collision_doc and collision_doc.id != existing_document.id:
+        logging.warning(
+            "Content-hash collision for %s: identical content exists in "
+            "document #%s (%s). Skipping re-processing.",
+            filename,
+            collision_doc.id,
+            collision_doc.document_type,
+        )
+        if DocumentStatus.is_state(
+            existing_document.status, DocumentStatus.PENDING
+        ) or DocumentStatus.is_state(
+            existing_document.status, DocumentStatus.PROCESSING
+        ):
+            await session.delete(existing_document)
+            await session.commit()
+            return True, None
+
+        return True, existing_document
+
+    logging.info(f"Content changed for file {filename}. Updating document.")
+    return False, None
+
+
+async def find_existing_document_with_migration(
+    session: AsyncSession,
+    primary_hash: str,
+    legacy_hash: str | None,
+    content_hash: str | None = None,
+) -> Document | None:
+    """
+    Look up an already-stored document, trying progressively looser keys.
+
+    Order: primary unique-identifier hash, then the legacy (filename-based)
+    hash when given, then ``content_hash`` when given. The first hit wins.
+    """
+    match = await check_document_by_unique_identifier(session, primary_hash)
+    if match:
+        return match
+
+    if legacy_hash:
+        match = await check_document_by_unique_identifier(session, legacy_hash)
+        if match:
+            logging.info(
+                "Found legacy document (filename-based hash), "
+                "will migrate to file_id-based hash"
+            )
+            return match
+
+    if content_hash:
+        match = await check_duplicate_document(session, content_hash)
+        if match:
+            logging.info(
+                f"Found duplicate content from different source (content_hash match). "
+                f"Original document ID: {match.id}, "
+                f"type: {match.document_type}"
+            )
+    return match
+
+
+# ---------------------------------------------------------------------------
+# Connector helpers
+# ---------------------------------------------------------------------------
+
+
+async def update_document_from_connector(
+    document: Document | None,
+    connector: dict | None,
+    session: AsyncSession,
+) -> None:
+    """Apply connector type, merged metadata and connector_id, then commit."""
+    if not (document and connector):
+        return  # nothing to apply (and nothing to commit)
+    if "type" in connector:
+        document.document_type = connector["type"]
+    if "metadata" in connector:
+        # Connector values win on key clashes; with no existing metadata the
+        # connector's dict is stored as-is.
+        current = document.document_metadata
+        incoming = connector["metadata"]
+        document.document_metadata = {**current, **incoming} if current else incoming
+    if "connector_id" in connector:
+        document.connector_id = connector["connector_id"]
+    await session.commit()
+
+
+# ---------------------------------------------------------------------------
+# Timeout calculations
+# ---------------------------------------------------------------------------
+
+
+def calculate_upload_timeout(file_size_bytes: int) -> float:
+    """Estimate an upload timeout: slow-link transfer time x1.5, clamped to MIN/MAX."""
+    transfer_time = file_size_bytes / UPLOAD_BYTES_PER_SECOND_SLOW
+    return max(MIN_UPLOAD_TIMEOUT, min(transfer_time * 1.5, MAX_UPLOAD_TIMEOUT))
+
+
+def calculate_job_timeout(estimated_pages: int, file_size_bytes: int) -> float:
+    """Return the larger of the page-count-based and file-size-based job timeouts."""
+    by_size = BASE_JOB_TIMEOUT + (file_size_bytes / (10 * 1024 * 1024)) * 60
+    by_pages = BASE_JOB_TIMEOUT + (estimated_pages * PER_PAGE_JOB_TIMEOUT)
+    return max(by_pages, by_size)
diff --git a/surfsense_backend/app/tasks/document_processors/_save.py b/surfsense_backend/app/tasks/document_processors/_save.py
new file mode 100644
index 000000000..5088ad004
--- /dev/null
+++ b/surfsense_backend/app/tasks/document_processors/_save.py
@@ -0,0 +1,285 @@
+"""
+Unified document save/update logic for file processors.
+
+Replaces the three nearly-identical ``add_received_file_document_using_*``
+functions with a single ``save_file_document`` function plus thin wrappers
+for backward compatibility.
+"""
+
+import logging
+
+from langchain_core.documents import Document as LangChainDocument
+from sqlalchemy.exc import SQLAlchemyError
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db import Document, DocumentStatus, DocumentType
+from app.services.llm_service import get_user_long_context_llm
+from app.utils.document_converters import (
+    create_document_chunks,
+    embed_text,
+    generate_content_hash,
+    generate_document_summary,
+)
+
+from ._helpers import (
+    find_existing_document_with_migration,
+    get_google_drive_unique_identifier,
+    handle_existing_document_update,
+)
+from .base import get_current_timestamp, safe_set_chunks
+
+# ---------------------------------------------------------------------------
+# Summary generation
+# ---------------------------------------------------------------------------
+
+
+async def _generate_summary(
+    markdown_content: str,
+    file_name: str,
+    etl_service: str,
+    user_llm,
+    enable_summary: bool,
+) -> tuple[str, list[float]]:
+    """
+    Build the summary text stored for a document, plus its embedding.
+
+    With summaries disabled the result is a short preview (file name and the
+    first 4000 characters). Otherwise ``"DOCLING"`` goes through the Docling
+    large-document summariser and every other ETL service through
+    ``generate_document_summary``.
+    """
+    if not enable_summary:
+        preview = f"File: {file_name}\n\n{markdown_content[:4000]}"
+        return preview, embed_text(preview)
+
+    summary_metadata = {
+        "file_name": file_name,
+        "etl_service": etl_service,
+        "document_type": "File Document",
+    }
+
+    if etl_service != "DOCLING":
+        # Standard summary (Unstructured / LlamaCloud / others)
+        return await generate_document_summary(
+            markdown_content, user_llm, summary_metadata
+        )
+
+    from app.services.docling_service import create_docling_service
+
+    summary_text = await create_docling_service().process_large_document_summary(
+        content=markdown_content, llm=user_llm, document_title=file_name
+    )
+
+    # Prefix the Docling summary with a markdown header listing the metadata.
+    header_lines = ["# DOCUMENT METADATA"]
+    header_lines.extend(
+        f"**{label.replace('_', ' ').title()}:** {value}"
+        for label, value in summary_metadata.items()
+        if value
+    )
+    enhanced = "\n".join(header_lines) + "\n\n# DOCUMENT SUMMARY\n\n" + summary_text
+    return enhanced, embed_text(enhanced)
+
+
+# ---------------------------------------------------------------------------
+# Unified save function
+# ---------------------------------------------------------------------------
+
+
+async def save_file_document(
+    session: AsyncSession,
+    file_name: str,
+    markdown_content: str,
+    search_space_id: int,
+    user_id: str,
+    etl_service: str,
+    connector: dict | None = None,
+    enable_summary: bool = True,
+) -> Document | None:
+    """
+    Process and store a file document with deduplication and migration support.
+
+    Single implementation behind the per-ETL-service wrappers; creates a new
+    document or updates the matching existing one.
+
+    Args:
+        session: Database session
+        file_name: Name of the processed file
+        markdown_content: Markdown content to store
+        search_space_id: ID of the search space
+        user_id: ID of the user
+        etl_service: Name of the ETL service (UNSTRUCTURED, LLAMACLOUD, DOCLING)
+        connector: Optional connector info for Google Drive files
+        enable_summary: Whether to generate an AI summary
+
+    Returns:
+        The saved (or unchanged existing) Document, or None for a dropped duplicate.
+    """
+    try:
+        primary_hash, legacy_hash = get_google_drive_unique_identifier(
+            connector, file_name, search_space_id
+        )
+        content_hash = generate_content_hash(markdown_content, search_space_id)
+        # Match order: primary hash, legacy hash, then content hash.
+        existing_document = await find_existing_document_with_migration(
+            session, primary_hash, legacy_hash, content_hash
+        )
+
+        if existing_document:
+            should_skip, doc = await handle_existing_document_update(
+                session,
+                existing_document,
+                content_hash,
+                connector,
+                file_name,
+                primary_hash,
+            )
+            if should_skip:
+                return doc
+        # NOTE(review): the LLM lookup is mandatory even when enable_summary is False.
+        user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
+        if not user_llm:
+            raise RuntimeError(
+                f"No long context LLM configured for user {user_id} "
+                f"in search space {search_space_id}"
+            )
+
+        summary_content, summary_embedding = await _generate_summary(
+            markdown_content, file_name, etl_service, user_llm, enable_summary
+        )
+        chunks = await create_document_chunks(markdown_content)
+        doc_metadata = {"FILE_NAME": file_name, "ETL_SERVICE": etl_service}
+        # Update path: refresh the matched document in place and mark it ready.
+        if existing_document:
+            existing_document.title = file_name
+            existing_document.content = summary_content
+            existing_document.content_hash = content_hash
+            existing_document.embedding = summary_embedding
+            existing_document.document_metadata = doc_metadata
+            await safe_set_chunks(session, existing_document, chunks)
+            existing_document.source_markdown = markdown_content
+            existing_document.content_needs_reindexing = False
+            existing_document.updated_at = get_current_timestamp()
+            existing_document.status = DocumentStatus.ready()
+
+            await session.commit()
+            await session.refresh(existing_document)
+            return existing_document
+        # Create path: the type is FILE unless the connector is a Google Drive one.
+        doc_type = DocumentType.FILE
+        if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
+            doc_type = DocumentType.GOOGLE_DRIVE_FILE
+
+        document = Document(
+            search_space_id=search_space_id,
+            title=file_name,
+            document_type=doc_type,
+            document_metadata=doc_metadata,
+            content=summary_content,
+            embedding=summary_embedding,
+            chunks=chunks,
+            content_hash=content_hash,
+            unique_identifier_hash=primary_hash,
+            source_markdown=markdown_content,
+            content_needs_reindexing=False,
+            updated_at=get_current_timestamp(),
+            created_by_id=user_id,
+            connector_id=connector.get("connector_id") if connector else None,
+            status=DocumentStatus.ready(),
+        )
+        session.add(document)
+        await session.commit()
+        await session.refresh(document)
+        return document
+    # A hit on ix_documents_content_hash is reported as a duplicate (None).
+    except SQLAlchemyError as db_error:
+        await session.rollback()
+        if "ix_documents_content_hash" in str(db_error):
+            logging.warning(
+                "content_hash collision during commit for %s (%s). Skipping.",
+                file_name,
+                etl_service,
+            )
+            return None
+        raise db_error
+    except Exception as e:
+        await session.rollback()
+        raise RuntimeError(
+            f"Failed to process file document using {etl_service}: {e!s}"
+        ) from e
+
+
+# ---------------------------------------------------------------------------
+# Backward-compatible wrapper functions
+# ---------------------------------------------------------------------------
+
+
+async def add_received_file_document_using_unstructured(
+    session: AsyncSession,
+    file_name: str,
+    unstructured_processed_elements: list[LangChainDocument],
+    search_space_id: int,
+    user_id: str,
+    connector: dict | None = None,
+    enable_summary: bool = True,
+) -> Document | None:
+    """
+    Convert Unstructured elements to markdown and save via ``save_file_document``.
+    """
+    from app.utils.document_converters import convert_document_to_markdown
+
+    markdown = await convert_document_to_markdown(unstructured_processed_elements)
+    return await save_file_document(
+        session=session,
+        file_name=file_name,
+        markdown_content=markdown,
+        search_space_id=search_space_id,
+        user_id=user_id,
+        etl_service="UNSTRUCTURED",
+        connector=connector,
+        enable_summary=enable_summary,
+    )
+
+
+async def add_received_file_document_using_llamacloud(
+    session: AsyncSession,
+    file_name: str,
+    llamacloud_markdown_document: str,
+    search_space_id: int,
+    user_id: str,
+    connector: dict | None = None,
+    enable_summary: bool = True,
+) -> Document | None:
+    """Save LlamaCloud-parsed markdown through ``save_file_document``."""
+    return await save_file_document(
+        session=session,
+        file_name=file_name,
+        markdown_content=llamacloud_markdown_document,
+        search_space_id=search_space_id,
+        user_id=user_id,
+        etl_service="LLAMACLOUD",
+        connector=connector,
+        enable_summary=enable_summary,
+    )
+
+
+async def add_received_file_document_using_docling(
+    session: AsyncSession,
+    file_name: str,
+    docling_markdown_document: str,
+    search_space_id: int,
+    user_id: str,
+    connector: dict | None = None,
+    enable_summary: bool = True,
+) -> Document | None:
+    """Save Docling-parsed markdown through ``save_file_document``."""
+    return await save_file_document(
+        session=session,
+        file_name=file_name,
+        markdown_content=docling_markdown_document,
+        search_space_id=search_space_id,
+        user_id=user_id,
+        etl_service="DOCLING",
+        connector=connector,
+        enable_summary=enable_summary,
+    )
diff --git a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py
index 6c0ae1870..0c1cad52d 100644
--- a/surfsense_backend/app/tasks/document_processors/file_processors.py
+++ b/surfsense_backend/app/tasks/document_processors/file_processors.py
@@ -1,905 +1,685 @@
 """
-File document processors for different ETL services (Unstructured, LlamaCloud, Docling).
+File document processors orchestrating content extraction and indexing.
+
+This module is the public entry point for file processing.  It delegates to
+specialised sub-modules that each own a single concern:
+
+- ``_constants``          — file type classification and configuration constants
+- ``_helpers``            — document deduplication, migration, connector helpers
+- ``_direct_converters``  — lossless file-to-markdown for csv/tsv/html
+- ``_etl``               — ETL parsing strategies (Unstructured, LlamaCloud, Docling)
+- ``_save``              — unified document creation / update logic
 """
 
-import asyncio
+from __future__ import annotations
+
 import contextlib
 import logging
-import ssl
-import warnings
+import os
+from dataclasses import dataclass, field
 from logging import ERROR, getLogger
 
-import httpx
 from fastapi import HTTPException
-from langchain_core.documents import Document as LangChainDocument
-from litellm import atranscription
-from sqlalchemy.exc import SQLAlchemyError
 from sqlalchemy.ext.asyncio import AsyncSession
 
 from app.config import config as app_config
-from app.db import Document, DocumentStatus, DocumentType, Log, Notification
-from app.indexing_pipeline.adapters.file_upload_adapter import UploadDocumentAdapter
-from app.services.llm_service import get_user_long_context_llm
+from app.db import Document, Log, Notification
 from app.services.notification_service import NotificationService
 from app.services.task_logging_service import TaskLoggingService
-from app.utils.document_converters import (
-    convert_document_to_markdown,
-    create_document_chunks,
-    embed_text,
-    generate_content_hash,
-    generate_document_summary,
-    generate_unique_identifier_hash,
-)
 
-from .base import (
-    check_document_by_unique_identifier,
-    check_duplicate_document,
-    get_current_timestamp,
-    safe_set_chunks,
+from ._constants import FileCategory, classify_file
+from ._direct_converters import convert_file_directly
+from ._etl import (
+    parse_with_docling,
+    parse_with_llamacloud_retry,
+    parse_with_unstructured,
+)
+from ._helpers import update_document_from_connector
+from ._save import (
+    add_received_file_document_using_docling,
+    add_received_file_document_using_llamacloud,
+    add_received_file_document_using_unstructured,
+    save_file_document,
 )
 from .markdown_processor import add_received_markdown_file_document
 
-# Constants for LlamaCloud retry configuration
-LLAMACLOUD_MAX_RETRIES = 5  # Increased from 3 for large file resilience
-LLAMACLOUD_BASE_DELAY = 10  # Base delay in seconds for exponential backoff
-LLAMACLOUD_MAX_DELAY = 120  # Maximum delay between retries (2 minutes)
-LLAMACLOUD_RETRYABLE_EXCEPTIONS = (
-    ssl.SSLError,
-    httpx.ConnectError,
-    httpx.ConnectTimeout,
-    httpx.ReadTimeout,
-    httpx.WriteTimeout,
-    httpx.RemoteProtocolError,
-    httpx.LocalProtocolError,
-    ConnectionError,
-    ConnectionResetError,
-    TimeoutError,
-    OSError,  # Catches various network-level errors
-)
-
-# Timeout calculation constants
-UPLOAD_BYTES_PER_SECOND_SLOW = (
-    100 * 1024
-)  # 100 KB/s (conservative for slow connections)
-MIN_UPLOAD_TIMEOUT = 120  # Minimum 2 minutes for any file
-MAX_UPLOAD_TIMEOUT = 1800  # Maximum 30 minutes for very large files
-BASE_JOB_TIMEOUT = 600  # 10 minutes base for job processing
-PER_PAGE_JOB_TIMEOUT = 60  # 1 minute per page for processing
+# Re-export public API so existing ``from file_processors import …`` keeps working.
+__all__ = [
+    "add_received_file_document_using_docling",
+    "add_received_file_document_using_llamacloud",
+    "add_received_file_document_using_unstructured",
+    "parse_with_llamacloud_retry",
+    "process_file_in_background",
+    "process_file_in_background_with_document",
+    "save_file_document",
+]
 
 
-def get_google_drive_unique_identifier(
-    connector: dict | None,
-    filename: str,
-    search_space_id: int,
-) -> tuple[str, str | None]:
-    """
-    Get unique identifier hash for a file, with special handling for Google Drive.
-
-    For Google Drive files, uses file_id as the unique identifier (doesn't change on rename).
-    For other files, uses filename.
-
-    Args:
-        connector: Optional connector info dict with type and metadata
-        filename: The filename (used for non-Google Drive files or as fallback)
-        search_space_id: The search space ID
-
-    Returns:
-        Tuple of (primary_hash, legacy_hash or None)
-        - For Google Drive: (file_id_based_hash, filename_based_hash for migration)
-        - For other sources: (filename_based_hash, None)
-    """
-    if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
-        metadata = connector.get("metadata", {})
-        file_id = metadata.get("google_drive_file_id")
-
-        if file_id:
-            # New method: use file_id as unique identifier (doesn't change on rename)
-            primary_hash = generate_unique_identifier_hash(
-                DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id
-            )
-            # Legacy method: for backward compatibility with existing documents
-            # that were indexed with filename-based hash
-            legacy_hash = generate_unique_identifier_hash(
-                DocumentType.GOOGLE_DRIVE_FILE, filename, search_space_id
-            )
-            return primary_hash, legacy_hash
-
-    # For non-Google Drive files, use filename as before
-    primary_hash = generate_unique_identifier_hash(
-        DocumentType.FILE, filename, search_space_id
-    )
-    return primary_hash, None
+# ---------------------------------------------------------------------------
+# Processing context (bundles parameters shared across handler functions)
+# ---------------------------------------------------------------------------
 
 
-async def handle_existing_document_update(
-    session: AsyncSession,
-    existing_document: Document,
-    content_hash: str,
-    connector: dict | None,
-    filename: str,
-    primary_hash: str,
-) -> tuple[bool, Document | None]:
-    """
-    Handle update logic for an existing document.
+@dataclass
+class _ProcessingContext:
+    session: AsyncSession
+    file_path: str
+    filename: str
+    search_space_id: int
+    user_id: str
+    task_logger: TaskLoggingService
+    log_entry: Log
+    connector: dict | None = None
+    notification: Notification | None = None
+    enable_summary: bool = field(init=False)
 
-    Args:
-        session: Database session
-        existing_document: The existing document found in database
-        content_hash: Hash of the new content
-        connector: Optional connector info
-        filename: Current filename
-        primary_hash: The primary hash (file_id based for Google Drive)
-
-    Returns:
-        Tuple of (should_skip_processing, document_to_return)
-        - (True, document): Content unchanged, just return existing document
-        - (False, None): Content changed, need to re-process
-    """
-    # Check if this document needs hash migration (found via legacy hash)
-    if existing_document.unique_identifier_hash != primary_hash:
-        existing_document.unique_identifier_hash = primary_hash
-        logging.info(f"Migrated document to file_id-based identifier: {filename}")
-
-    # Check if content has changed
-    if existing_document.content_hash == content_hash:
-        # Content unchanged - check if we need to update metadata (e.g., filename changed)
-        if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
-            connector_metadata = connector.get("metadata", {})
-            new_name = connector_metadata.get("google_drive_file_name")
-            # Check both possible keys for old name (FILE_NAME is used in stored documents)
-            doc_metadata = existing_document.document_metadata or {}
-            old_name = doc_metadata.get("FILE_NAME") or doc_metadata.get(
-                "google_drive_file_name"
-            )
-
-            if new_name and old_name and old_name != new_name:
-                # File was renamed - update title and metadata, skip expensive processing
-                from sqlalchemy.orm.attributes import flag_modified
-
-                existing_document.title = new_name
-                if not existing_document.document_metadata:
-                    existing_document.document_metadata = {}
-                existing_document.document_metadata["FILE_NAME"] = new_name
-                existing_document.document_metadata["google_drive_file_name"] = new_name
-                flag_modified(existing_document, "document_metadata")
-                await session.commit()
-                logging.info(
-                    f"File renamed in Google Drive: '{old_name}' → '{new_name}' (no re-processing needed)"
-                )
-
-        logging.info(f"Document for file {filename} unchanged. Skipping.")
-        return True, existing_document
-    else:
-        # Content has changed — guard against content_hash collision before
-        # expensive ETL processing.  A collision means the exact same content
-        # already lives in a *different* document (e.g. a manual upload of the
-        # same file).  Proceeding would trigger a unique-constraint violation
-        # on ix_documents_content_hash.
-        collision_doc = await check_duplicate_document(session, content_hash)
-        if collision_doc and collision_doc.id != existing_document.id:
-            logging.warning(
-                "Content-hash collision for %s: identical content exists in "
-                "document #%s (%s). Skipping re-processing.",
-                filename,
-                collision_doc.id,
-                collision_doc.document_type,
-            )
-            if DocumentStatus.is_state(
-                existing_document.status, DocumentStatus.PENDING
-            ) or DocumentStatus.is_state(
-                existing_document.status, DocumentStatus.PROCESSING
-            ):
-                # Pending/processing doc has no real content yet — remove it
-                # so the UI doesn't show a contentless entry.
-                await session.delete(existing_document)
-                await session.commit()
-                return True, None
-
-            # Document already has valid content — keep it as-is.
-            return True, existing_document
-
-        logging.info(f"Content changed for file {filename}. Updating document.")
-        return False, None
-
-
-async def find_existing_document_with_migration(
-    session: AsyncSession,
-    primary_hash: str,
-    legacy_hash: str | None,
-    content_hash: str | None = None,
-) -> Document | None:
-    """
-    Find existing document, checking both new hash and legacy hash for migration,
-    with fallback to content_hash for cross-source deduplication.
-
-    Args:
-        session: Database session
-        primary_hash: The primary hash (file_id based for Google Drive)
-        legacy_hash: The legacy hash (filename based) for migration, or None
-        content_hash: The content hash for fallback deduplication, or None
-
-    Returns:
-        Existing document if found, None otherwise
-    """
-    # First check with primary hash (new method)
-    existing_document = await check_document_by_unique_identifier(session, primary_hash)
-
-    # If not found and we have a legacy hash, check with that (migration path)
-    if not existing_document and legacy_hash:
-        existing_document = await check_document_by_unique_identifier(
-            session, legacy_hash
-        )
-        if existing_document:
-            logging.info(
-                "Found legacy document (filename-based hash), will migrate to file_id-based hash"
-            )
-
-    # Fallback: check by content_hash to catch duplicates from different sources
-    # This prevents unique constraint violations when the same content exists
-    # under a different unique_identifier (e.g., manual upload vs Google Drive)
-    if not existing_document and content_hash:
-        existing_document = await check_duplicate_document(session, content_hash)
-        if existing_document:
-            logging.info(
-                f"Found duplicate content from different source (content_hash match). "
-                f"Original document ID: {existing_document.id}, type: {existing_document.document_type}"
-            )
-
-    return existing_document
-
-
-def calculate_upload_timeout(file_size_bytes: int) -> float:
-    """
-    Calculate appropriate upload timeout based on file size.
-
-    Assumes a conservative slow connection speed to handle worst-case scenarios.
-
-    Args:
-        file_size_bytes: Size of the file in bytes
-
-    Returns:
-        Timeout in seconds
-    """
-    # Calculate time needed at slow connection speed
-    # Add 50% buffer for network variability and SSL overhead
-    estimated_time = (file_size_bytes / UPLOAD_BYTES_PER_SECOND_SLOW) * 1.5
-
-    # Clamp to reasonable bounds
-    return max(MIN_UPLOAD_TIMEOUT, min(estimated_time, MAX_UPLOAD_TIMEOUT))
-
-
-def calculate_job_timeout(estimated_pages: int, file_size_bytes: int) -> float:
-    """
-    Calculate job processing timeout based on page count and file size.
-
-    Args:
-        estimated_pages: Estimated number of pages
-        file_size_bytes: Size of the file in bytes
-
-    Returns:
-        Timeout in seconds
-    """
-    # Base timeout + time per page
-    page_based_timeout = BASE_JOB_TIMEOUT + (estimated_pages * PER_PAGE_JOB_TIMEOUT)
-
-    # Also consider file size (large images take longer to process)
-    # ~1 minute per 10MB of file size
-    size_based_timeout = BASE_JOB_TIMEOUT + (file_size_bytes / (10 * 1024 * 1024)) * 60
-
-    # Use the larger of the two estimates
-    return max(page_based_timeout, size_based_timeout)
-
-
-async def parse_with_llamacloud_retry(
-    file_path: str,
-    estimated_pages: int,
-    task_logger: TaskLoggingService | None = None,
-    log_entry: Log | None = None,
-):
-    """
-    Parse a file with LlamaCloud with retry logic for transient SSL/connection errors.
-
-    Uses dynamic timeout calculations based on file size and page count to handle
-    very large files reliably.
-
-    Args:
-        file_path: Path to the file to parse
-        estimated_pages: Estimated number of pages for timeout calculation
-        task_logger: Optional task logger for progress updates
-        log_entry: Optional log entry for progress updates
-
-    Returns:
-        LlamaParse result object
-
-    Raises:
-        Exception: If all retries fail
-    """
-    import os
-    import random
-
-    from llama_cloud_services import LlamaParse
-    from llama_cloud_services.parse.utils import ResultType
-
-    # Get file size for timeout calculations
-    file_size_bytes = os.path.getsize(file_path)
-    file_size_mb = file_size_bytes / (1024 * 1024)
-
-    # Calculate dynamic timeouts based on file size and page count
-    upload_timeout = calculate_upload_timeout(file_size_bytes)
-    job_timeout = calculate_job_timeout(estimated_pages, file_size_bytes)
-
-    # HTTP client timeouts - scaled based on file size
-    # Write timeout is critical for large file uploads
-    custom_timeout = httpx.Timeout(
-        connect=120.0,  # 2 minutes to establish connection (handles slow DNS, etc.)
-        read=upload_timeout,  # Dynamic based on file size
-        write=upload_timeout,  # Dynamic based on file size (upload time)
-        pool=120.0,  # 2 minutes to acquire connection from pool
-    )
-
-    logging.info(
-        f"LlamaCloud upload configured: file_size={file_size_mb:.1f}MB, "
-        f"pages={estimated_pages}, upload_timeout={upload_timeout:.0f}s, "
-        f"job_timeout={job_timeout:.0f}s"
-    )
-
-    last_exception = None
-    attempt_errors = []
-
-    for attempt in range(1, LLAMACLOUD_MAX_RETRIES + 1):
-        try:
-            # Create a fresh httpx client for each attempt
-            async with httpx.AsyncClient(timeout=custom_timeout) as custom_client:
-                # Create LlamaParse parser instance with optimized settings
-                parser = LlamaParse(
-                    api_key=app_config.LLAMA_CLOUD_API_KEY,
-                    num_workers=1,  # Use single worker for file processing
-                    verbose=True,
-                    language="en",
-                    result_type=ResultType.MD,
-                    # Timeout settings for large files
-                    max_timeout=int(max(2000, job_timeout + upload_timeout)),
-                    job_timeout_in_seconds=job_timeout,
-                    job_timeout_extra_time_per_page_in_seconds=PER_PAGE_JOB_TIMEOUT,
-                    # Use our custom client with larger timeouts
-                    custom_client=custom_client,
-                )
-
-                # Parse the file asynchronously
-                result = await parser.aparse(file_path)
-
-                # Success - log if we had previous failures
-                if attempt > 1:
-                    logging.info(
-                        f"LlamaCloud upload succeeded on attempt {attempt} after "
-                        f"{len(attempt_errors)} failures"
-                    )
-
-                return result
-
-        except LLAMACLOUD_RETRYABLE_EXCEPTIONS as e:
-            last_exception = e
-            error_type = type(e).__name__
-            error_msg = str(e)[:200]
-            attempt_errors.append(f"Attempt {attempt}: {error_type} - {error_msg}")
-
-            if attempt < LLAMACLOUD_MAX_RETRIES:
-                # Calculate exponential backoff with jitter
-                # Base delay doubles each attempt, capped at max delay
-                base_delay = min(
-                    LLAMACLOUD_BASE_DELAY * (2 ** (attempt - 1)), LLAMACLOUD_MAX_DELAY
-                )
-                # Add random jitter (±25%) to prevent thundering herd
-                jitter = base_delay * 0.25 * (2 * random.random() - 1)
-                delay = base_delay + jitter
-
-                if task_logger and log_entry:
-                    await task_logger.log_task_progress(
-                        log_entry,
-                        f"LlamaCloud upload failed (attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}), retrying in {delay:.0f}s",
-                        {
-                            "error_type": error_type,
-                            "error_message": error_msg,
-                            "attempt": attempt,
-                            "retry_delay": delay,
-                            "file_size_mb": round(file_size_mb, 1),
-                            "upload_timeout": upload_timeout,
-                        },
-                    )
-                else:
-                    logging.warning(
-                        f"LlamaCloud upload failed (attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}): "
-                        f"{error_type}. File: {file_size_mb:.1f}MB. Retrying in {delay:.0f}s..."
-                    )
-
-                await asyncio.sleep(delay)
-            else:
-                logging.error(
-                    f"LlamaCloud upload failed after {LLAMACLOUD_MAX_RETRIES} attempts. "
-                    f"File size: {file_size_mb:.1f}MB, Pages: {estimated_pages}. "
-                    f"Errors: {'; '.join(attempt_errors)}"
-                )
-
-        except Exception:
-            # Non-retryable exception, raise immediately
-            raise
-
-    # All retries exhausted
-    raise last_exception or RuntimeError(
-        f"LlamaCloud parsing failed after {LLAMACLOUD_MAX_RETRIES} retries. "
-        f"File size: {file_size_mb:.1f}MB"
-    )
-
-
-async def add_received_file_document_using_unstructured(
-    session: AsyncSession,
-    file_name: str,
-    unstructured_processed_elements: list[LangChainDocument],
-    search_space_id: int,
-    user_id: str,
-    connector: dict | None = None,
-    enable_summary: bool = True,
-) -> Document | None:
-    """
-    Process and store a file document using Unstructured service.
-
-    Args:
-        session: Database session
-        file_name: Name of the processed file
-        unstructured_processed_elements: Processed elements from Unstructured
-        search_space_id: ID of the search space
-        user_id: ID of the user
-        connector: Optional connector info for Google Drive files
-
-    Returns:
-        Document object if successful, None if failed
-    """
-    try:
-        file_in_markdown = await convert_document_to_markdown(
-            unstructured_processed_elements
+    def __post_init__(self) -> None:  # enable_summary: connector value, else True
+        self.enable_summary = (
+            self.connector.get("enable_summary", True) if self.connector else True
         )
 
-        # Generate unique identifier hash (uses file_id for Google Drive, filename for others)
-        primary_hash, legacy_hash = get_google_drive_unique_identifier(
-            connector, file_name, search_space_id
-        )
 
-        # Generate content hash
-        content_hash = generate_content_hash(file_in_markdown, search_space_id)
-
-        # Check if document exists (with migration support for Google Drive and content_hash fallback)
-        existing_document = await find_existing_document_with_migration(
-            session, primary_hash, legacy_hash, content_hash
-        )
-
-        if existing_document:
-            # Handle existing document (rename detection, content change check)
-            should_skip, doc = await handle_existing_document_update(
-                session,
-                existing_document,
-                content_hash,
-                connector,
-                file_name,
-                primary_hash,
-            )
-            if should_skip:
-                return doc
-            # Content changed - continue to update
-
-        # Get user's long context LLM (needed for both create and update)
-        user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
-        if not user_llm:
-            raise RuntimeError(
-                f"No long context LLM configured for user {user_id} in search space {search_space_id}"
-            )
-
-        # Generate summary with metadata
-        document_metadata = {
-            "file_name": file_name,
-            "etl_service": "UNSTRUCTURED",
-            "document_type": "File Document",
-        }
-        if enable_summary:
-            summary_content, summary_embedding = await generate_document_summary(
-                file_in_markdown, user_llm, document_metadata
-            )
-        else:
-            summary_content = f"File: {file_name}\n\n{file_in_markdown[:4000]}"
-            summary_embedding = embed_text(summary_content)
-
-        # Process chunks
-        chunks = await create_document_chunks(file_in_markdown)
-
-        # Update or create document
-        if existing_document:
-            # Update existing document
-            existing_document.title = file_name
-            existing_document.content = summary_content
-            existing_document.content_hash = content_hash
-            existing_document.embedding = summary_embedding
-            existing_document.document_metadata = {
-                "FILE_NAME": file_name,
-                "ETL_SERVICE": "UNSTRUCTURED",
-            }
-            await safe_set_chunks(session, existing_document, chunks)
-            existing_document.source_markdown = file_in_markdown
-            existing_document.content_needs_reindexing = False
-            existing_document.updated_at = get_current_timestamp()
-            existing_document.status = DocumentStatus.ready()
-
-            await session.commit()
-            await session.refresh(existing_document)
-            document = existing_document
-        else:
-            # Create new document
-            doc_type = DocumentType.FILE
-            if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
-                doc_type = DocumentType.GOOGLE_DRIVE_FILE
-
-            document = Document(
-                search_space_id=search_space_id,
-                title=file_name,
-                document_type=doc_type,
-                document_metadata={
-                    "FILE_NAME": file_name,
-                    "ETL_SERVICE": "UNSTRUCTURED",
-                },
-                content=summary_content,
-                embedding=summary_embedding,
-                chunks=chunks,
-                content_hash=content_hash,
-                unique_identifier_hash=primary_hash,
-                source_markdown=file_in_markdown,
-                content_needs_reindexing=False,
-                updated_at=get_current_timestamp(),
-                created_by_id=user_id,
-                connector_id=connector.get("connector_id") if connector else None,
-                status=DocumentStatus.ready(),
-            )
-
-            session.add(document)
-            await session.commit()
-            await session.refresh(document)
-
-        return document
-    except SQLAlchemyError as db_error:
-        await session.rollback()
-        if "ix_documents_content_hash" in str(db_error):
-            logging.warning(
-                "content_hash collision during commit for %s (Unstructured). Skipping.",
-                file_name,
-            )
-            return None
-        raise db_error
-    except Exception as e:
-        await session.rollback()
-        raise RuntimeError(f"Failed to process file document: {e!s}") from e
+# ---------------------------------------------------------------------------
+# Notification helper
+# ---------------------------------------------------------------------------
 
 
-async def add_received_file_document_using_llamacloud(
-    session: AsyncSession,
-    file_name: str,
-    llamacloud_markdown_document: str,
-    search_space_id: int,
-    user_id: str,
-    connector: dict | None = None,
-    enable_summary: bool = True,
-) -> Document | None:
-    """
-    Process and store document content parsed by LlamaCloud.
-
-    Args:
-        session: Database session
-        file_name: Name of the processed file
-        llamacloud_markdown_document: Markdown content from LlamaCloud parsing
-        search_space_id: ID of the search space
-        user_id: ID of the user
-        connector: Optional connector info for Google Drive files
-
-    Returns:
-        Document object if successful, None if failed
-    """
-    try:
-        # Combine all markdown documents into one
-        file_in_markdown = llamacloud_markdown_document
-
-        # Generate unique identifier hash (uses file_id for Google Drive, filename for others)
-        primary_hash, legacy_hash = get_google_drive_unique_identifier(
-            connector, file_name, search_space_id
-        )
-
-        # Generate content hash
-        content_hash = generate_content_hash(file_in_markdown, search_space_id)
-
-        # Check if document exists (with migration support for Google Drive and content_hash fallback)
-        existing_document = await find_existing_document_with_migration(
-            session, primary_hash, legacy_hash, content_hash
-        )
-
-        if existing_document:
-            # Handle existing document (rename detection, content change check)
-            should_skip, doc = await handle_existing_document_update(
-                session,
-                existing_document,
-                content_hash,
-                connector,
-                file_name,
-                primary_hash,
-            )
-            if should_skip:
-                return doc
-            # Content changed - continue to update
-
-        # Get user's long context LLM (needed for both create and update)
-        user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
-        if not user_llm:
-            raise RuntimeError(
-                f"No long context LLM configured for user {user_id} in search space {search_space_id}"
-            )
-
-        # Generate summary with metadata
-        document_metadata = {
-            "file_name": file_name,
-            "etl_service": "LLAMACLOUD",
-            "document_type": "File Document",
-        }
-        if enable_summary:
-            summary_content, summary_embedding = await generate_document_summary(
-                file_in_markdown, user_llm, document_metadata
-            )
-        else:
-            summary_content = f"File: {file_name}\n\n{file_in_markdown[:4000]}"
-            summary_embedding = embed_text(summary_content)
-
-        # Process chunks
-        chunks = await create_document_chunks(file_in_markdown)
-
-        # Update or create document
-        if existing_document:
-            existing_document.title = file_name
-            existing_document.content = summary_content
-            existing_document.content_hash = content_hash
-            existing_document.embedding = summary_embedding
-            existing_document.document_metadata = {
-                "FILE_NAME": file_name,
-                "ETL_SERVICE": "LLAMACLOUD",
-            }
-            await safe_set_chunks(session, existing_document, chunks)
-            existing_document.source_markdown = file_in_markdown
-            existing_document.content_needs_reindexing = False
-            existing_document.updated_at = get_current_timestamp()
-            existing_document.status = DocumentStatus.ready()
-
-            await session.commit()
-            await session.refresh(existing_document)
-            document = existing_document
-        else:
-            doc_type = DocumentType.FILE
-            if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
-                doc_type = DocumentType.GOOGLE_DRIVE_FILE
-
-            document = Document(
-                search_space_id=search_space_id,
-                title=file_name,
-                document_type=doc_type,
-                document_metadata={
-                    "FILE_NAME": file_name,
-                    "ETL_SERVICE": "LLAMACLOUD",
-                },
-                content=summary_content,
-                embedding=summary_embedding,
-                chunks=chunks,
-                content_hash=content_hash,
-                unique_identifier_hash=primary_hash,
-                source_markdown=file_in_markdown,
-                content_needs_reindexing=False,
-                updated_at=get_current_timestamp(),
-                created_by_id=user_id,
-                connector_id=connector.get("connector_id") if connector else None,
-                status=DocumentStatus.ready(),
-            )
-
-            session.add(document)
-            await session.commit()
-            await session.refresh(document)
-
-        return document
-    except SQLAlchemyError as db_error:
-        await session.rollback()
-        if "ix_documents_content_hash" in str(db_error):
-            logging.warning(
-                "content_hash collision during commit for %s (LlamaCloud). Skipping.",
-                file_name,
-            )
-            return None
-        raise db_error
-    except Exception as e:
-        await session.rollback()
-        raise RuntimeError(
-            f"Failed to process file document using LlamaCloud: {e!s}"
-        ) from e
-
-
-async def add_received_file_document_using_docling(
-    session: AsyncSession,
-    file_name: str,
-    docling_markdown_document: str,
-    search_space_id: int,
-    user_id: str,
-    connector: dict | None = None,
-    enable_summary: bool = True,
-) -> Document | None:
-    """
-    Process and store document content parsed by Docling.
-
-    Args:
-        session: Database session
-        file_name: Name of the processed file
-        docling_markdown_document: Markdown content from Docling parsing
-        search_space_id: ID of the search space
-        user_id: ID of the user
-        connector: Optional connector info for Google Drive files
-
-    Returns:
-        Document object if successful, None if failed
-    """
-    try:
-        file_in_markdown = docling_markdown_document
-
-        # Generate unique identifier hash (uses file_id for Google Drive, filename for others)
-        primary_hash, legacy_hash = get_google_drive_unique_identifier(
-            connector, file_name, search_space_id
-        )
-
-        # Generate content hash
-        content_hash = generate_content_hash(file_in_markdown, search_space_id)
-
-        # Check if document exists (with migration support for Google Drive and content_hash fallback)
-        existing_document = await find_existing_document_with_migration(
-            session, primary_hash, legacy_hash, content_hash
-        )
-
-        if existing_document:
-            # Handle existing document (rename detection, content change check)
-            should_skip, doc = await handle_existing_document_update(
-                session,
-                existing_document,
-                content_hash,
-                connector,
-                file_name,
-                primary_hash,
-            )
-            if should_skip:
-                return doc
-            # Content changed - continue to update
-
-        # Get user's long context LLM (needed for both create and update)
-        user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
-        if not user_llm:
-            raise RuntimeError(
-                f"No long context LLM configured for user {user_id} in search_space {search_space_id}"
-            )
-
-        if enable_summary:
-            from app.services.docling_service import create_docling_service
-
-            docling_service = create_docling_service()
-
-            summary_content = await docling_service.process_large_document_summary(
-                content=file_in_markdown, llm=user_llm, document_title=file_name
-            )
-
-            document_metadata = {
-                "file_name": file_name,
-                "etl_service": "DOCLING",
-                "document_type": "File Document",
-            }
-            metadata_parts = ["# DOCUMENT METADATA"]
-            for key, value in document_metadata.items():
-                if value:
-                    formatted_key = key.replace("_", " ").title()
-                    metadata_parts.append(f"**{formatted_key}:** {value}")
-
-            metadata_section = "\n".join(metadata_parts)
-            enhanced_summary_content = (
-                f"{metadata_section}\n\n# DOCUMENT SUMMARY\n\n{summary_content}"
-            )
-        else:
-            enhanced_summary_content = f"File: {file_name}\n\n{file_in_markdown[:4000]}"
-
-        summary_embedding = embed_text(enhanced_summary_content)
-
-        # Process chunks
-        chunks = await create_document_chunks(file_in_markdown)
-
-        # Update or create document
-        if existing_document:
-            # Update existing document
-            existing_document.title = file_name
-            existing_document.content = enhanced_summary_content
-            existing_document.content_hash = content_hash
-            existing_document.embedding = summary_embedding
-            existing_document.document_metadata = {
-                "FILE_NAME": file_name,
-                "ETL_SERVICE": "DOCLING",
-            }
-            await safe_set_chunks(session, existing_document, chunks)
-            existing_document.source_markdown = file_in_markdown
-            existing_document.content_needs_reindexing = False
-            existing_document.updated_at = get_current_timestamp()
-            existing_document.status = DocumentStatus.ready()  # Mark as ready
-
-            await session.commit()
-            await session.refresh(existing_document)
-            document = existing_document
-        else:
-            # Create new document
-            # Determine document type based on connector
-            doc_type = DocumentType.FILE
-            if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
-                doc_type = DocumentType.GOOGLE_DRIVE_FILE
-
-            document = Document(
-                search_space_id=search_space_id,
-                title=file_name,
-                document_type=doc_type,
-                document_metadata={
-                    "FILE_NAME": file_name,
-                    "ETL_SERVICE": "DOCLING",
-                },
-                content=enhanced_summary_content,
-                embedding=summary_embedding,
-                chunks=chunks,
-                content_hash=content_hash,
-                unique_identifier_hash=primary_hash,
-                source_markdown=file_in_markdown,
-                content_needs_reindexing=False,
-                updated_at=get_current_timestamp(),
-                created_by_id=user_id,
-                connector_id=connector.get("connector_id") if connector else None,
-                status=DocumentStatus.ready(),  # Mark as ready
-            )
-
-            session.add(document)
-            await session.commit()
-            await session.refresh(document)
-
-        return document
-    except SQLAlchemyError as db_error:
-        await session.rollback()
-        if "ix_documents_content_hash" in str(db_error):
-            logging.warning(
-                "content_hash collision during commit for %s (Docling). Skipping.",
-                file_name,
-            )
-            return None
-        raise db_error
-    except Exception as e:
-        await session.rollback()
-        raise RuntimeError(
-            f"Failed to process file document using Docling: {e!s}"
-        ) from e
-
-
-async def _update_document_from_connector(
-    document: Document | None, connector: dict | None, session: AsyncSession
+async def _notify(
+    ctx: _ProcessingContext,
+    stage: str,
+    stage_message: str | None = None,
+    **kwargs,
 ) -> None:
-    """Helper to update document type, metadata, and connector_id from connector info."""
-    if document and connector:
-        if "type" in connector:
-            document.document_type = connector["type"]
-        if "metadata" in connector:
-            # Merge with existing document_metadata (the actual column name)
-            if not document.document_metadata:
-                document.document_metadata = connector["metadata"]
-            else:
-                # Expand existing metadata with connector metadata
-                merged = {**document.document_metadata, **connector["metadata"]}
-                document.document_metadata = merged
-        # Set connector_id if provided for de-indexing support
-        if "connector_id" in connector:
-            document.connector_id = connector["connector_id"]
-        await session.commit()
+    """Send a processing-progress notification; no-op when none is attached."""
+    if not ctx.notification:
+        return
+    await NotificationService.document_processing.notify_processing_progress(
+        ctx.session,
+        ctx.notification,
+        stage=stage,
+        stage_message=stage_message,
+        **kwargs,  # extra keyword arguments are forwarded unchanged
+    )
+
+
+# ---------------------------------------------------------------------------
+# Page-limit helpers
+# ---------------------------------------------------------------------------
+
+
+def _estimate_pages_safe(page_limit_service, file_path: str) -> int:
+    """Estimate pages; on any error fall back to file size / 80 KiB (min 1)."""
+    try:
+        return page_limit_service.estimate_pages_before_processing(file_path)
+    except Exception:  # any estimator failure triggers the size-based fallback
+        file_size = os.path.getsize(file_path)
+        return max(1, file_size // (80 * 1024))
+
+
+async def _log_page_divergence(
+    task_logger: TaskLoggingService,
+    log_entry: Log,
+    filename: str,
+    estimated: int,
+    actual: int,
+    final: int,
+) -> None:
+    """Log a progress entry when actual pages exceed 1.5x the pre-estimate."""
+    if actual > estimated * 1.5:  # smaller divergences are not logged
+        await task_logger.log_task_progress(
+            log_entry,
+            f"Actual page count higher than estimate: {filename}",
+            {
+                "estimated_before": estimated,
+                "actual_pages": actual,
+                "using_count": final,
+            },
+        )
+
+
+# ===================================================================
+# Handlers for process_file_in_background (legacy / connector path)
+# ===================================================================
+
+
+async def _process_markdown_upload(ctx: _ProcessingContext) -> Document | None:
+    """Read a UTF-8 markdown/text file, remove it, then create or update a document."""
+    await _notify(ctx, "parsing", "Reading file")
+    await ctx.task_logger.log_task_progress(
+        ctx.log_entry,
+        f"Processing markdown/text file: {ctx.filename}",
+        {"file_type": "markdown", "processing_stage": "reading_file"},
+    )
+
+    with open(ctx.file_path, encoding="utf-8") as f:
+        markdown_content = f.read()
+
+    with contextlib.suppress(Exception):  # best-effort removal; errors are ignored
+        os.unlink(ctx.file_path)
+
+    await _notify(ctx, "chunking")
+    await ctx.task_logger.log_task_progress(
+        ctx.log_entry,
+        f"Creating document from markdown content: {ctx.filename}",
+        {
+            "processing_stage": "creating_document",
+            "content_length": len(markdown_content),
+        },
+    )
+
+    result = await add_received_markdown_file_document(
+        ctx.session,
+        ctx.filename,
+        markdown_content,
+        ctx.search_space_id,
+        ctx.user_id,
+        ctx.connector,
+    )
+    if ctx.connector:  # called even when result is None
+        await update_document_from_connector(result, ctx.connector, ctx.session)
+
+    if result:
+        await ctx.task_logger.log_task_success(
+            ctx.log_entry,
+            f"Successfully processed markdown file: {ctx.filename}",
+            {
+                "document_id": result.id,
+                "content_hash": result.content_hash,
+                "file_type": "markdown",
+            },
+        )
+    else:  # no document returned: reported as a duplicate, still logged as success
+        await ctx.task_logger.log_task_success(
+            ctx.log_entry,
+            f"Markdown file already exists (duplicate): {ctx.filename}",
+            {"duplicate_detected": True, "file_type": "markdown"},
+        )
+    return result
+
+
+async def _process_direct_convert_upload(ctx: _ProcessingContext) -> Document | None:
+    """Convert a text-based file (csv/tsv/html) to markdown without ETL."""
+    await _notify(ctx, "parsing", "Converting file")
+    await ctx.task_logger.log_task_progress(
+        ctx.log_entry,
+        f"Direct-converting file to markdown: {ctx.filename}",
+        {"file_type": "direct_convert", "processing_stage": "converting"},
+    )
+
+    markdown_content = convert_file_directly(ctx.file_path, ctx.filename)
+
+    with contextlib.suppress(Exception):  # best-effort removal; errors are ignored
+        os.unlink(ctx.file_path)
+
+    await _notify(ctx, "chunking")
+    await ctx.task_logger.log_task_progress(
+        ctx.log_entry,
+        f"Creating document from converted content: {ctx.filename}",
+        {
+            "processing_stage": "creating_document",
+            "content_length": len(markdown_content),
+        },
+    )
+
+    result = await add_received_markdown_file_document(
+        ctx.session,
+        ctx.filename,
+        markdown_content,
+        ctx.search_space_id,
+        ctx.user_id,
+        ctx.connector,
+    )
+    if ctx.connector:  # called even when result is None
+        await update_document_from_connector(result, ctx.connector, ctx.session)
+
+    if result:
+        await ctx.task_logger.log_task_success(
+            ctx.log_entry,
+            f"Successfully direct-converted file: {ctx.filename}",
+            {
+                "document_id": result.id,
+                "content_hash": result.content_hash,
+                "file_type": "direct_convert",
+            },
+        )
+    else:  # no document returned: reported as a duplicate, still logged as success
+        await ctx.task_logger.log_task_success(
+            ctx.log_entry,
+            f"Direct-converted file already exists (duplicate): {ctx.filename}",
+            {"duplicate_detected": True, "file_type": "direct_convert"},
+        )
+    return result
+
+
+async def _process_audio_upload(ctx: _ProcessingContext) -> Document | None:
+    """Transcribe an audio file and store the transcript as a markdown document.
+
+    Returns the stored document, or None when it is reported as a duplicate.
+    """
+    await _notify(ctx, "parsing", "Transcribing audio")
+    await ctx.task_logger.log_task_progress(
+        ctx.log_entry,
+        f"Processing audio file for transcription: {ctx.filename}",
+        {"file_type": "audio", "processing_stage": "starting_transcription"},
+    )
+
+    stt_service_type = (
+        "local"
+        if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/")
+        else "external"
+    )
+
+    if stt_service_type == "local":
+        from app.services.stt_service import stt_service
+        # Any failure here, including an empty transcript, is raised as HTTP 422.
+        try:
+            stt_result = stt_service.transcribe_file(ctx.file_path)
+            transcribed_text = stt_result.get("text", "")
+            if not transcribed_text:
+                raise ValueError("Transcription returned empty text")
+        except Exception as e:
+            raise HTTPException(
+                status_code=422,
+                detail=f"Failed to transcribe audio file {ctx.filename}: {e!s}",
+            ) from e
+
+        await ctx.task_logger.log_task_progress(
+            ctx.log_entry,
+            f"Local STT transcription completed: {ctx.filename}",
+            {
+                "processing_stage": "local_transcription_complete",
+                "language": stt_result.get("language"),
+                "confidence": stt_result.get("language_probability"),
+                "duration": stt_result.get("duration"),
+            },
+        )
+    else:
+        from litellm import atranscription
+
+        with open(ctx.file_path, "rb") as audio_file:
+            transcription_kwargs: dict = {
+                "model": app_config.STT_SERVICE,
+                "file": audio_file,
+                "api_key": app_config.STT_SERVICE_API_KEY,
+            }
+            if app_config.STT_SERVICE_API_BASE:
+                transcription_kwargs["api_base"] = app_config.STT_SERVICE_API_BASE
+
+            transcription_response = await atranscription(**transcription_kwargs)
+            transcribed_text = transcription_response.get("text", "")
+            if not transcribed_text:
+                raise ValueError("Transcription returned empty text")
+    # Both STT paths share the same markdown title for the transcript.
+    transcribed_text = f"# Transcription of {ctx.filename}\n\n{transcribed_text}"
+
+    await ctx.task_logger.log_task_progress(
+        ctx.log_entry,
+        f"Transcription completed, creating document: {ctx.filename}",
+        {
+            "processing_stage": "transcription_complete",
+            "transcript_length": len(transcribed_text),
+        },
+    )
+
+    await _notify(ctx, "chunking")
+
+    with contextlib.suppress(Exception):
+        os.unlink(ctx.file_path)
+
+    result = await add_received_markdown_file_document(
+        ctx.session,
+        ctx.filename,
+        transcribed_text,
+        ctx.search_space_id,
+        ctx.user_id,
+        ctx.connector,
+    )
+    # Connector metadata only applies to a stored document, never to a None result.
+    if result:
+        if ctx.connector:
+            await update_document_from_connector(result, ctx.connector, ctx.session)
+        await ctx.task_logger.log_task_success(
+            ctx.log_entry,
+            f"Successfully transcribed and processed audio file: {ctx.filename}",
+            {
+                "document_id": result.id,
+                "content_hash": result.content_hash,
+                "file_type": "audio",
+                "transcript_length": len(transcribed_text),
+                "stt_service": stt_service_type,
+            },
+        )
+    else:
+        await ctx.task_logger.log_task_success(
+            ctx.log_entry,
+            f"Audio file transcript already exists (duplicate): {ctx.filename}",
+            {"duplicate_detected": True, "file_type": "audio"},
+        )
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Document file processing (ETL service dispatch)
+# ---------------------------------------------------------------------------
+
+
+async def _etl_unstructured(
+    ctx: _ProcessingContext,
+    page_limit_service,
+    estimated_pages: int,
+) -> Document | None:
+    """Parse with Unstructured, store the document, and record page usage."""
+    await _notify(ctx, "parsing", "Extracting content")
+    await ctx.task_logger.log_task_progress(
+        ctx.log_entry,
+        f"Processing file with Unstructured ETL: {ctx.filename}",
+        {
+            "file_type": "document",
+            "etl_service": "UNSTRUCTURED",
+            "processing_stage": "loading",
+        },
+    )
+
+    docs = await parse_with_unstructured(ctx.file_path)
+
+    await _notify(ctx, "chunking", chunks_count=len(docs))
+    await ctx.task_logger.log_task_progress(
+        ctx.log_entry,
+        f"Unstructured ETL completed, creating document: {ctx.filename}",
+        {"processing_stage": "etl_complete", "elements_count": len(docs)},
+    )
+    # Charge the larger of the pre-parse estimate and the post-parse estimate.
+    actual_pages = page_limit_service.estimate_pages_from_elements(docs)
+    final_pages = max(estimated_pages, actual_pages)
+    await _log_page_divergence(
+        ctx.task_logger,
+        ctx.log_entry,
+        ctx.filename,
+        estimated_pages,
+        actual_pages,
+        final_pages,
+    )
+
+    with contextlib.suppress(Exception):
+        os.unlink(ctx.file_path)
+
+    result = await add_received_file_document_using_unstructured(
+        ctx.session,
+        ctx.filename,
+        docs,
+        ctx.search_space_id,
+        ctx.user_id,
+        ctx.connector,
+        enable_summary=ctx.enable_summary,
+    )
+    # Connector metadata and page usage apply only when a document was stored.
+    if result:
+        if ctx.connector:
+            await update_document_from_connector(result, ctx.connector, ctx.session)
+        await page_limit_service.update_page_usage(
+            ctx.user_id, final_pages, allow_exceed=True
+        )
+        await ctx.task_logger.log_task_success(
+            ctx.log_entry,
+            f"Successfully processed file with Unstructured: {ctx.filename}",
+            {
+                "document_id": result.id,
+                "content_hash": result.content_hash,
+                "file_type": "document",
+                "etl_service": "UNSTRUCTURED",
+                "pages_processed": final_pages,
+            },
+        )
+    else:
+        await ctx.task_logger.log_task_success(
+            ctx.log_entry,
+            f"Document already exists (duplicate): {ctx.filename}",
+            {
+                "duplicate_detected": True,
+                "file_type": "document",
+                "etl_service": "UNSTRUCTURED",
+            },
+        )
+    return result
+
+
+async def _etl_llamacloud(
+    ctx: _ProcessingContext,
+    page_limit_service,
+    estimated_pages: int,
+) -> Document | None:
+    """Run the LlamaCloud ETL path: parse the file, store results, log the outcome."""
+    await _notify(ctx, "parsing", "Extracting content")
+    await ctx.task_logger.log_task_progress(
+        ctx.log_entry,
+        f"Processing file with LlamaCloud ETL: {ctx.filename}",
+        {
+            "file_type": "document",
+            "etl_service": "LLAMACLOUD",
+            "processing_stage": "parsing",
+            "estimated_pages": estimated_pages,
+        },
+    )
+
+    parse_result = await parse_with_llamacloud_retry(
+        file_path=ctx.file_path,
+        estimated_pages=estimated_pages,
+        task_logger=ctx.task_logger,
+        log_entry=ctx.log_entry,
+    )
+
+    with contextlib.suppress(Exception):
+        os.unlink(ctx.file_path)
+
+    md_docs = await parse_result.aget_markdown_documents(split_by_page=False)
+    doc_count = len(md_docs)
+
+    await _notify(ctx, "chunking", chunks_count=doc_count)
+    await ctx.task_logger.log_task_progress(
+        ctx.log_entry,
+        f"LlamaCloud parsing completed, creating documents: {ctx.filename}",
+        {
+            "processing_stage": "parsing_complete",
+            "documents_count": doc_count,
+        },
+    )
+
+    if not md_docs:
+        await ctx.task_logger.log_task_failure(
+            ctx.log_entry,
+            f"LlamaCloud parsing returned no documents: {ctx.filename}",
+            "ETL service returned empty document list",
+            {"error_type": "EmptyDocumentList", "etl_service": "LLAMACLOUD"},
+        )
+        raise ValueError(f"LlamaCloud parsing returned no documents for {ctx.filename}")
+
+    parsed_pages = page_limit_service.estimate_pages_from_markdown(md_docs)
+    pages_to_charge = max(estimated_pages, parsed_pages)
+    await _log_page_divergence(
+        ctx.task_logger,
+        ctx.log_entry,
+        ctx.filename,
+        estimated_pages,
+        parsed_pages,
+        pages_to_charge,
+    )
+
+    last_doc: Document | None = None
+
+    for md_doc in md_docs:
+        stored = await add_received_file_document_using_llamacloud(
+            ctx.session,
+            ctx.filename,
+            llamacloud_markdown_document=md_doc.text,
+            search_space_id=ctx.search_space_id,
+            user_id=ctx.user_id,
+            connector=ctx.connector,
+            enable_summary=ctx.enable_summary,
+        )
+        if stored:
+            last_doc = stored
+
+    # Nothing was stored for any parsed result: report the upload as a duplicate.
+    if last_doc is None:
+        await ctx.task_logger.log_task_success(
+            ctx.log_entry,
+            f"Document already exists (duplicate): {ctx.filename}",
+            {
+                "duplicate_detected": True,
+                "file_type": "document",
+                "etl_service": "LLAMACLOUD",
+                "documents_count": doc_count,
+            },
+        )
+        return None
+
+    await page_limit_service.update_page_usage(
+        ctx.user_id, pages_to_charge, allow_exceed=True
+    )
+    if ctx.connector:
+        await update_document_from_connector(last_doc, ctx.connector, ctx.session)
+    await ctx.task_logger.log_task_success(
+        ctx.log_entry,
+        f"Successfully processed file with LlamaCloud: {ctx.filename}",
+        {
+            "document_id": last_doc.id,
+            "content_hash": last_doc.content_hash,
+            "file_type": "document",
+            "etl_service": "LLAMACLOUD",
+            "pages_processed": pages_to_charge,
+            "documents_count": doc_count,
+        },
+    )
+    return last_doc
+
+
+async def _etl_docling(
+    ctx: _ProcessingContext,
+    page_limit_service,
+    estimated_pages: int,
+) -> Document | None:
+    """Run the Docling ETL path: parse the file, store it, and log the outcome."""
+    await _notify(ctx, "parsing", "Extracting content")
+    await ctx.task_logger.log_task_progress(
+        ctx.log_entry,
+        f"Processing file with Docling ETL: {ctx.filename}",
+        {
+            "file_type": "document",
+            "etl_service": "DOCLING",
+            "processing_stage": "parsing",
+        },
+    )
+
+    markdown_text = await parse_with_docling(ctx.file_path, ctx.filename)
+
+    with contextlib.suppress(Exception):
+        os.unlink(ctx.file_path)
+    text_len = len(markdown_text)
+    await ctx.task_logger.log_task_progress(
+        ctx.log_entry,
+        f"Docling parsing completed, creating document: {ctx.filename}",
+        {"processing_stage": "parsing_complete", "content_length": text_len},
+    )
+
+    parsed_pages = page_limit_service.estimate_pages_from_content_length(text_len)
+    pages_to_charge = max(estimated_pages, parsed_pages)
+    await _log_page_divergence(
+        ctx.task_logger,
+        ctx.log_entry,
+        ctx.filename,
+        estimated_pages,
+        parsed_pages,
+        pages_to_charge,
+    )
+
+    await _notify(ctx, "chunking")
+
+    stored = await add_received_file_document_using_docling(
+        ctx.session,
+        ctx.filename,
+        docling_markdown_document=markdown_text,
+        search_space_id=ctx.search_space_id,
+        user_id=ctx.user_id,
+        connector=ctx.connector,
+        enable_summary=ctx.enable_summary,
+    )
+    if not stored:
+        await ctx.task_logger.log_task_success(
+            ctx.log_entry,
+            f"Document already exists (duplicate): {ctx.filename}",
+            {
+                "duplicate_detected": True,
+                "file_type": "document",
+                "etl_service": "DOCLING",
+            },
+        )
+        return stored
+
+    await page_limit_service.update_page_usage(
+        ctx.user_id, pages_to_charge, allow_exceed=True
+    )
+    if ctx.connector:
+        await update_document_from_connector(stored, ctx.connector, ctx.session)
+    await ctx.task_logger.log_task_success(
+        ctx.log_entry,
+        f"Successfully processed file with Docling: {ctx.filename}",
+        {
+            "document_id": stored.id,
+            "content_hash": stored.content_hash,
+            "file_type": "document",
+            "etl_service": "DOCLING",
+            "pages_processed": pages_to_charge,
+        },
+    )
+    return stored
+
+
+async def _process_document_upload(ctx: _ProcessingContext) -> Document | None:
+    """Check the page limit, then hand the file to the configured ETL handler."""
+    from app.services.page_limit_service import PageLimitExceededError, PageLimitService
+
+    limits = PageLimitService(ctx.session)
+    page_estimate = _estimate_pages_safe(limits, ctx.file_path)
+
+    await ctx.task_logger.log_task_progress(
+        ctx.log_entry,
+        f"Estimated {page_estimate} pages for file: {ctx.filename}",
+        {"estimated_pages": page_estimate, "file_type": "document"},
+    )
+    # Enforce the quota before any ETL handler is invoked.
+    try:
+        await limits.check_page_limit(ctx.user_id, page_estimate)
+    except PageLimitExceededError as exc:
+        await ctx.task_logger.log_task_failure(
+            ctx.log_entry,
+            f"Page limit exceeded before processing: {ctx.filename}",
+            str(exc),
+            {
+                "error_type": "PageLimitExceeded",
+                "pages_used": exc.pages_used,
+                "pages_limit": exc.pages_limit,
+                "estimated_pages": page_estimate,
+            },
+        )
+        with contextlib.suppress(Exception):
+            os.unlink(ctx.file_path)
+        raise HTTPException(status_code=403, detail=str(exc)) from exc
+
+    # ETL_SERVICE selects the handler; any other value raises RuntimeError.
+    handlers = {
+        "UNSTRUCTURED": _etl_unstructured,
+        "LLAMACLOUD": _etl_llamacloud,
+        "DOCLING": _etl_docling,
+    }
+    if (handler := handlers.get(app_config.ETL_SERVICE)) is None:
+        raise RuntimeError(f"Unknown ETL_SERVICE: {app_config.ETL_SERVICE}")
+
+    return await handler(ctx, limits, page_estimate)
+
+
+# ===================================================================
+# Public orchestrators
+# ===================================================================
 
 
 async def process_file_in_background(
@@ -910,726 +690,35 @@ async def process_file_in_background(
     session: AsyncSession,
     task_logger: TaskLoggingService,
     log_entry: Log,
-    connector: dict
-    | None = None,  # Optional: {"type": "GOOGLE_DRIVE_FILE", "metadata": {...}}
-    notification: Notification
-    | None = None,  # Optional notification for progress updates
+    connector: dict | None = None,
+    notification: Notification | None = None,
 ) -> Document | None:
+    ctx = _ProcessingContext(
+        session=session,
+        file_path=file_path,
+        filename=filename,
+        search_space_id=search_space_id,
+        user_id=user_id,
+        task_logger=task_logger,
+        log_entry=log_entry,
+        connector=connector,
+        notification=notification,
+    )
+
     try:
-        # Check if the file is a markdown or text file
-        if filename.lower().endswith((".md", ".markdown", ".txt")):
-            # Update notification: parsing stage
-            if notification:
-                await (
-                    NotificationService.document_processing.notify_processing_progress(
-                        session,
-                        notification,
-                        stage="parsing",
-                        stage_message="Reading file",
-                    )
-                )
+        category = classify_file(filename)
 
-            await task_logger.log_task_progress(
-                log_entry,
-                f"Processing markdown/text file: {filename}",
-                {"file_type": "markdown", "processing_stage": "reading_file"},
-            )
+        if category == FileCategory.MARKDOWN:
+            return await _process_markdown_upload(ctx)
+        if category == FileCategory.DIRECT_CONVERT:
+            return await _process_direct_convert_upload(ctx)
+        if category == FileCategory.AUDIO:
+            return await _process_audio_upload(ctx)
+        return await _process_document_upload(ctx)
 
-            # For markdown files, read the content directly
-            with open(file_path, encoding="utf-8") as f:
-                markdown_content = f.read()
-
-            # Clean up the temp file
-            import os
-
-            try:
-                os.unlink(file_path)
-            except Exception as e:
-                print("Error deleting temp file", e)
-                pass
-
-            # Update notification: chunking stage
-            if notification:
-                await (
-                    NotificationService.document_processing.notify_processing_progress(
-                        session, notification, stage="chunking"
-                    )
-                )
-
-            await task_logger.log_task_progress(
-                log_entry,
-                f"Creating document from markdown content: {filename}",
-                {
-                    "processing_stage": "creating_document",
-                    "content_length": len(markdown_content),
-                },
-            )
-
-            # Process markdown directly through specialized function
-            result = await add_received_markdown_file_document(
-                session, filename, markdown_content, search_space_id, user_id, connector
-            )
-
-            if connector:
-                await _update_document_from_connector(result, connector, session)
-
-            if result:
-                await task_logger.log_task_success(
-                    log_entry,
-                    f"Successfully processed markdown file: {filename}",
-                    {
-                        "document_id": result.id,
-                        "content_hash": result.content_hash,
-                        "file_type": "markdown",
-                    },
-                )
-                return result
-            else:
-                await task_logger.log_task_success(
-                    log_entry,
-                    f"Markdown file already exists (duplicate): {filename}",
-                    {"duplicate_detected": True, "file_type": "markdown"},
-                )
-                return None
-
-        # Check if the file is an audio file
-        elif filename.lower().endswith(
-            (".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm")
-        ):
-            # Update notification: parsing stage (transcription)
-            if notification:
-                await (
-                    NotificationService.document_processing.notify_processing_progress(
-                        session,
-                        notification,
-                        stage="parsing",
-                        stage_message="Transcribing audio",
-                    )
-                )
-
-            await task_logger.log_task_progress(
-                log_entry,
-                f"Processing audio file for transcription: {filename}",
-                {"file_type": "audio", "processing_stage": "starting_transcription"},
-            )
-
-            # Determine STT service type
-            stt_service_type = (
-                "local"
-                if app_config.STT_SERVICE
-                and app_config.STT_SERVICE.startswith("local/")
-                else "external"
-            )
-
-            # Check if using local STT service
-            if stt_service_type == "local":
-                # Use local Faster-Whisper for transcription
-                from app.services.stt_service import stt_service
-
-                try:
-                    result = stt_service.transcribe_file(file_path)
-                    transcribed_text = result.get("text", "")
-
-                    if not transcribed_text:
-                        raise ValueError("Transcription returned empty text")
-
-                    # Add metadata about the transcription
-                    transcribed_text = (
-                        f"# Transcription of {filename}\n\n{transcribed_text}"
-                    )
-                except Exception as e:
-                    raise HTTPException(
-                        status_code=422,
-                        detail=f"Failed to transcribe audio file {filename}: {e!s}",
-                    ) from e
-
-                await task_logger.log_task_progress(
-                    log_entry,
-                    f"Local STT transcription completed: {filename}",
-                    {
-                        "processing_stage": "local_transcription_complete",
-                        "language": result.get("language"),
-                        "confidence": result.get("language_probability"),
-                        "duration": result.get("duration"),
-                    },
-                )
-            else:
-                # Use LiteLLM for audio transcription
-                with open(file_path, "rb") as audio_file:
-                    transcription_kwargs = {
-                        "model": app_config.STT_SERVICE,
-                        "file": audio_file,
-                        "api_key": app_config.STT_SERVICE_API_KEY,
-                    }
-                    if app_config.STT_SERVICE_API_BASE:
-                        transcription_kwargs["api_base"] = (
-                            app_config.STT_SERVICE_API_BASE
-                        )
-
-                    transcription_response = await atranscription(
-                        **transcription_kwargs
-                    )
-
-                    # Extract the transcribed text
-                    transcribed_text = transcription_response.get("text", "")
-
-                    if not transcribed_text:
-                        raise ValueError("Transcription returned empty text")
-
-                # Add metadata about the transcription
-                transcribed_text = (
-                    f"# Transcription of {filename}\n\n{transcribed_text}"
-                )
-
-            await task_logger.log_task_progress(
-                log_entry,
-                f"Transcription completed, creating document: {filename}",
-                {
-                    "processing_stage": "transcription_complete",
-                    "transcript_length": len(transcribed_text),
-                },
-            )
-
-            # Update notification: chunking stage
-            if notification:
-                await (
-                    NotificationService.document_processing.notify_processing_progress(
-                        session, notification, stage="chunking"
-                    )
-                )
-
-            # Clean up the temp file
-            try:
-                os.unlink(file_path)
-            except Exception as e:
-                print("Error deleting temp file", e)
-                pass
-
-            # Process transcription as markdown document
-            result = await add_received_markdown_file_document(
-                session, filename, transcribed_text, search_space_id, user_id, connector
-            )
-
-            if connector:
-                await _update_document_from_connector(result, connector, session)
-
-            if result:
-                await task_logger.log_task_success(
-                    log_entry,
-                    f"Successfully transcribed and processed audio file: {filename}",
-                    {
-                        "document_id": result.id,
-                        "content_hash": result.content_hash,
-                        "file_type": "audio",
-                        "transcript_length": len(transcribed_text),
-                        "stt_service": stt_service_type,
-                    },
-                )
-                return result
-            else:
-                await task_logger.log_task_success(
-                    log_entry,
-                    f"Audio file transcript already exists (duplicate): {filename}",
-                    {"duplicate_detected": True, "file_type": "audio"},
-                )
-                return None
-
-        else:
-            # Import page limit service
-            from app.services.page_limit_service import (
-                PageLimitExceededError,
-                PageLimitService,
-            )
-
-            # Initialize page limit service
-            page_limit_service = PageLimitService(session)
-
-            # CRITICAL: Estimate page count BEFORE making expensive ETL API calls
-            # This prevents users from incurring costs on files that would exceed their limit
-            try:
-                estimated_pages_before = (
-                    page_limit_service.estimate_pages_before_processing(file_path)
-                )
-            except Exception:
-                # If estimation fails, use a conservative estimate based on file size
-                import os
-
-                file_size = os.path.getsize(file_path)
-                estimated_pages_before = max(
-                    1, file_size // (80 * 1024)
-                )  # ~80KB per page
-
-            await task_logger.log_task_progress(
-                log_entry,
-                f"Estimated {estimated_pages_before} pages for file: {filename}",
-                {
-                    "estimated_pages": estimated_pages_before,
-                    "file_type": "document",
-                },
-            )
-
-            # Check page limit BEFORE calling ETL service to avoid unnecessary costs
-            try:
-                await page_limit_service.check_page_limit(
-                    user_id, estimated_pages_before
-                )
-            except PageLimitExceededError as e:
-                await task_logger.log_task_failure(
-                    log_entry,
-                    f"Page limit exceeded before processing: {filename}",
-                    str(e),
-                    {
-                        "error_type": "PageLimitExceeded",
-                        "pages_used": e.pages_used,
-                        "pages_limit": e.pages_limit,
-                        "estimated_pages": estimated_pages_before,
-                    },
-                )
-                # Clean up the temp file
-                import os
-
-                with contextlib.suppress(Exception):
-                    os.unlink(file_path)
-
-                raise HTTPException(
-                    status_code=403,
-                    detail=str(e),
-                ) from e
-
-            if app_config.ETL_SERVICE == "UNSTRUCTURED":
-                # Update notification: parsing stage
-                if notification:
-                    await NotificationService.document_processing.notify_processing_progress(
-                        session,
-                        notification,
-                        stage="parsing",
-                        stage_message="Extracting content",
-                    )
-
-                await task_logger.log_task_progress(
-                    log_entry,
-                    f"Processing file with Unstructured ETL: {filename}",
-                    {
-                        "file_type": "document",
-                        "etl_service": "UNSTRUCTURED",
-                        "processing_stage": "loading",
-                    },
-                )
-
-                from langchain_unstructured import UnstructuredLoader
-
-                # Process the file
-                loader = UnstructuredLoader(
-                    file_path,
-                    mode="elements",
-                    post_processors=[],
-                    languages=["eng"],
-                    include_orig_elements=False,
-                    include_metadata=False,
-                    strategy="auto",
-                )
-
-                docs = await loader.aload()
-
-                # Update notification: chunking stage
-                if notification:
-                    await NotificationService.document_processing.notify_processing_progress(
-                        session, notification, stage="chunking", chunks_count=len(docs)
-                    )
-
-                await task_logger.log_task_progress(
-                    log_entry,
-                    f"Unstructured ETL completed, creating document: {filename}",
-                    {"processing_stage": "etl_complete", "elements_count": len(docs)},
-                )
-
-                # Verify actual page count from parsed documents
-                actual_pages = page_limit_service.estimate_pages_from_elements(docs)
-
-                # Use the higher of the two estimates for safety (in case pre-estimate was too low)
-                final_page_count = max(estimated_pages_before, actual_pages)
-
-                # If actual is significantly higher than estimate, log a warning
-                if actual_pages > estimated_pages_before * 1.5:
-                    await task_logger.log_task_progress(
-                        log_entry,
-                        f"Actual page count higher than estimate: {filename}",
-                        {
-                            "estimated_before": estimated_pages_before,
-                            "actual_pages": actual_pages,
-                            "using_count": final_page_count,
-                        },
-                    )
-
-                # Clean up the temp file
-                import os
-
-                try:
-                    os.unlink(file_path)
-                except Exception as e:
-                    print("Error deleting temp file", e)
-                    pass
-
-                enable_summary = (
-                    connector.get("enable_summary", True) if connector else True
-                )
-                result = await add_received_file_document_using_unstructured(
-                    session,
-                    filename,
-                    docs,
-                    search_space_id,
-                    user_id,
-                    connector,
-                    enable_summary=enable_summary,
-                )
-
-                if connector:
-                    await _update_document_from_connector(result, connector, session)
-
-                if result:
-                    # Update page usage after successful processing
-                    # allow_exceed=True because document was already created after passing initial check
-                    await page_limit_service.update_page_usage(
-                        user_id, final_page_count, allow_exceed=True
-                    )
-
-                    await task_logger.log_task_success(
-                        log_entry,
-                        f"Successfully processed file with Unstructured: {filename}",
-                        {
-                            "document_id": result.id,
-                            "content_hash": result.content_hash,
-                            "file_type": "document",
-                            "etl_service": "UNSTRUCTURED",
-                            "pages_processed": final_page_count,
-                        },
-                    )
-                    return result
-                else:
-                    await task_logger.log_task_success(
-                        log_entry,
-                        f"Document already exists (duplicate): {filename}",
-                        {
-                            "duplicate_detected": True,
-                            "file_type": "document",
-                            "etl_service": "UNSTRUCTURED",
-                        },
-                    )
-                    return None
-
-            elif app_config.ETL_SERVICE == "LLAMACLOUD":
-                # Update notification: parsing stage
-                if notification:
-                    await NotificationService.document_processing.notify_processing_progress(
-                        session,
-                        notification,
-                        stage="parsing",
-                        stage_message="Extracting content",
-                    )
-
-                await task_logger.log_task_progress(
-                    log_entry,
-                    f"Processing file with LlamaCloud ETL: {filename}",
-                    {
-                        "file_type": "document",
-                        "etl_service": "LLAMACLOUD",
-                        "processing_stage": "parsing",
-                        "estimated_pages": estimated_pages_before,
-                    },
-                )
-
-                # Parse file with retry logic for SSL/connection errors (common with large files)
-                result = await parse_with_llamacloud_retry(
-                    file_path=file_path,
-                    estimated_pages=estimated_pages_before,
-                    task_logger=task_logger,
-                    log_entry=log_entry,
-                )
-
-                # Clean up the temp file
-                import os
-
-                try:
-                    os.unlink(file_path)
-                except Exception as e:
-                    print("Error deleting temp file", e)
-                    pass
-
-                # Get markdown documents from the result
-                markdown_documents = await result.aget_markdown_documents(
-                    split_by_page=False
-                )
-
-                # Update notification: chunking stage
-                if notification:
-                    await NotificationService.document_processing.notify_processing_progress(
-                        session,
-                        notification,
-                        stage="chunking",
-                        chunks_count=len(markdown_documents),
-                    )
-
-                await task_logger.log_task_progress(
-                    log_entry,
-                    f"LlamaCloud parsing completed, creating documents: {filename}",
-                    {
-                        "processing_stage": "parsing_complete",
-                        "documents_count": len(markdown_documents),
-                    },
-                )
-
-                # Check if LlamaCloud returned any documents
-                if not markdown_documents or len(markdown_documents) == 0:
-                    await task_logger.log_task_failure(
-                        log_entry,
-                        f"LlamaCloud parsing returned no documents: {filename}",
-                        "ETL service returned empty document list",
-                        {
-                            "error_type": "EmptyDocumentList",
-                            "etl_service": "LLAMACLOUD",
-                        },
-                    )
-                    raise ValueError(
-                        f"LlamaCloud parsing returned no documents for {filename}"
-                    )
-
-                # Verify actual page count from parsed markdown documents
-                actual_pages = page_limit_service.estimate_pages_from_markdown(
-                    markdown_documents
-                )
-
-                # Use the higher of the two estimates for safety (in case pre-estimate was too low)
-                final_page_count = max(estimated_pages_before, actual_pages)
-
-                # If actual is significantly higher than estimate, log a warning
-                if actual_pages > estimated_pages_before * 1.5:
-                    await task_logger.log_task_progress(
-                        log_entry,
-                        f"Actual page count higher than estimate: {filename}",
-                        {
-                            "estimated_before": estimated_pages_before,
-                            "actual_pages": actual_pages,
-                            "using_count": final_page_count,
-                        },
-                    )
-
-                # Track if any document was successfully created (not a duplicate)
-                any_doc_created = False
-                last_created_doc = None
-
-                for doc in markdown_documents:
-                    # Extract text content from the markdown documents
-                    markdown_content = doc.text
-
-                    enable_summary = (
-                        connector.get("enable_summary", True) if connector else True
-                    )
-                    doc_result = await add_received_file_document_using_llamacloud(
-                        session,
-                        filename,
-                        llamacloud_markdown_document=markdown_content,
-                        search_space_id=search_space_id,
-                        user_id=user_id,
-                        connector=connector,
-                        enable_summary=enable_summary,
-                    )
-
-                    # Track if this document was successfully created
-                    if doc_result:
-                        any_doc_created = True
-                        last_created_doc = doc_result
-
-                # Update page usage once after processing all documents
-                # Only update if at least one document was created (not all duplicates)
-                if any_doc_created:
-                    # Update page usage after successful processing
-                    # allow_exceed=True because document was already created after passing initial check
-                    await page_limit_service.update_page_usage(
-                        user_id, final_page_count, allow_exceed=True
-                    )
-
-                    if connector:
-                        await _update_document_from_connector(
-                            last_created_doc, connector, session
-                        )
-
-                    await task_logger.log_task_success(
-                        log_entry,
-                        f"Successfully processed file with LlamaCloud: {filename}",
-                        {
-                            "document_id": last_created_doc.id,
-                            "content_hash": last_created_doc.content_hash,
-                            "file_type": "document",
-                            "etl_service": "LLAMACLOUD",
-                            "pages_processed": final_page_count,
-                            "documents_count": len(markdown_documents),
-                        },
-                    )
-                    return last_created_doc
-                else:
-                    # All documents were duplicates (markdown_documents was not empty, but all returned None)
-                    await task_logger.log_task_success(
-                        log_entry,
-                        f"Document already exists (duplicate): {filename}",
-                        {
-                            "duplicate_detected": True,
-                            "file_type": "document",
-                            "etl_service": "LLAMACLOUD",
-                            "documents_count": len(markdown_documents),
-                        },
-                    )
-                    return None
-
-            elif app_config.ETL_SERVICE == "DOCLING":
-                # Update notification: parsing stage
-                if notification:
-                    await NotificationService.document_processing.notify_processing_progress(
-                        session,
-                        notification,
-                        stage="parsing",
-                        stage_message="Extracting content",
-                    )
-
-                await task_logger.log_task_progress(
-                    log_entry,
-                    f"Processing file with Docling ETL: {filename}",
-                    {
-                        "file_type": "document",
-                        "etl_service": "DOCLING",
-                        "processing_stage": "parsing",
-                    },
-                )
-
-                # Use Docling service for document processing
-                from app.services.docling_service import create_docling_service
-
-                # Create Docling service
-                docling_service = create_docling_service()
-
-                # Suppress pdfminer warnings that can cause processing to hang
-                # These warnings are harmless but can spam logs and potentially halt processing
-                # Suppress both Python warnings and logging warnings from pdfminer
-                pdfminer_logger = getLogger("pdfminer")
-                original_level = pdfminer_logger.level
-
-                with warnings.catch_warnings():
-                    warnings.filterwarnings(
-                        "ignore", category=UserWarning, module="pdfminer"
-                    )
-                    warnings.filterwarnings(
-                        "ignore",
-                        message=".*Cannot set gray non-stroke color.*",
-                    )
-                    warnings.filterwarnings("ignore", message=".*invalid float value.*")
-
-                    # Temporarily suppress pdfminer logging warnings
-                    pdfminer_logger.setLevel(ERROR)
-
-                    try:
-                        # Process the document
-                        result = await docling_service.process_document(
-                            file_path, filename
-                        )
-                    finally:
-                        # Restore original logging level
-                        pdfminer_logger.setLevel(original_level)
-
-                # Clean up the temp file
-                import os
-
-                try:
-                    os.unlink(file_path)
-                except Exception as e:
-                    print("Error deleting temp file", e)
-                    pass
-
-                await task_logger.log_task_progress(
-                    log_entry,
-                    f"Docling parsing completed, creating document: {filename}",
-                    {
-                        "processing_stage": "parsing_complete",
-                        "content_length": len(result["content"]),
-                    },
-                )
-
-                # Verify actual page count from content length
-                actual_pages = page_limit_service.estimate_pages_from_content_length(
-                    len(result["content"])
-                )
-
-                # Use the higher of the two estimates for safety (in case pre-estimate was too low)
-                final_page_count = max(estimated_pages_before, actual_pages)
-
-                # If actual is significantly higher than estimate, log a warning
-                if actual_pages > estimated_pages_before * 1.5:
-                    await task_logger.log_task_progress(
-                        log_entry,
-                        f"Actual page count higher than estimate: {filename}",
-                        {
-                            "estimated_before": estimated_pages_before,
-                            "actual_pages": actual_pages,
-                            "using_count": final_page_count,
-                        },
-                    )
-
-                # Update notification: chunking stage
-                if notification:
-                    await NotificationService.document_processing.notify_processing_progress(
-                        session, notification, stage="chunking"
-                    )
-
-                enable_summary = (
-                    connector.get("enable_summary", True) if connector else True
-                )
-                doc_result = await add_received_file_document_using_docling(
-                    session,
-                    filename,
-                    docling_markdown_document=result["content"],
-                    search_space_id=search_space_id,
-                    user_id=user_id,
-                    connector=connector,
-                    enable_summary=enable_summary,
-                )
-
-                if doc_result:
-                    # Update page usage after successful processing
-                    # allow_exceed=True because document was already created after passing initial check
-                    await page_limit_service.update_page_usage(
-                        user_id, final_page_count, allow_exceed=True
-                    )
-
-                    if connector:
-                        await _update_document_from_connector(
-                            doc_result, connector, session
-                        )
-
-                    await task_logger.log_task_success(
-                        log_entry,
-                        f"Successfully processed file with Docling: {filename}",
-                        {
-                            "document_id": doc_result.id,
-                            "content_hash": doc_result.content_hash,
-                            "file_type": "document",
-                            "etl_service": "DOCLING",
-                            "pages_processed": final_page_count,
-                        },
-                    )
-                    return doc_result
-                else:
-                    await task_logger.log_task_success(
-                        log_entry,
-                        f"Document already exists (duplicate): {filename}",
-                        {
-                            "duplicate_detected": True,
-                            "file_type": "document",
-                            "etl_service": "DOCLING",
-                        },
-                    )
-                    return None
     except Exception as e:
         await session.rollback()
 
-        # For page limit errors, use the detailed message from the exception
         from app.services.page_limit_service import PageLimitExceededError
 
         if isinstance(e, PageLimitExceededError):
@@ -1645,10 +734,225 @@ async def process_file_in_background(
             str(e),
             {"error_type": type(e).__name__, "filename": filename},
         )
-        import logging
-
         logging.error(f"Error processing file in background: {error_message}")
-        raise  # Re-raise so the wrapper can also handle it
+        raise
+
+
+# ===================================================================
+# 2-phase handler (process_file_in_background_with_document)
+# ===================================================================
+
+
+async def _extract_file_content(
+    file_path: str,
+    filename: str,
+    session: AsyncSession,
+    user_id: str,
+    task_logger: TaskLoggingService,
+    log_entry: Log,
+    notification: Notification | None,
+) -> tuple[str, str]:
+    """
+    Convert an uploaded file into markdown, dispatching on its category.
+
+    ``classify_file(filename)`` selects the route: markdown/text files are read
+    as UTF-8, direct-convert files go through ``convert_file_directly``, audio
+    goes through ``_transcribe_audio``, and every other category is handed to
+    ``_extract_document_content``.
+
+    For the first three routes a "parsing" progress notification is sent (only
+    when ``notification`` is set), a progress entry is logged via
+    ``task_logger``, and after a successful extraction the file at ``file_path``
+    is removed on a best-effort basis (errors from the delete are suppressed).
+
+    Returns:
+        Tuple of (markdown_content, etl_service_name).
+    """
+    category = classify_file(filename)
+
+    # Pick the progress texts and the service label for this category.
+    if category == FileCategory.MARKDOWN:
+        stage_message, service_label = "Reading file", "MARKDOWN"
+        log_message = f"Processing markdown/text file: {filename}"
+        log_metadata = {
+            "file_type": "markdown",
+            "processing_stage": "reading_file",
+        }
+    elif category == FileCategory.DIRECT_CONVERT:
+        stage_message, service_label = "Converting file", "DIRECT_CONVERT"
+        log_message = f"Direct-converting file to markdown: {filename}"
+        log_metadata = {
+            "file_type": "direct_convert",
+            "processing_stage": "converting",
+        }
+    elif category == FileCategory.AUDIO:
+        stage_message, service_label = "Transcribing audio", "AUDIO_TRANSCRIPTION"
+        log_message = f"Processing audio file for transcription: {filename}"
+        log_metadata = {
+            "file_type": "audio",
+            "processing_stage": "starting_transcription",
+        }
+    else:
+        # Document file — use ETL service
+        return await _extract_document_content(
+            file_path,
+            filename,
+            session,
+            user_id,
+            task_logger,
+            log_entry,
+            notification,
+        )
+
+    if notification:
+        await NotificationService.document_processing.notify_processing_progress(
+            session,
+            notification,
+            stage="parsing",
+            stage_message=stage_message,
+        )
+    await task_logger.log_task_progress(log_entry, log_message, log_metadata)
+
+    if category == FileCategory.MARKDOWN:
+        with open(file_path, encoding="utf-8") as handle:
+            extracted = handle.read()
+    elif category == FileCategory.DIRECT_CONVERT:
+        extracted = convert_file_directly(file_path, filename)
+    else:
+        extracted = await _transcribe_audio(file_path, filename)
+
+    # Best-effort cleanup: a failed delete must not fail the extraction.
+    with contextlib.suppress(Exception):
+        os.unlink(file_path)
+    return extracted, service_label
+
+
+async def _transcribe_audio(file_path: str, filename: str) -> str:
+    """
+    Transcribe the audio at ``file_path`` and wrap it in a markdown heading.
+
+    ``STT_SERVICE`` values starting with ``local/`` use the in-process
+    ``stt_service``; anything else goes through litellm's ``atranscription``.
+
+    Raises:
+        ValueError: If the transcription result has no text.
+    """
+    configured = app_config.STT_SERVICE
+    if configured and configured.startswith("local/"):
+        from app.services.stt_service import stt_service
+
+        outcome = stt_service.transcribe_file(file_path)
+    else:
+        from litellm import atranscription
+
+        with open(file_path, "rb") as audio_file:
+            request: dict = {
+                "model": app_config.STT_SERVICE,
+                "file": audio_file,
+                "api_key": app_config.STT_SERVICE_API_KEY,
+            }
+            if app_config.STT_SERVICE_API_BASE:
+                request["api_base"] = app_config.STT_SERVICE_API_BASE
+            outcome = await atranscription(**request)
+
+    text = outcome.get("text", "")
+    if not text:
+        raise ValueError("Transcription returned empty text")
+    return f"# Transcription of {filename}\n\n{text}"
+
+
+async def _extract_document_content(
+    file_path: str,
+    filename: str,
+    session: AsyncSession,
+    user_id: str,
+    task_logger: TaskLoggingService,
+    log_entry: Log,
+    notification: Notification | None,
+) -> tuple[str, str]:
+    """
+    Run the configured ETL service over a document and return its markdown.
+
+    The page count is estimated first (falling back to one page per 80 KiB of
+    file size when the estimator raises) and passed to ``check_page_limit``
+    before parsing starts. After the selected service has produced its output,
+    page usage is recorded and ``file_path`` is removed on a best-effort basis.
+
+    Returns:
+        Tuple of (markdown_content, etl_service_name).
+
+    Raises:
+        RuntimeError: For an unhandled ``ETL_SERVICE`` value, an empty
+            LlamaCloud result, or empty extracted markdown.
+    """
+    from app.services.page_limit_service import PageLimitService
+
+    limiter = PageLimitService(session)
+
+    try:
+        estimated_pages = limiter.estimate_pages_before_processing(file_path)
+    except Exception:
+        # Estimator failed: assume one page per 80 KiB, but never fewer than 1.
+        estimated_pages = max(1, os.path.getsize(file_path) // (80 * 1024))
+
+    await limiter.check_page_limit(user_id, estimated_pages)
+
+    etl_service = app_config.ETL_SERVICE
+
+    if notification:
+        await NotificationService.document_processing.notify_processing_progress(
+            session,
+            notification,
+            stage="parsing",
+            stage_message="Extracting content",
+        )
+
+    pages_to_record = estimated_pages  # refined only by the UNSTRUCTURED branch
+
+    if etl_service == "UNSTRUCTURED":
+        from app.utils.document_converters import convert_document_to_markdown
+
+        elements = await parse_with_unstructured(file_path)
+        markdown_content = await convert_document_to_markdown(elements)
+        actual_pages = limiter.estimate_pages_from_elements(elements)
+        pages_to_record = max(estimated_pages, actual_pages)
+    elif etl_service == "LLAMACLOUD":
+        parsed = await parse_with_llamacloud_retry(
+            file_path=file_path,
+            estimated_pages=estimated_pages,
+            task_logger=task_logger,
+            log_entry=log_entry,
+        )
+        md_docs = await parsed.aget_markdown_documents(split_by_page=False)
+        if not md_docs:
+            raise RuntimeError(f"LlamaCloud parsing returned no documents: {filename}")
+        # Only the first returned document is used.
+        markdown_content = md_docs[0].text
+    elif etl_service == "DOCLING":
+        # Quieten Docling's loggers before the converter module is imported.
+        for noisy_logger in (
+            "docling.pipeline.base_pipeline",
+            "docling.document_converter",
+            "docling_core.transforms.chunker.hierarchical_chunker",
+        ):
+            getLogger(noisy_logger).setLevel(ERROR)
+
+        from docling.document_converter import DocumentConverter
+
+        conversion = DocumentConverter().convert(file_path)
+        markdown_content = conversion.document.export_to_markdown()
+    else:
+        raise RuntimeError(f"Unknown ETL_SERVICE: {etl_service}")
+
+    await limiter.update_page_usage(user_id, pages_to_record, allow_exceed=True)
+
+    with contextlib.suppress(Exception):
+        os.unlink(file_path)
+
+    if not markdown_content:
+        raise RuntimeError(f"Failed to extract content from file: {filename}")
+
+    return markdown_content, etl_service
 
 
 async def process_file_in_background_with_document(
@@ -1667,272 +971,50 @@ async def process_file_in_background_with_document(
     """
     Process file and update existing pending document (2-phase pattern).
 
-    This function is Phase 2 of the real-time document status updates:
-    - Phase 1 (API): Created document with pending status
-    - Phase 2 (this): Process file and update document to ready/failed
-
-    The document already exists with pending status. This function:
-    1. Parses the file content (markdown, audio, or ETL services)
-    2. Updates the document with content, embeddings, and chunks
-    3. Sets status to 'ready' on success
-
-    Args:
-        document: Existing document with pending status
-        file_path: Path to the uploaded file
-        filename: Original filename
-        search_space_id: ID of the search space
-        user_id: ID of the user
-        session: Database session
-        task_logger: Task logging service
-        log_entry: Log entry for this task
-        connector: Optional connector info for Google Drive files
-        notification: Optional notification for progress updates
-
-    Returns:
-        Updated Document object if successful, None if duplicate content detected
+    Phase 1 (API layer): Created document with pending status.
+    Phase 2 (this function): Process file and update document to ready/failed.
     """
-    import os
-
-    from app.config import config as app_config
+    from app.indexing_pipeline.adapters.file_upload_adapter import (
+        UploadDocumentAdapter,
+    )
     from app.services.llm_service import get_user_long_context_llm
+    from app.utils.document_converters import generate_content_hash
+
+    from .base import check_duplicate_document
 
     doc_id = document.id
 
     try:
-        markdown_content = None
-        etl_service = None
-
-        # ===== STEP 1: Parse file content based on type =====
-
-        # Check if the file is a markdown or text file
-        if filename.lower().endswith((".md", ".markdown", ".txt")):
-            # Update notification: parsing stage
-            if notification:
-                await (
-                    NotificationService.document_processing.notify_processing_progress(
-                        session,
-                        notification,
-                        stage="parsing",
-                        stage_message="Reading file",
-                    )
-                )
-
-            await task_logger.log_task_progress(
-                log_entry,
-                f"Processing markdown/text file: {filename}",
-                {"file_type": "markdown", "processing_stage": "reading_file"},
-            )
-
-            # Read markdown content directly
-            with open(file_path, encoding="utf-8") as f:
-                markdown_content = f.read()
-            etl_service = "MARKDOWN"
-
-            # Clean up temp file
-            with contextlib.suppress(Exception):
-                os.unlink(file_path)
-
-        # Check if the file is an audio file
-        elif filename.lower().endswith(
-            (".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm")
-        ):
-            # Update notification: parsing stage (transcription)
-            if notification:
-                await (
-                    NotificationService.document_processing.notify_processing_progress(
-                        session,
-                        notification,
-                        stage="parsing",
-                        stage_message="Transcribing audio",
-                    )
-                )
-
-            await task_logger.log_task_progress(
-                log_entry,
-                f"Processing audio file for transcription: {filename}",
-                {"file_type": "audio", "processing_stage": "starting_transcription"},
-            )
-
-            # Transcribe audio
-            stt_service_type = (
-                "local"
-                if app_config.STT_SERVICE
-                and app_config.STT_SERVICE.startswith("local/")
-                else "external"
-            )
-
-            if stt_service_type == "local":
-                from app.services.stt_service import stt_service
-
-                result = stt_service.transcribe_file(file_path)
-                transcribed_text = result.get("text", "")
-                if not transcribed_text:
-                    raise ValueError("Transcription returned empty text")
-                markdown_content = (
-                    f"# Transcription of {filename}\n\n{transcribed_text}"
-                )
-            else:
-                with open(file_path, "rb") as audio_file:
-                    transcription_kwargs = {
-                        "model": app_config.STT_SERVICE,
-                        "file": audio_file,
-                        "api_key": app_config.STT_SERVICE_API_KEY,
-                    }
-                    if app_config.STT_SERVICE_API_BASE:
-                        transcription_kwargs["api_base"] = (
-                            app_config.STT_SERVICE_API_BASE
-                        )
-                    transcription_response = await atranscription(
-                        **transcription_kwargs
-                    )
-                    transcribed_text = transcription_response.get("text", "")
-                    if not transcribed_text:
-                        raise ValueError("Transcription returned empty text")
-                markdown_content = (
-                    f"# Transcription of {filename}\n\n{transcribed_text}"
-                )
-
-            etl_service = "AUDIO_TRANSCRIPTION"
-            # Clean up temp file
-            with contextlib.suppress(Exception):
-                os.unlink(file_path)
-
-        else:
-            # Document files - use ETL service
-            from app.services.page_limit_service import (
-                PageLimitExceededError,
-                PageLimitService,
-            )
-
-            page_limit_service = PageLimitService(session)
-
-            # Estimate page count
-            try:
-                estimated_pages = page_limit_service.estimate_pages_before_processing(
-                    file_path
-                )
-            except Exception:
-                file_size = os.path.getsize(file_path)
-                estimated_pages = max(1, file_size // (80 * 1024))
-
-            # Check page limit
-            await page_limit_service.check_page_limit(user_id, estimated_pages)
-
-            if app_config.ETL_SERVICE == "UNSTRUCTURED":
-                if notification:
-                    await NotificationService.document_processing.notify_processing_progress(
-                        session,
-                        notification,
-                        stage="parsing",
-                        stage_message="Extracting content",
-                    )
-
-                from langchain_unstructured import UnstructuredLoader
-
-                loader = UnstructuredLoader(
-                    file_path,
-                    mode="elements",
-                    post_processors=[],
-                    languages=["eng"],
-                    include_orig_elements=False,
-                    include_metadata=False,
-                    strategy="auto",
-                )
-                docs = await loader.aload()
-                markdown_content = await convert_document_to_markdown(docs)
-                actual_pages = page_limit_service.estimate_pages_from_elements(docs)
-                final_page_count = max(estimated_pages, actual_pages)
-                etl_service = "UNSTRUCTURED"
-
-                # Update page usage
-                await page_limit_service.update_page_usage(
-                    user_id, final_page_count, allow_exceed=True
-                )
-
-            elif app_config.ETL_SERVICE == "LLAMACLOUD":
-                if notification:
-                    await NotificationService.document_processing.notify_processing_progress(
-                        session,
-                        notification,
-                        stage="parsing",
-                        stage_message="Extracting content",
-                    )
-
-                result = await parse_with_llamacloud_retry(
-                    file_path=file_path,
-                    estimated_pages=estimated_pages,
-                    task_logger=task_logger,
-                    log_entry=log_entry,
-                )
-                markdown_documents = await result.aget_markdown_documents(
-                    split_by_page=False
-                )
-                if not markdown_documents:
-                    raise RuntimeError(
-                        f"LlamaCloud parsing returned no documents: {filename}"
-                    )
-                markdown_content = markdown_documents[0].text
-                etl_service = "LLAMACLOUD"
-
-                # Update page usage
-                await page_limit_service.update_page_usage(
-                    user_id, estimated_pages, allow_exceed=True
-                )
-
-            elif app_config.ETL_SERVICE == "DOCLING":
-                if notification:
-                    await NotificationService.document_processing.notify_processing_progress(
-                        session,
-                        notification,
-                        stage="parsing",
-                        stage_message="Extracting content",
-                    )
-
-                # Suppress logging during Docling import
-                getLogger("docling.pipeline.base_pipeline").setLevel(ERROR)
-                getLogger("docling.document_converter").setLevel(ERROR)
-                getLogger(
-                    "docling_core.transforms.chunker.hierarchical_chunker"
-                ).setLevel(ERROR)
-
-                from docling.document_converter import DocumentConverter
-
-                converter = DocumentConverter()
-                result = converter.convert(file_path)
-                markdown_content = result.document.export_to_markdown()
-                etl_service = "DOCLING"
-
-                # Update page usage
-                await page_limit_service.update_page_usage(
-                    user_id, estimated_pages, allow_exceed=True
-                )
-
-            else:
-                raise RuntimeError(f"Unknown ETL_SERVICE: {app_config.ETL_SERVICE}")
-
-            # Clean up temp file
-            with contextlib.suppress(Exception):
-                os.unlink(file_path)
+        # Step 1: extract content
+        markdown_content, etl_service = await _extract_file_content(
+            file_path,
+            filename,
+            session,
+            user_id,
+            task_logger,
+            log_entry,
+            notification,
+        )
 
         if not markdown_content:
             raise RuntimeError(f"Failed to extract content from file: {filename}")
 
-        # ===== STEP 2: Check for duplicate content =====
+        # Step 2: duplicate check
         content_hash = generate_content_hash(markdown_content, search_space_id)
-
         existing_by_content = await check_duplicate_document(session, content_hash)
         if existing_by_content and existing_by_content.id != doc_id:
-            # Duplicate content found - mark this document as failed
             logging.info(
                 f"Duplicate content detected for {filename}, "
                 f"matches document {existing_by_content.id}"
             )
             return None
 
-        # ===== STEP 3+4: Index via pipeline =====
+        # Step 3: index via pipeline
         if notification:
             await NotificationService.document_processing.notify_processing_progress(
-                session, notification, stage="chunking"
+                session,
+                notification,
+                stage="chunking",
             )
 
         user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
@@ -1957,7 +1039,6 @@ async def process_file_in_background_with_document(
                 "file_type": etl_service,
             },
         )
-
         return document
 
     except Exception as e:
diff --git a/surfsense_backend/app/tasks/document_processors/markdown_processor.py b/surfsense_backend/app/tasks/document_processors/markdown_processor.py
index 2fb711bf8..0ff340c0e 100644
--- a/surfsense_backend/app/tasks/document_processors/markdown_processor.py
+++ b/surfsense_backend/app/tasks/document_processors/markdown_processor.py
@@ -14,88 +14,19 @@ from app.utils.document_converters import (
     create_document_chunks,
     generate_content_hash,
     generate_document_summary,
-    generate_unique_identifier_hash,
 )
 
+from ._helpers import (
+    find_existing_document_with_migration,
+    get_google_drive_unique_identifier,
+)
 from .base import (
-    check_document_by_unique_identifier,
     check_duplicate_document,
     get_current_timestamp,
     safe_set_chunks,
 )
 
 
-def _get_google_drive_unique_identifier(
-    connector: dict | None,
-    filename: str,
-    search_space_id: int,
-) -> tuple[str, str | None]:
-    """
-    Get unique identifier hash for a file, with special handling for Google Drive.
-
-    For Google Drive files, uses file_id as the unique identifier (doesn't change on rename).
-    For other files, uses filename.
-
-    Args:
-        connector: Optional connector info dict with type and metadata
-        filename: The filename (used for non-Google Drive files or as fallback)
-        search_space_id: The search space ID
-
-    Returns:
-        Tuple of (primary_hash, legacy_hash or None)
-    """
-    if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
-        metadata = connector.get("metadata", {})
-        file_id = metadata.get("google_drive_file_id")
-
-        if file_id:
-            primary_hash = generate_unique_identifier_hash(
-                DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id
-            )
-            legacy_hash = generate_unique_identifier_hash(
-                DocumentType.GOOGLE_DRIVE_FILE, filename, search_space_id
-            )
-            return primary_hash, legacy_hash
-
-    primary_hash = generate_unique_identifier_hash(
-        DocumentType.FILE, filename, search_space_id
-    )
-    return primary_hash, None
-
-
-async def _find_existing_document_with_migration(
-    session: AsyncSession,
-    primary_hash: str,
-    legacy_hash: str | None,
-    content_hash: str | None = None,
-) -> Document | None:
-    """
-    Find existing document, checking both new hash and legacy hash for migration,
-    with fallback to content_hash for cross-source deduplication.
-    """
-    existing_document = await check_document_by_unique_identifier(session, primary_hash)
-
-    if not existing_document and legacy_hash:
-        existing_document = await check_document_by_unique_identifier(
-            session, legacy_hash
-        )
-        if existing_document:
-            logging.info(
-                "Found legacy document (filename-based hash), will migrate to file_id-based hash"
-            )
-
-    # Fallback: check by content_hash to catch duplicates from different sources
-    if not existing_document and content_hash:
-        existing_document = await check_duplicate_document(session, content_hash)
-        if existing_document:
-            logging.info(
-                f"Found duplicate content from different source (content_hash match). "
-                f"Original document ID: {existing_document.id}, type: {existing_document.document_type}"
-            )
-
-    return existing_document
-
-
 async def _handle_existing_document_update(
     session: AsyncSession,
     existing_document: Document,
@@ -224,7 +155,7 @@ async def add_received_markdown_file_document(
 
     try:
         # Generate unique identifier hash (uses file_id for Google Drive, filename for others)
-        primary_hash, legacy_hash = _get_google_drive_unique_identifier(
+        primary_hash, legacy_hash = get_google_drive_unique_identifier(
             connector, file_name, search_space_id
         )
 
@@ -232,7 +163,7 @@ async def add_received_markdown_file_document(
         content_hash = generate_content_hash(file_in_markdown, search_space_id)
 
         # Check if document exists (with migration support for Google Drive and content_hash fallback)
-        existing_document = await _find_existing_document_with_migration(
+        existing_document = await find_existing_document_with_migration(
             session, primary_hash, legacy_hash, content_hash
         )
 
diff --git a/surfsense_backend/tests/integration/document_upload/test_upload_limits.py b/surfsense_backend/tests/integration/document_upload/test_upload_limits.py
index a8dab43f0..a56398baa 100644
--- a/surfsense_backend/tests/integration/document_upload/test_upload_limits.py
+++ b/surfsense_backend/tests/integration/document_upload/test_upload_limits.py
@@ -2,12 +2,11 @@
 Integration tests for backend file upload limit enforcement.
 
 These tests verify that the API rejects uploads that exceed:
-  - Max files per upload (10)
-  - Max per-file size (50 MB)
-  - Max total upload size (200 MB)
+  - Max per-file size (500 MB)
 
-The limits mirror the frontend's DocumentUploadTab.tsx constants and are
-enforced server-side to protect against direct API calls.
+No file count or total size limits are enforced — the frontend batches
+uploads in groups of 5 and there is no cap on how many files a user can
+upload in a single session.
 
 Prerequisites:
   - PostgreSQL + pgvector
@@ -24,60 +23,12 @@ pytestmark = pytest.mark.integration
 
 
 # ---------------------------------------------------------------------------
-# Test A: File count limit
-# ---------------------------------------------------------------------------
-
-
-class TestFileCountLimit:
-    """Uploading more than 10 files in a single request should be rejected."""
-
-    async def test_11_files_returns_413(
-        self,
-        client: httpx.AsyncClient,
-        headers: dict[str, str],
-        search_space_id: int,
-    ):
-        files = [
-            ("files", (f"file_{i}.txt", io.BytesIO(b"test content"), "text/plain"))
-            for i in range(11)
-        ]
-        resp = await client.post(
-            "/api/v1/documents/fileupload",
-            headers=headers,
-            files=files,
-            data={"search_space_id": str(search_space_id)},
-        )
-        assert resp.status_code == 413
-        assert "too many files" in resp.json()["detail"].lower()
-
-    async def test_10_files_accepted(
-        self,
-        client: httpx.AsyncClient,
-        headers: dict[str, str],
-        search_space_id: int,
-        cleanup_doc_ids: list[int],
-    ):
-        files = [
-            ("files", (f"file_{i}.txt", io.BytesIO(b"test content"), "text/plain"))
-            for i in range(10)
-        ]
-        resp = await client.post(
-            "/api/v1/documents/fileupload",
-            headers=headers,
-            files=files,
-            data={"search_space_id": str(search_space_id)},
-        )
-        assert resp.status_code == 200
-        cleanup_doc_ids.extend(resp.json().get("document_ids", []))
-
-
-# ---------------------------------------------------------------------------
-# Test B: Per-file size limit
+# Test: Per-file size limit (500 MB)
 # ---------------------------------------------------------------------------
 
 
 class TestPerFileSizeLimit:
-    """A single file exceeding 50 MB should be rejected."""
+    """A single file exceeding 500 MB should be rejected."""
 
     async def test_oversized_file_returns_413(
         self,
@@ -85,7 +36,7 @@ class TestPerFileSizeLimit:
         headers: dict[str, str],
         search_space_id: int,
     ):
-        oversized = io.BytesIO(b"\x00" * (50 * 1024 * 1024 + 1))
+        oversized = io.BytesIO(b"\x00" * (500 * 1024 * 1024 + 1))
         resp = await client.post(
             "/api/v1/documents/fileupload",
             headers=headers,
@@ -102,11 +53,11 @@ class TestPerFileSizeLimit:
         search_space_id: int,
         cleanup_doc_ids: list[int],
     ):
-        at_limit = io.BytesIO(b"\x00" * (50 * 1024 * 1024))
+        at_limit = io.BytesIO(b"\x00" * (500 * 1024 * 1024))
         resp = await client.post(
             "/api/v1/documents/fileupload",
             headers=headers,
-            files=[("files", ("exact50mb.txt", at_limit, "text/plain"))],
+            files=[("files", ("exact500mb.txt", at_limit, "text/plain"))],
             data={"search_space_id": str(search_space_id)},
         )
         assert resp.status_code == 200
@@ -114,26 +65,23 @@ class TestPerFileSizeLimit:
 
 
 # ---------------------------------------------------------------------------
-# Test C: Total upload size limit
+# Test: Multiple files accepted without count limit
 # ---------------------------------------------------------------------------
 
 
-class TestTotalSizeLimit:
-    """Multiple files whose combined size exceeds 200 MB should be rejected."""
+class TestNoFileCountLimit:
+    """Many files in a single request should be accepted."""
 
-    async def test_total_size_over_200mb_returns_413(
+    async def test_many_files_accepted(
         self,
         client: httpx.AsyncClient,
         headers: dict[str, str],
         search_space_id: int,
+        cleanup_doc_ids: list[int],
     ):
-        chunk_size = 45 * 1024 * 1024  # 45 MB each
         files = [
-            (
-                "files",
-                (f"chunk_{i}.txt", io.BytesIO(b"\x00" * chunk_size), "text/plain"),
-            )
-            for i in range(5)  # 5 x 45 MB = 225 MB > 200 MB
+            ("files", (f"file_{i}.txt", io.BytesIO(b"test content"), "text/plain"))
+            for i in range(20)
         ]
         resp = await client.post(
             "/api/v1/documents/fileupload",
@@ -141,5 +89,5 @@ class TestTotalSizeLimit:
             files=files,
             data={"search_space_id": str(search_space_id)},
         )
-        assert resp.status_code == 413
-        assert "total upload size" in resp.json()["detail"].lower()
+        assert resp.status_code == 200
+        cleanup_doc_ids.extend(resp.json().get("document_ids", []))
diff --git a/surfsense_backend/tests/unit/middleware/test_knowledge_search.py b/surfsense_backend/tests/unit/middleware/test_knowledge_search.py
index 163dd0d1d..a8cf5c93b 100644
--- a/surfsense_backend/tests/unit/middleware/test_knowledge_search.py
+++ b/surfsense_backend/tests/unit/middleware/test_knowledge_search.py
@@ -248,7 +248,7 @@ class TestKnowledgeBaseSearchMiddlewarePlanner:
             return []
 
         async def fake_build_scoped_filesystem(**kwargs):
-            return {}
+            return {}, {}
 
         monkeypatch.setattr(
             "app.agents.new_chat.middleware.knowledge_search.search_knowledge_base",
@@ -298,7 +298,7 @@ class TestKnowledgeBaseSearchMiddlewarePlanner:
             return []
 
         async def fake_build_scoped_filesystem(**kwargs):
-            return {}
+            return {}, {}
 
         monkeypatch.setattr(
             "app.agents.new_chat.middleware.knowledge_search.search_knowledge_base",
@@ -334,7 +334,7 @@ class TestKnowledgeBaseSearchMiddlewarePlanner:
             return []
 
         async def fake_build_scoped_filesystem(**kwargs):
-            return {}
+            return {}, {}
 
         monkeypatch.setattr(
             "app.agents.new_chat.middleware.knowledge_search.search_knowledge_base",
diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx
index 4e0c36267..1c246ed71 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx
@@ -329,14 +329,15 @@ export function DocumentsTableShell({
 
 	const handleViewDocument = useCallback(async (doc: Document) => {
 		setViewingDoc(doc);
-		if (doc.content) {
-			setViewingContent(doc.content);
+		const preview = doc.content_preview || doc.content;
+		if (preview) {
+			setViewingContent(preview);
 			return;
 		}
 		setViewingLoading(true);
 		try {
 			const fullDoc = await documentsApiService.getDocument({ id: doc.id });
-			setViewingContent(fullDoc.content);
+			setViewingContent(fullDoc.content_preview || fullDoc.content);
 		} catch (err) {
 			console.error("[DocumentsTableShell] Failed to fetch document content:", err);
 			setViewingContent("Failed to load document content.");
@@ -946,13 +947,36 @@ export function DocumentsTableShell({
 							WebkitMaskImage: `linear-gradient(to bottom, ${previewScrollPos === "top" ? "black" : "transparent"}, black 16px, black calc(100% - 16px), ${previewScrollPos === "bottom" ? "black" : "transparent"})`,
 						}}
 					>
-						{viewingLoading ? (
-							<div className="flex items-center justify-center py-12">
-								<Spinner size="lg" className="text-muted-foreground" />
-							</div>
-						) : (
-							<MarkdownViewer content={viewingContent} />
-						)}
+					{viewingLoading ? (
+						<div className="flex items-center justify-center py-12">
+							<Spinner size="lg" className="text-muted-foreground" />
+						</div>
+					) : (
+						<>
+							<MarkdownViewer content={viewingContent} maxLength={50_000} />
+							{viewingDoc && (
+								<div className="mt-4 flex justify-center">
+									<Button
+										variant="outline"
+										size="sm"
+										onClick={() => {
+											if (viewingDoc) {
+												openEditor({
+													documentId: viewingDoc.id,
+													searchSpaceId: Number(searchSpaceId),
+													title: viewingDoc.title,
+												});
+												handleCloseViewer();
+											}
+										}}
+									>
+										<Eye className="h-3.5 w-3.5 mr-1.5" />
+										View full document
+									</Button>
+								</div>
+							)}
+						</>
+					)}
 					</div>
 				</DrawerContent>
 			</Drawer>
diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/types.ts b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/types.ts
index d87f7374b..88914bd4f 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/types.ts
+++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/types.ts
@@ -9,9 +9,9 @@ export type Document = {
 	id: number;
 	title: string;
 	document_type: DocumentType;
-	// Optional: Only needed when viewing document details (lazy loaded)
 	document_metadata?: any;
 	content?: string;
+	content_preview?: string;
 	created_at: string;
 	search_space_id: number;
 	created_by_id?: string | null;
diff --git a/surfsense_web/components/editor-panel/editor-panel.tsx b/surfsense_web/components/editor-panel/editor-panel.tsx
index 3ea36f800..4b7079aef 100644
--- a/surfsense_web/components/editor-panel/editor-panel.tsx
+++ b/surfsense_web/components/editor-panel/editor-panel.tsx
@@ -1,12 +1,13 @@
 "use client";
 
 import { useAtomValue, useSetAtom } from "jotai";
-import { AlertCircle, XIcon } from "lucide-react";
+import { AlertCircle, Download, FileText, Loader2, XIcon } from "lucide-react";
 import dynamic from "next/dynamic";
 import { useCallback, useEffect, useRef, useState } from "react";
 import { toast } from "sonner";
 import { closeEditorPanelAtom, editorPanelAtom } from "@/atoms/editor/editor-panel.atom";
 import { MarkdownViewer } from "@/components/markdown-viewer";
+import { Alert, AlertDescription } from "@/components/ui/alert";
 import { Button } from "@/components/ui/button";
 import { Drawer, DrawerContent, DrawerHandle, DrawerTitle } from "@/components/ui/drawer";
 import { Skeleton } from "@/components/ui/skeleton";
@@ -18,11 +19,16 @@ const PlateEditor = dynamic(
 	{ ssr: false, loading: () => <Skeleton className="h-64 w-full" /> }
 );
 
+const LARGE_DOCUMENT_THRESHOLD = 2 * 1024 * 1024; // 2MB
+
 interface EditorContent {
 	document_id: number;
 	title: string;
 	document_type?: string;
 	source_markdown: string;
+	content_size_bytes?: number;
+	chunk_count?: number;
+	truncated?: boolean;
 }
 
 const EDITABLE_DOCUMENT_TYPES = new Set(["FILE", "NOTE"]);
@@ -62,6 +68,7 @@ export function EditorPanelContent({
 	const [isLoading, setIsLoading] = useState(true);
 	const [error, setError] = useState<string | null>(null);
 	const [saving, setSaving] = useState(false);
+	const [downloading, setDownloading] = useState(false);
 
 	const [editedMarkdown, setEditedMarkdown] = useState<string | null>(null);
 	const markdownRef = useRef<string>("");
@@ -69,6 +76,8 @@ export function EditorPanelContent({
 	const changeCountRef = useRef(0);
 	const [displayTitle, setDisplayTitle] = useState(title || "Untitled");
 
+	const isLargeDocument = (editorDoc?.content_size_bytes ?? 0) > LARGE_DOCUMENT_THRESHOLD;
+
 	useEffect(() => {
 		let cancelled = false;
 		setIsLoading(true);
@@ -86,10 +95,12 @@ export function EditorPanelContent({
 			}
 
 			try {
-				const response = await authenticatedFetch(
-					`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/editor-content`,
-					{ method: "GET" }
+				const url = new URL(
+					`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/editor-content`
 				);
+				url.searchParams.set("max_length", String(LARGE_DOCUMENT_THRESHOLD));
+
+				const response = await authenticatedFetch(url.toString(), { method: "GET" });
 
 				if (cancelled) return;
 
@@ -175,7 +186,7 @@ export function EditorPanelContent({
 	}, [documentId, searchSpaceId]);
 
 	const isEditableType = editorDoc
-		? EDITABLE_DOCUMENT_TYPES.has(editorDoc.document_type ?? "")
+		? EDITABLE_DOCUMENT_TYPES.has(editorDoc.document_type ?? "") && !isLargeDocument
 		: false;
 
 	return (
@@ -206,6 +217,57 @@ export function EditorPanelContent({
 							<p className="text-sm text-red-500 mt-1">{error || "An unknown error occurred"}</p>
 						</div>
 					</div>
+				) : isLargeDocument ? (
+					<div className="h-full overflow-y-auto px-5 py-4">
+						<Alert className="mb-4">
+							<FileText className="size-4" />
+							<AlertDescription className="flex items-center justify-between gap-4">
+								<span>
+									This document is too large for the editor ({Math.round((editorDoc.content_size_bytes ?? 0) / 1024 / 1024)}MB, {editorDoc.chunk_count ?? 0} chunks). Showing a preview below.
+								</span>
+								<Button
+									variant="outline"
+									size="sm"
+									className="shrink-0 gap-1.5"
+									disabled={downloading}
+									onClick={async () => {
+										setDownloading(true);
+										try {
+											const response = await authenticatedFetch(
+												`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/download-markdown`,
+												{ method: "GET" }
+											);
+											if (!response.ok) throw new Error("Download failed");
+											const blob = await response.blob();
+											const url = URL.createObjectURL(blob);
+											const a = document.createElement("a");
+											a.href = url;
+											const disposition = response.headers.get("content-disposition");
+											const match = disposition?.match(/filename="([^"]+)"/); // capture only up to the first closing quote
+											a.download = match?.[1] ?? `${editorDoc.title || "document"}.md`; // fall back to the title when no filename is found
+											document.body.appendChild(a);
+											a.click();
+											a.remove();
+											URL.revokeObjectURL(url);
+											toast.success("Download started");
+										} catch {
+											toast.error("Failed to download document");
+										} finally {
+											setDownloading(false);
+										}
+									}}
+								>
+									{downloading ? (
+										<Loader2 className="size-3.5 animate-spin" />
+									) : (
+										<Download className="size-3.5" />
+									)}
+									{downloading ? "Preparing..." : "Download .md"}
+								</Button>
+							</AlertDescription>
+						</Alert>
+						<MarkdownViewer content={editorDoc.source_markdown} />
+					</div>
 				) : isEditableType ? (
 					<PlateEditor
 						key={documentId}
diff --git a/surfsense_web/components/layout/ui/tabs/DocumentTabContent.tsx b/surfsense_web/components/layout/ui/tabs/DocumentTabContent.tsx
index ac279cd4d..ad48c89de 100644
--- a/surfsense_web/components/layout/ui/tabs/DocumentTabContent.tsx
+++ b/surfsense_web/components/layout/ui/tabs/DocumentTabContent.tsx
@@ -1,18 +1,24 @@
 "use client";
 
-import { AlertCircle, Pencil } from "lucide-react";
+import { AlertCircle, Download, FileText, Loader2, Pencil } from "lucide-react";
 import { useCallback, useEffect, useRef, useState } from "react";
 import { toast } from "sonner";
 import { PlateEditor } from "@/components/editor/plate-editor";
 import { MarkdownViewer } from "@/components/markdown-viewer";
+import { Alert, AlertDescription } from "@/components/ui/alert";
 import { Button } from "@/components/ui/button";
 import { authenticatedFetch, getBearerToken, redirectToLogin } from "@/lib/auth-utils";
 
+const LARGE_DOCUMENT_THRESHOLD = 2 * 1024 * 1024; // 2MB
+
 interface DocumentContent {
 	document_id: number;
 	title: string;
 	document_type?: string;
 	source_markdown: string;
+	content_size_bytes?: number;
+	chunk_count?: number;
+	truncated?: boolean;
 }
 
 function DocumentSkeleton() {
@@ -49,11 +55,14 @@ export function DocumentTabContent({ documentId, searchSpaceId, title }: Documen
 	const [error, setError] = useState<string | null>(null);
 	const [isEditing, setIsEditing] = useState(false);
 	const [saving, setSaving] = useState(false);
+	const [downloading, setDownloading] = useState(false);
 	const [editedMarkdown, setEditedMarkdown] = useState<string | null>(null);
 	const markdownRef = useRef<string>("");
 	const initialLoadDone = useRef(false);
 	const changeCountRef = useRef(0);
 
+	const isLargeDocument = (doc?.content_size_bytes ?? 0) > LARGE_DOCUMENT_THRESHOLD;
+
 	useEffect(() => {
 		let cancelled = false;
 		setIsLoading(true);
@@ -72,10 +81,12 @@ export function DocumentTabContent({ documentId, searchSpaceId, title }: Documen
 			}
 
 			try {
-				const response = await authenticatedFetch(
-					`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/editor-content`,
-					{ method: "GET" }
+				const url = new URL(
+					`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/editor-content`
 				);
+				url.searchParams.set("max_length", String(LARGE_DOCUMENT_THRESHOLD));
+
+				const response = await authenticatedFetch(url.toString(), { method: "GET" });
 
 				if (cancelled) return;
 
@@ -173,9 +184,9 @@ export function DocumentTabContent({ documentId, searchSpaceId, title }: Documen
 		);
 	}
 
-	const isEditable = EDITABLE_DOCUMENT_TYPES.has(doc.document_type ?? "");
+	const isEditable = EDITABLE_DOCUMENT_TYPES.has(doc.document_type ?? "") && !isLargeDocument;
 
-	if (isEditing) {
+	if (isEditing && !isLargeDocument) {
 		return (
 			<div className="flex flex-col h-full overflow-hidden">
 				<div className="flex items-center justify-between px-6 py-3 border-b shrink-0">
@@ -236,7 +247,60 @@ export function DocumentTabContent({ documentId, searchSpaceId, title }: Documen
 			</div>
 			<div className="flex-1 overflow-auto">
 				<div className="max-w-4xl mx-auto px-6 py-6">
-					<MarkdownViewer content={doc.source_markdown} />
+					{isLargeDocument ? (
+						<>
+							<Alert className="mb-4">
+								<FileText className="size-4" />
+								<AlertDescription className="flex items-center justify-between gap-4">
+									<span>
+										This document is too large for the editor ({Math.round((doc.content_size_bytes ?? 0) / 1024 / 1024)}MB, {doc.chunk_count ?? 0} chunks). Showing a preview below.
+									</span>
+									<Button
+										variant="outline"
+										size="sm"
+										className="shrink-0 gap-1.5"
+										disabled={downloading}
+										onClick={async () => {
+											setDownloading(true);
+											try {
+												const response = await authenticatedFetch(
+													`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/download-markdown`,
+													{ method: "GET" }
+												);
+												if (!response.ok) throw new Error("Download failed");
+												const blob = await response.blob();
+												const url = URL.createObjectURL(blob);
+												const a = document.createElement("a");
+												a.href = url;
+												const disposition = response.headers.get("content-disposition");
+												const match = disposition?.match(/filename="([^"]+)"/); // capture only up to the first closing quote
+												a.download = match?.[1] ?? `${doc.title || "document"}.md`; // fall back to the title when no filename is found
+												document.body.appendChild(a);
+												a.click();
+												a.remove();
+												URL.revokeObjectURL(url);
+												toast.success("Download started");
+											} catch {
+												toast.error("Failed to download document");
+											} finally {
+												setDownloading(false);
+											}
+										}}
+									>
+										{downloading ? (
+											<Loader2 className="size-3.5 animate-spin" />
+										) : (
+											<Download className="size-3.5" />
+										)}
+										{downloading ? "Preparing..." : "Download .md"}
+									</Button>
+								</AlertDescription>
+							</Alert>
+							<MarkdownViewer content={doc.source_markdown} />
+						</>
+					) : (
+						<MarkdownViewer content={doc.source_markdown} />
+					)}
 				</div>
 			</div>
 		</div>
diff --git a/surfsense_web/components/markdown-viewer.tsx b/surfsense_web/components/markdown-viewer.tsx
index e22df8998..abd999301 100644
--- a/surfsense_web/components/markdown-viewer.tsx
+++ b/surfsense_web/components/markdown-viewer.tsx
@@ -15,6 +15,7 @@ const math = createMathPlugin({
 interface MarkdownViewerProps {
 	content: string;
 	className?: string;
+	maxLength?: number;
 }
 
 /**
@@ -79,8 +80,10 @@ function convertLatexDelimiters(content: string): string {
 	return content;
 }
 
-export function MarkdownViewer({ content, className }: MarkdownViewerProps) {
-	const processedContent = convertLatexDelimiters(stripOuterMarkdownFence(content));
+export function MarkdownViewer({ content, className, maxLength }: MarkdownViewerProps) {
+	const isTruncated = maxLength != null && content.length > maxLength;
+	const displayContent = isTruncated ? content.slice(0, maxLength) : content;
+	const processedContent = convertLatexDelimiters(stripOuterMarkdownFence(displayContent));
 	const components: StreamdownProps["components"] = {
 		p: ({ children, ...props }) => (
 			<p className="my-2" {...props}>
@@ -171,6 +174,11 @@ export function MarkdownViewer({ content, className }: MarkdownViewerProps) {
 			>
 				{processedContent}
 			</Streamdown>
+			{isTruncated && (
+				<p className="mt-4 text-sm text-muted-foreground italic">
+					Content truncated ({Math.round(content.length / 1024)}KB total). Showing first {Math.round(maxLength / 1024)}KB.
+				</p>
+			)}
 		</div>
 	);
 }
diff --git a/surfsense_web/components/new-chat/source-detail-panel.tsx b/surfsense_web/components/new-chat/source-detail-panel.tsx
index b02b2e217..c17616c53 100644
--- a/surfsense_web/components/new-chat/source-detail-panel.tsx
+++ b/surfsense_web/components/new-chat/source-detail-panel.tsx
@@ -1,7 +1,7 @@
 "use client";
 
 import { useQuery } from "@tanstack/react-query";
-import { BookOpen, ChevronDown, ExternalLink, FileText, Hash, Sparkles, X } from "lucide-react";
+import { BookOpen, ChevronDown, ChevronUp, ExternalLink, FileText, Hash, Loader2, Sparkles, X } from "lucide-react";
 import { AnimatePresence, motion, useReducedMotion } from "motion/react";
 import { useTranslations } from "next-intl";
 import type React from "react";
@@ -10,7 +10,6 @@ import { createPortal } from "react-dom";
 import { MarkdownViewer } from "@/components/markdown-viewer";
 import { Badge } from "@/components/ui/badge";
 import { Button } from "@/components/ui/button";
-import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible";
 import { ScrollArea } from "@/components/ui/scroll-area";
 import { Spinner } from "@/components/ui/spinner";
 import type {
@@ -48,7 +47,8 @@ const formatDocumentType = (type: string) => {
 // which break auto-scroll functionality
 interface ChunkCardProps {
 	chunk: { id: number; content: string };
-	index: number;
+	localIndex: number;
+	chunkNumber: number;
 	totalChunks: number;
 	isCited: boolean;
 	isActive: boolean;
@@ -56,11 +56,11 @@ interface ChunkCardProps {
 }
 
 const ChunkCard = memo(
-	forwardRef<HTMLDivElement, ChunkCardProps>(({ chunk, index, totalChunks, isCited }, ref) => {
+	forwardRef<HTMLDivElement, ChunkCardProps>(({ chunk, localIndex, chunkNumber, totalChunks, isCited }, ref) => {
 		return (
 			<div
 				ref={ref}
-				data-chunk-index={index}
+				data-chunk-index={localIndex}
 				className={cn(
 					"group relative rounded-2xl border-2 transition-all duration-300",
 					isCited
@@ -68,10 +68,8 @@ const ChunkCard = memo(
 						: "bg-card border-border/50 hover:border-border hover:shadow-md"
 				)}
 			>
-				{/* Cited indicator glow effect */}
 				{isCited && <div className="absolute inset-0 rounded-2xl bg-primary/5 blur-xl -z-10" />}
 
-				{/* Header */}
 				<div className="flex items-center justify-between px-5 py-4 border-b border-border/50">
 					<div className="flex items-center gap-3">
 						<div
@@ -82,9 +80,9 @@ const ChunkCard = memo(
 									: "bg-muted text-muted-foreground group-hover:bg-muted/80"
 							)}
 						>
-							{index + 1}
+							{chunkNumber}
 						</div>
-						<span className="text-sm text-muted-foreground">of {totalChunks} chunks</span>
+						<span className="text-sm text-muted-foreground">Chunk {chunkNumber} of {totalChunks}</span>
 					</div>
 					{isCited && (
 						<Badge variant="default" className="gap-1.5 px-3 py-1">
@@ -94,9 +92,8 @@ const ChunkCard = memo(
 					)}
 				</div>
 
-				{/* Content */}
 				<div className="p-5 overflow-hidden">
-					<MarkdownViewer content={chunk.content} />
+					<MarkdownViewer content={chunk.content} maxLength={100_000} />
 				</div>
 			</div>
 		);
@@ -118,7 +115,6 @@ export function SourceDetailPanel({
 	const t = useTranslations("dashboard");
 	const scrollAreaRef = useRef<HTMLDivElement>(null);
 	const hasScrolledRef = useRef(false); // Use ref to avoid stale closures
-	const [summaryOpen, setSummaryOpen] = useState(false);
 	const [activeChunkIndex, setActiveChunkIndex] = useState<number | null>(null);
 	const [mounted, setMounted] = useState(false);
 	const [_hasScrolledToCited, setHasScrolledToCited] = useState(false);
@@ -140,20 +136,88 @@ export function SourceDetailPanel({
 			if (isDocsChunk) {
 				return documentsApiService.getSurfsenseDocByChunk(chunkId);
 			}
-			return documentsApiService.getDocumentByChunk({ chunk_id: chunkId });
+			return documentsApiService.getDocumentByChunk({ chunk_id: chunkId, chunk_window: 5 });
 		},
 		enabled: !!chunkId && open,
 		staleTime: 5 * 60 * 1000,
 	});
 
+	// total_chunks is schema-defaulted to 0 when the API omits it, so `||` (not `??`) is what lets the fallback fire.
+	const totalChunks = (documentData && "total_chunks" in documentData)
+		? (documentData.total_chunks || documentData.chunks.length) : (documentData?.chunks?.length ?? 0);
+	const [beforeChunks, setBeforeChunks] = useState<Array<{ id: number; content: string; created_at: string }>>([]);
+	const [afterChunks, setAfterChunks] = useState<Array<{ id: number; content: string; created_at: string }>>([]);
+	const [loadingBefore, setLoadingBefore] = useState(false);
+	const [loadingAfter, setLoadingAfter] = useState(false);
+
+	useEffect(() => {
+		setBeforeChunks([]);
+		setAfterChunks([]);
+	}, [chunkId, open]);
+
+	const chunkStartIndex = (documentData && "chunk_start_index" in documentData)
+		? (documentData.chunk_start_index ?? 0) : 0;
+	const initialChunks = documentData?.chunks ?? [];
+	const allChunks = [...beforeChunks, ...initialChunks, ...afterChunks];
+	const absoluteStart = chunkStartIndex - beforeChunks.length;
+	const absoluteEnd = chunkStartIndex + initialChunks.length + afterChunks.length;
+	const canLoadBefore = absoluteStart > 0;
+	const canLoadAfter = absoluteEnd < totalChunks;
+
+	const EXPAND_SIZE = 10;
+
+	const loadBefore = useCallback(async () => { // prepends up to EXPAND_SIZE not-yet-loaded chunks that precede the window
+		if (!canLoadBefore || !documentData || !("search_space_id" in documentData)) return;
+		setLoadingBefore(true);
+		try {
+			const batchSize = Math.min(EXPAND_SIZE, absoluteStart);
+			const { items } = await documentsApiService.getDocumentChunks({
+				document_id: documentData.id,
+				page: 0,
+				page_size: batchSize,
+				start_offset: absoluteStart - batchSize,
+			});
+			const loadedIds = new Set(allChunks.map(({ id }) => id));
+			const earlier = items
+				.filter(({ id }) => !loadedIds.has(id))
+				.map(({ id, content, created_at }) => ({ id, content, created_at }));
+			setBeforeChunks(current => [...earlier, ...current]);
+		} catch (error) {
+			console.error("Failed to load earlier chunks:", error);
+		} finally {
+			setLoadingBefore(false);
+		}
+	}, [documentData, absoluteStart, canLoadBefore, allChunks]);
+
+	const loadAfter = useCallback(async () => { // appends up to EXPAND_SIZE not-yet-loaded chunks that follow the window
+		if (!canLoadAfter || !documentData || !("search_space_id" in documentData)) return;
+		setLoadingAfter(true);
+		try {
+			const { items } = await documentsApiService.getDocumentChunks({
+				document_id: documentData.id,
+				page: 0,
+				page_size: EXPAND_SIZE,
+				start_offset: absoluteEnd,
+			});
+			const loadedIds = new Set(allChunks.map(({ id }) => id));
+			const later = items
+				.filter(({ id }) => !loadedIds.has(id))
+				.map(({ id, content, created_at }) => ({ id, content, created_at }));
+			setAfterChunks(current => [...current, ...later]);
+		} catch (error) {
+			console.error("Failed to load later chunks:", error);
+		} finally {
+			setLoadingAfter(false);
+		}
+	}, [documentData, absoluteEnd, canLoadAfter, allChunks]);
+
 	const isDirectRenderSource =
 		sourceType === "TAVILY_API" ||
 		sourceType === "LINKUP_API" ||
 		sourceType === "SEARXNG_API" ||
 		sourceType === "BAIDU_SEARCH_API";
 
-	// Find cited chunk index
-	const citedChunkIndex = documentData?.chunks?.findIndex((chunk) => chunk.id === chunkId) ?? -1;
+	const citedChunkIndex = allChunks.findIndex((chunk) => chunk.id === chunkId);
 
 	// Simple scroll function that scrolls to a chunk by index
 	const scrollToChunkByIndex = useCallback(
@@ -336,12 +400,12 @@ export function SourceDetailPanel({
 									{documentData && "document_type" in documentData
 										? formatDocumentType(documentData.document_type)
 										: sourceType && formatDocumentType(sourceType)}
-									{documentData?.chunks && (
-										<span className="ml-2">
-											• {documentData.chunks.length} chunk
-											{documentData.chunks.length !== 1 ? "s" : ""}
-										</span>
-									)}
+								{totalChunks > 0 && (
+									<span className="ml-2">
+										• {totalChunks} chunk{totalChunks !== 1 ? "s" : ""}
+										{allChunks.length < totalChunks && ` (showing ${allChunks.length})`}
+									</span>
+								)}
 								</p>
 							</div>
 							<div className="flex items-center gap-3 shrink-0">
@@ -450,7 +514,7 @@ export function SourceDetailPanel({
 						{!isDirectRenderSource && documentData && (
 							<div className="flex-1 flex overflow-hidden">
 								{/* Chunk Navigation Sidebar */}
-								{documentData.chunks.length > 1 && (
+								{allChunks.length > 1 && (
 									<motion.div
 										initial={{ opacity: 0, x: -20 }}
 										animate={{ opacity: 1, x: 0 }}
@@ -459,7 +523,8 @@ export function SourceDetailPanel({
 									>
 										<ScrollArea className="flex-1 h-full">
 											<div className="p-2 pt-3 flex flex-col gap-1.5">
-												{documentData.chunks.map((chunk, idx) => {
+												{allChunks.map((chunk, idx) => {
+													const absNum = absoluteStart + idx + 1;
 													const isCited = chunk.id === chunkId;
 													const isActive = activeChunkIndex === idx;
 													return (
@@ -478,9 +543,9 @@ export function SourceDetailPanel({
 																		? "bg-muted text-foreground"
 																		: "bg-muted/50 text-muted-foreground hover:bg-muted hover:text-foreground"
 															)}
-															title={isCited ? `Chunk ${idx + 1} (Cited)` : `Chunk ${idx + 1}`}
+															title={isCited ? `Chunk ${absNum} (Cited)` : `Chunk ${absNum}`}
 														>
-															{idx + 1}
+															{absNum}
 															{isCited && (
 																<span className="absolute -top-1.5 -right-1.5 flex items-center justify-center w-4 h-4 bg-primary rounded-full border-2 border-background shadow-sm">
 																	<Sparkles className="h-2.5 w-2.5 text-primary-foreground" />
@@ -524,44 +589,11 @@ export function SourceDetailPanel({
 												</motion.div>
 											)}
 
-										{/* Summary Collapsible */}
-										{documentData.content && (
-											<motion.div
-												initial={{ opacity: 0, y: 10 }}
-												animate={{ opacity: 1, y: 0 }}
-												transition={{ delay: 0.15 }}
-											>
-												<Collapsible open={summaryOpen} onOpenChange={setSummaryOpen}>
-													<CollapsibleTrigger className="w-full flex items-center justify-between p-5 rounded-2xl bg-linear-to-r from-muted/50 to-muted/30 border hover:from-muted/70 hover:to-muted/50 transition-all duration-200">
-														<span className="font-semibold flex items-center gap-2">
-															<BookOpen className="h-4 w-4" />
-															Document Summary
-														</span>
-														<motion.div
-															animate={{ rotate: summaryOpen ? 180 : 0 }}
-															transition={{ duration: 0.2 }}
-														>
-															<ChevronDown className="h-5 w-5 text-muted-foreground" />
-														</motion.div>
-													</CollapsibleTrigger>
-													<CollapsibleContent>
-														<motion.div
-															initial={{ opacity: 0 }}
-															animate={{ opacity: 1 }}
-															className="mt-3 p-5 bg-muted/20 rounded-2xl border"
-														>
-															<MarkdownViewer content={documentData.content} />
-														</motion.div>
-													</CollapsibleContent>
-												</Collapsible>
-											</motion.div>
-										)}
-
 										{/* Chunks Header */}
-										<div className="flex items-center justify-between pt-4">
+										<div className="flex items-center justify-between pt-2">
 											<h3 className="text-sm font-semibold text-muted-foreground uppercase tracking-wider flex items-center gap-2">
 												<Hash className="h-4 w-4" />
-												Content Chunks
+												Chunks {absoluteStart + 1}–{absoluteEnd} of {totalChunks}
 											</h3>
 											{citedChunkIndex !== -1 && (
 												<Button
@@ -576,24 +608,70 @@ export function SourceDetailPanel({
 											)}
 										</div>
 
+										{/* Load Earlier */}
+										{canLoadBefore && (
+											<div className="flex items-center justify-center">
+												<Button
+													variant="outline"
+													size="sm"
+													onClick={loadBefore}
+													disabled={loadingBefore}
+													className="gap-2"
+												>
+													{loadingBefore ? (
+														<Loader2 className="h-3.5 w-3.5 animate-spin" />
+													) : (
+														<ChevronUp className="h-3.5 w-3.5" />
+													)}
+													{loadingBefore
+														? "Loading..."
+														: `Load ${Math.min(EXPAND_SIZE, absoluteStart)} earlier chunks`}
+												</Button>
+											</div>
+										)}
+
 										{/* Chunks */}
 										<div className="space-y-4">
-											{documentData.chunks.map((chunk, idx) => {
+											{allChunks.map((chunk, idx) => {
 												const isCited = chunk.id === chunkId;
+												const chunkNumber = absoluteStart + idx + 1;
 												return (
 													<ChunkCard
 														key={chunk.id}
 														ref={isCited ? citedChunkRefCallback : undefined}
 														chunk={chunk}
-														index={idx}
-														totalChunks={documentData.chunks.length}
+														localIndex={idx}
+														chunkNumber={chunkNumber}
+														totalChunks={totalChunks}
 														isCited={isCited}
 														isActive={activeChunkIndex === idx}
-														disableLayoutAnimation={documentData.chunks.length > 30}
+														disableLayoutAnimation={allChunks.length > 30}
 													/>
 												);
 											})}
 										</div>
+
+										{/* Load Later */}
+										{canLoadAfter && (
+											<div className="flex items-center justify-center py-3">
+												<Button
+													variant="outline"
+													size="sm"
+													onClick={loadAfter}
+													disabled={loadingAfter}
+													className="gap-2"
+												>
+													{loadingAfter ? (
+														<Loader2 className="h-3.5 w-3.5 animate-spin" />
+													) : (
+														<ChevronDown className="h-3.5 w-3.5" />
+													)}
+													{loadingAfter
+														? "Loading..."
+														: `Load ${Math.min(EXPAND_SIZE, totalChunks - absoluteEnd)} later chunks`}
+												</Button>
+											</div>
+										)}
 									</div>
 								</ScrollArea>
 							</div>
diff --git a/surfsense_web/components/sources/DocumentUploadTab.tsx b/surfsense_web/components/sources/DocumentUploadTab.tsx
index 6817b19db..faa042d8e 100644
--- a/surfsense_web/components/sources/DocumentUploadTab.tsx
+++ b/surfsense_web/components/sources/DocumentUploadTab.tsx
@@ -1,10 +1,10 @@
 "use client";
 
 import { useAtom } from "jotai";
-import { CheckCircle2, FileType, Info, Upload, X } from "lucide-react";
+import { CheckCircle2, FileType, FolderOpen, Info, Upload, X } from "lucide-react";
 
 import { useTranslations } from "next-intl";
-import { useCallback, useMemo, useRef, useState } from "react";
+import { type ChangeEvent, useCallback, useMemo, useRef, useState } from "react";
 import { useDropzone } from "react-dropzone";
 import { toast } from "sonner";
 import { uploadDocumentMutationAtom } from "@/atoms/documents/document-mutation.atoms";
@@ -51,6 +51,7 @@ const commonTypes = {
 	"application/vnd.openxmlformats-officedocument.presentationml.presentation": [".pptx"],
 	"text/html": [".html", ".htm"],
 	"text/csv": [".csv"],
+	"text/tab-separated-values": [".tsv"],
 	"image/jpeg": [".jpg", ".jpeg"],
 	"image/png": [".png"],
 	"image/bmp": [".bmp"],
@@ -76,7 +77,6 @@ const FILE_TYPE_CONFIG: Record<string, Record<string, string[]>> = {
 		"application/rtf": [".rtf"],
 		"application/xml": [".xml"],
 		"application/epub+zip": [".epub"],
-		"text/tab-separated-values": [".tsv"],
 		"text/html": [".html", ".htm", ".web"],
 		"image/gif": [".gif"],
 		"image/svg+xml": [".svg"],
@@ -102,7 +102,6 @@ const FILE_TYPE_CONFIG: Record<string, Record<string, string[]>> = {
 		"application/vnd.ms-powerpoint": [".ppt"],
 		"text/x-rst": [".rst"],
 		"application/rtf": [".rtf"],
-		"text/tab-separated-values": [".tsv"],
 		"application/vnd.ms-excel": [".xls"],
 		"application/xml": [".xml"],
 		...audioFileTypes,
@@ -116,10 +115,8 @@ interface FileWithId {
 
 const cardClass = "border border-border bg-slate-400/5 dark:bg-white/5";
 
-// Upload limits — files are sent in batches of 5 to avoid proxy timeouts
-const MAX_FILES = 50;
-const MAX_TOTAL_SIZE_MB = 200;
-const MAX_TOTAL_SIZE_BYTES = MAX_TOTAL_SIZE_MB * 1024 * 1024;
+const MAX_FILE_SIZE_MB = 500;
+const MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024;
 
 export function DocumentUploadTab({
 	searchSpaceId,
@@ -134,6 +131,7 @@ export function DocumentUploadTab({
 	const [uploadDocumentMutation] = useAtom(uploadDocumentMutationAtom);
 	const { mutate: uploadDocuments, isPending: isUploading } = uploadDocumentMutation;
 	const fileInputRef = useRef<HTMLInputElement>(null);
+	const folderInputRef = useRef<HTMLInputElement>(null);
 
 	const acceptedFileTypes = useMemo(() => {
 		const etlService = process.env.NEXT_PUBLIC_ETL_SERVICE;
@@ -145,49 +143,76 @@ export function DocumentUploadTab({
 		[acceptedFileTypes]
 	);
 
-	const onDrop = useCallback(
-		(acceptedFiles: File[]) => {
+	const supportedExtensionsSet = useMemo(
+		() => new Set(supportedExtensions.map((ext) => ext.toLowerCase())),
+		[supportedExtensions]
+	);
+
+	const addFiles = useCallback(
+		(incoming: File[]) => {
+			const oversized = incoming.filter((f) => f.size > MAX_FILE_SIZE_BYTES);
+			if (oversized.length > 0) {
+				toast.error(t("file_too_large"), {
+					description: t("file_too_large_desc", {
+						name: oversized[0].name,
+						maxMB: MAX_FILE_SIZE_MB,
+					}),
+				});
+			}
+			const valid = incoming.filter((f) => f.size <= MAX_FILE_SIZE_BYTES);
+			if (valid.length === 0) return;
+
 			setFiles((prev) => {
-				const newEntries = acceptedFiles.map((f) => ({
+				const newEntries = valid.map((f) => ({
 					id: crypto.randomUUID?.() ?? `file-${Date.now()}-${Math.random().toString(36)}`,
 					file: f,
 				}));
-				const newFiles = [...prev, ...newEntries];
-
-				if (newFiles.length > MAX_FILES) {
-					toast.error(t("max_files_exceeded"), {
-						description: t("max_files_exceeded_desc", { max: MAX_FILES }),
-					});
-					return prev;
-				}
-
-				const newTotalSize = newFiles.reduce((sum, entry) => sum + entry.file.size, 0);
-				if (newTotalSize > MAX_TOTAL_SIZE_BYTES) {
-					toast.error(t("max_size_exceeded"), {
-						description: t("max_size_exceeded_desc", { max: MAX_TOTAL_SIZE_MB }),
-					});
-					return prev;
-				}
-
-				return newFiles;
+				return [...prev, ...newEntries];
 			});
 		},
 		[t]
 	);
 
+	const onDrop = useCallback(
+		(acceptedFiles: File[]) => {
+			addFiles(acceptedFiles);
+		},
+		[addFiles]
+	);
+
 	const { getRootProps, getInputProps, isDragActive } = useDropzone({
 		onDrop,
 		accept: acceptedFileTypes,
-		maxSize: 50 * 1024 * 1024, // 50MB per file
+		maxSize: MAX_FILE_SIZE_BYTES,
 		noClick: false,
-		disabled: files.length >= MAX_FILES,
 	});
 
-	// Handle file input click to prevent event bubbling that might reopen dialog
 	const handleFileInputClick = useCallback((e: React.MouseEvent<HTMLInputElement>) => {
 		e.stopPropagation();
 	}, []);
 
+	// Folder-picker change handler: keeps only files whose extension is in supportedExtensionsSet.
+	const handleFolderChange = useCallback(
+		(e: ChangeEvent<HTMLInputElement>) => {
+			const picked = e.target.files;
+			if (!picked || picked.length === 0) return;
+
+			const supported = Array.from(picked).filter(({ name }) => {
+				if (!name.includes(".")) return false;
+				return supportedExtensionsSet.has(`.${name.split(".").pop()?.toLowerCase()}`);
+			});
+
+			if (supported.length > 0) {
+				addFiles(supported);
+			} else {
+				toast.error(t("no_supported_files_in_folder"));
+			}
+			// Reset on both paths — presumably so picking the same folder again still fires `change`.
+			e.target.value = "";
+		},
+		[addFiles, supportedExtensionsSet, t]
+	);
+
 	const formatFileSize = (bytes: number) => {
 		if (bytes === 0) return "0 Bytes";
 		const k = 1024;
@@ -198,15 +223,6 @@ export function DocumentUploadTab({
 
 	const totalFileSize = files.reduce((total, entry) => total + entry.file.size, 0);
 
-	// Check if limits are reached
-	const isFileCountLimitReached = files.length >= MAX_FILES;
-	const isSizeLimitReached = totalFileSize >= MAX_TOTAL_SIZE_BYTES;
-	const remainingFiles = MAX_FILES - files.length;
-	const remainingSizeMB = Math.max(
-		0,
-		(MAX_TOTAL_SIZE_BYTES - totalFileSize) / (1024 * 1024)
-	).toFixed(1);
-
 	// Track accordion state changes
 	const handleAccordionChange = useCallback(
 		(value: string) => {
@@ -257,11 +273,21 @@ export function DocumentUploadTab({
 			<Alert className="border border-border bg-slate-400/5 dark:bg-white/5">
 				<Info className="h-4 w-4 shrink-0 mt-0.5" />
 				<AlertDescription className="text-xs sm:text-sm leading-relaxed pt-0.5">
-					{t("file_size_limit")}{" "}
-					{t("upload_limits", { maxFiles: MAX_FILES, maxSizeMB: MAX_TOTAL_SIZE_MB })}
+					{t("file_size_limit", { maxMB: MAX_FILE_SIZE_MB })}{" "}
+					{t("upload_limits")}
 				</AlertDescription>
 			</Alert>
 
+			{/* Hidden folder input */}
+			<input
+				ref={folderInputRef}
+				type="file"
+				className="hidden"
+				onChange={handleFolderChange}
+				multiple
+				{...({ webkitdirectory: "", directory: "" } as React.InputHTMLAttributes<HTMLInputElement>)}
+			/>
+
 			<Card className={`relative overflow-hidden ${cardClass}`}>
 				<div className="absolute inset-0 [mask-image:radial-gradient(ellipse_at_center,white,transparent)] opacity-30">
 					<GridPattern />
@@ -269,11 +295,7 @@ export function DocumentUploadTab({
 				<CardContent className="p-4 sm:p-10 relative z-10">
 					<div
 						{...getRootProps()}
-						className={`flex flex-col items-center justify-center min-h-[200px] sm:min-h-[300px] border-2 border-dashed rounded-lg transition-colors ${
-							isFileCountLimitReached || isSizeLimitReached
-								? "border-destructive/50 bg-destructive/5 cursor-not-allowed"
-								: "border-border hover:border-primary/50 cursor-pointer"
-						}`}
+						className="flex flex-col items-center justify-center min-h-[200px] sm:min-h-[300px] border-2 border-dashed rounded-lg transition-colors border-border hover:border-primary/50 cursor-pointer"
 					>
 						<input
 							{...getInputProps()}
@@ -281,19 +303,7 @@ export function DocumentUploadTab({
 							className="hidden"
 							onClick={handleFileInputClick}
 						/>
-						{isFileCountLimitReached ? (
-							<div className="flex flex-col items-center gap-2 sm:gap-4 text-center px-4">
-								<Upload className="h-8 w-8 sm:h-12 sm:w-12 text-destructive/70" />
-								<div>
-									<p className="text-sm sm:text-lg font-medium text-destructive">
-										{t("file_limit_reached")}
-									</p>
-									<p className="text-xs sm:text-sm text-muted-foreground mt-1">
-										{t("file_limit_reached_desc", { max: MAX_FILES })}
-									</p>
-								</div>
-							</div>
-						) : isDragActive ? (
+						{isDragActive ? (
 							<div className="flex flex-col items-center gap-2 sm:gap-4">
 								<Upload className="h-8 w-8 sm:h-12 sm:w-12 text-primary" />
 								<p className="text-sm sm:text-lg font-medium text-primary">{t("drop_files")}</p>
@@ -305,29 +315,35 @@ export function DocumentUploadTab({
 									<p className="text-sm sm:text-lg font-medium">{t("drag_drop")}</p>
 									<p className="text-xs sm:text-sm text-muted-foreground mt-1">{t("or_browse")}</p>
 								</div>
-								{files.length > 0 && (
-									<p className="text-xs text-muted-foreground">
-										{t("remaining_capacity", { files: remainingFiles, sizeMB: remainingSizeMB })}
-									</p>
-								)}
-							</div>
-						)}
-						{!isFileCountLimitReached && (
-							<div className="mt-2 sm:mt-4">
-								<Button
-									variant="secondary"
-									size="sm"
-									className="text-xs sm:text-sm"
-									onClick={(e) => {
-										e.stopPropagation();
-										e.preventDefault();
-										fileInputRef.current?.click();
-									}}
-								>
-									{t("browse_files")}
-								</Button>
 							</div>
 						)}
+						<div className="mt-2 sm:mt-4 flex gap-2">
+							<Button
+								variant="secondary"
+								size="sm"
+								className="text-xs sm:text-sm"
+								onClick={(e) => {
+									e.stopPropagation();
+									e.preventDefault();
+									fileInputRef.current?.click();
+								}}
+							>
+								{t("browse_files")}
+							</Button>
+							<Button
+								variant="outline"
+								size="sm"
+								className="text-xs sm:text-sm"
+								onClick={(e) => {
+									e.stopPropagation();
+									e.preventDefault();
+									folderInputRef.current?.click();
+								}}
+							>
+								<FolderOpen className="h-4 w-4 mr-1.5" />
+								{t("browse_folder")}
+							</Button>
+						</div>
 					</div>
 				</CardContent>
 			</Card>
diff --git a/surfsense_web/contracts/types/document.types.ts b/surfsense_web/contracts/types/document.types.ts
index 1a3326bae..f5431aecb 100644
--- a/surfsense_web/contracts/types/document.types.ts
+++ b/surfsense_web/contracts/types/document.types.ts
@@ -39,6 +39,7 @@ export const document = z.object({
 	document_type: documentTypeEnum,
 	document_metadata: z.record(z.string(), z.any()),
 	content: z.string(),
+	content_preview: z.string().optional().default(""),
 	content_hash: z.string(),
 	unique_identifier_hash: z.string().nullable(),
 	created_at: z.string(),
@@ -69,6 +70,8 @@ export const documentWithChunks = document.extend({
 			created_at: z.string(),
 		})
 	),
+	total_chunks: z.number().optional().default(0),
+	chunk_start_index: z.number().optional().default(0),
 });
 
 /**
@@ -243,10 +246,36 @@ export const getDocumentTypeCountsResponse = z.record(z.string(), z.number());
  */
 export const getDocumentByChunkRequest = z.object({
 	chunk_id: z.number(),
+	chunk_window: z.number().optional(),
 });
 
 export const getDocumentByChunkResponse = documentWithChunks;
 
+/**
+ * Get paginated chunks for a document
+ */
+export const getDocumentChunksRequest = z.object({ // request shape validated by DocumentsApiService.getDocumentChunks
+	document_id: z.number(), // interpolated into the `/api/v1/documents/{document_id}/chunks` path
+	page: z.number().optional().default(0), // always sent as the `page` query parameter
+	page_size: z.number().optional().default(20), // always sent as the `page_size` query parameter
+	start_offset: z.number().optional(), // added to the query string only when not null/undefined
+});
+
+export const chunkRead = z.object({ // one element of `getDocumentChunksResponse.items`
+	id: z.number(), // SourceDetailPanel de-duplicates fetched chunks against loaded ones by this id
+	content: z.string(),
+	document_id: z.number(),
+	created_at: z.string(),
+});
+
+export const getDocumentChunksResponse = z.object({ // schema passed to baseApiService.get for the chunks endpoint
+	items: z.array(chunkRead), // the only field read by SourceDetailPanel's loadBefore/loadAfter in this change
+	total: z.number(), // NOTE(review): presumably the document's total chunk count — confirm against the backend
+	page: z.number(),
+	page_size: z.number(),
+	has_more: z.boolean(),
+});
+
 /**
  * Get Surfsense docs by chunk
  */
@@ -328,3 +357,6 @@ export type GetSurfsenseDocsByChunkRequest = z.infer<typeof getSurfsenseDocsByCh
 export type GetSurfsenseDocsByChunkResponse = z.infer<typeof getSurfsenseDocsByChunkResponse>;
 export type GetSurfsenseDocsRequest = z.infer<typeof getSurfsenseDocsRequest>;
 export type GetSurfsenseDocsResponse = z.infer<typeof getSurfsenseDocsResponse>;
+export type GetDocumentChunksRequest = z.infer<typeof getDocumentChunksRequest>;
+export type GetDocumentChunksResponse = z.infer<typeof getDocumentChunksResponse>;
+export type ChunkRead = z.infer<typeof chunkRead>;
diff --git a/surfsense_web/lib/apis/documents-api.service.ts b/surfsense_web/lib/apis/documents-api.service.ts
index 14a247032..71fa58852 100644
--- a/surfsense_web/lib/apis/documents-api.service.ts
+++ b/surfsense_web/lib/apis/documents-api.service.ts
@@ -6,6 +6,7 @@ import {
 	deleteDocumentRequest,
 	deleteDocumentResponse,
 	type GetDocumentByChunkRequest,
+	type GetDocumentChunksRequest,
 	type GetDocumentRequest,
 	type GetDocumentsRequest,
 	type GetDocumentsStatusRequest,
@@ -13,6 +14,8 @@ import {
 	type GetSurfsenseDocsRequest,
 	getDocumentByChunkRequest,
 	getDocumentByChunkResponse,
+	getDocumentChunksRequest,
+	getDocumentChunksResponse,
 	getDocumentRequest,
 	getDocumentResponse,
 	getDocumentsRequest,
@@ -295,23 +298,52 @@ class DocumentsApiService {
 	};
 
 	/**
-	 * Get document by chunk ID (includes all chunks)
+	 * Get document by chunk ID (includes a window of chunks around the cited one)
 	 */
 	getDocumentByChunk = async (request: GetDocumentByChunkRequest) => {
-		// Validate the request
 		const parsedRequest = getDocumentByChunkRequest.safeParse(request);
 
 		if (!parsedRequest.success) {
 			console.error("Invalid request:", parsedRequest.error);
 
-			// Format a user friendly error message
 			const errorMessage = parsedRequest.error.issues.map((issue) => issue.message).join(", ");
 			throw new ValidationError(`Invalid request: ${errorMessage}`);
 		}
 
+		const params = new URLSearchParams();
+		if (request.chunk_window != null) {
+			params.set("chunk_window", String(request.chunk_window));
+		}
+		const qs = params.toString();
+		const url = `/api/v1/documents/by-chunk/${request.chunk_id}${qs ? `?${qs}` : ""}`;
+
+		return baseApiService.get(url, getDocumentByChunkResponse);
+	};
+
+	/**
+	 * Get paginated chunks for a document
+	 */
+	getDocumentChunks = async (request: GetDocumentChunksRequest) => {
+		const parsedRequest = getDocumentChunksRequest.safeParse(request);
+
+		if (!parsedRequest.success) {
+			console.error("Invalid request:", parsedRequest.error);
+
+			const errorMessage = parsedRequest.error.issues.map((issue) => issue.message).join(", ");
+			throw new ValidationError(`Invalid request: ${errorMessage}`);
+		}
+
+		const params = new URLSearchParams({
+			page: String(parsedRequest.data.page),
+			page_size: String(parsedRequest.data.page_size),
+		});
+		if (parsedRequest.data.start_offset != null) {
+			params.set("start_offset", String(parsedRequest.data.start_offset));
+		}
+
 		return baseApiService.get(
-			`/api/v1/documents/by-chunk/${request.chunk_id}`,
-			getDocumentByChunkResponse
+			`/api/v1/documents/${parsedRequest.data.document_id}/chunks?${params}`,
+			getDocumentChunksResponse
 		);
 	};
 
diff --git a/surfsense_web/messages/en.json b/surfsense_web/messages/en.json
index 53f80ea5f..cacaec557 100644
--- a/surfsense_web/messages/en.json
+++ b/surfsense_web/messages/en.json
@@ -376,12 +376,13 @@
 	"upload_documents": {
 		"title": "Upload Documents",
 		"subtitle": "Upload your files to make them searchable and accessible through AI-powered conversations.",
-		"file_size_limit": "Maximum file size: 50MB per file.",
-		"upload_limits": "Upload limit: {maxFiles} files, {maxSizeMB}MB total.",
-		"drop_files": "Drop files here",
-		"drag_drop": "Drag & drop files here",
-		"or_browse": "or click to browse",
+		"file_size_limit": "Maximum file size: {maxMB}MB per file.",
+		"upload_limits": "Upload files or entire folders",
+		"drop_files": "Drop files or folders here",
+		"drag_drop": "Drag & drop files or folders here",
+		"or_browse": "or click to browse files and folders",
 		"browse_files": "Browse Files",
+		"browse_folder": "Browse Folder",
 		"selected_files": "Selected Files ({count})",
 		"total_size": "Total size",
 		"clear_all": "Clear all",
@@ -394,13 +395,9 @@
 		"upload_error_desc": "Error uploading files",
 		"supported_file_types": "Supported File Types",
 		"file_types_desc": "These file types are supported based on your current ETL service configuration.",
-		"max_files_exceeded": "File Limit Exceeded",
-		"max_files_exceeded_desc": "You can upload a maximum of {max} files at a time.",
-		"max_size_exceeded": "Size Limit Exceeded",
-		"max_size_exceeded_desc": "Total file size cannot exceed {max}MB.",
-		"file_limit_reached": "Maximum Files Reached",
-		"file_limit_reached_desc": "Remove some files to add more (max {max} files).",
-		"remaining_capacity": "{files} files remaining • {sizeMB}MB available"
+		"file_too_large": "File Too Large",
+		"file_too_large_desc": "\"{name}\" exceeds the {maxMB}MB per-file limit.",
+		"no_supported_files_in_folder": "No supported file types found in the selected folder."
 	},
 	"add_webpage": {
 		"title": "Add Webpages for Crawling",
diff --git a/surfsense_web/messages/es.json b/surfsense_web/messages/es.json
index 36e627295..7670e76df 100644
--- a/surfsense_web/messages/es.json
+++ b/surfsense_web/messages/es.json
@@ -376,12 +376,13 @@
 	"upload_documents": {
 		"title": "Subir documentos",
 		"subtitle": "Sube tus archivos para hacerlos buscables y accesibles a través de conversaciones con IA.",
-		"file_size_limit": "Tamaño máximo de archivo: 50 MB por archivo.",
-		"upload_limits": "Límite de subida: {maxFiles} archivos, {maxSizeMB} MB en total.",
-		"drop_files": "Suelta los archivos aquí",
-		"drag_drop": "Arrastra y suelta archivos aquí",
-		"or_browse": "o haz clic para explorar",
+		"file_size_limit": "Tamaño máximo de archivo: {maxMB} MB por archivo.",
+		"upload_limits": "Sube archivos o carpetas enteras",
+		"drop_files": "Suelta archivos o carpetas aquí",
+		"drag_drop": "Arrastra y suelta archivos o carpetas aquí",
+		"or_browse": "o haz clic para explorar archivos y carpetas",
 		"browse_files": "Explorar archivos",
+		"browse_folder": "Explorar carpeta",
 		"selected_files": "Archivos seleccionados ({count})",
 		"total_size": "Tamaño total",
 		"clear_all": "Limpiar todo",
@@ -394,13 +395,9 @@
 		"upload_error_desc": "Error al subir archivos",
 		"supported_file_types": "Tipos de archivo soportados",
 		"file_types_desc": "Estos tipos de archivo son soportados según la configuración actual de tu servicio ETL.",
-		"max_files_exceeded": "Límite de archivos excedido",
-		"max_files_exceeded_desc": "Puedes subir un máximo de {max} archivos a la vez.",
-		"max_size_exceeded": "Límite de tamaño excedido",
-		"max_size_exceeded_desc": "El tamaño total de los archivos no puede exceder {max} MB.",
-		"file_limit_reached": "Máximo de archivos alcanzado",
-		"file_limit_reached_desc": "Elimina algunos archivos para agregar más (máximo {max} archivos).",
-		"remaining_capacity": "{files} archivos restantes • {sizeMB} MB disponibles"
+		"file_too_large": "Archivo demasiado grande",
+		"file_too_large_desc": "\"{name}\" excede el límite de {maxMB} MB por archivo.",
+		"no_supported_files_in_folder": "No se encontraron tipos de archivo compatibles en la carpeta seleccionada."
 	},
 	"add_webpage": {
 		"title": "Agregar páginas web para rastreo",
diff --git a/surfsense_web/messages/hi.json b/surfsense_web/messages/hi.json
index fd51acdc2..cbcff0b30 100644
--- a/surfsense_web/messages/hi.json
+++ b/surfsense_web/messages/hi.json
@@ -376,12 +376,13 @@
 	"upload_documents": {
 		"title": "दस्तावेज़ अपलोड करें",
 		"subtitle": "AI-संचालित बातचीत के माध्यम से अपनी फ़ाइलों को खोजने योग्य और सुलभ बनाने के लिए अपलोड करें।",
-		"file_size_limit": "अधिकतम फ़ाइल आकार: प्रति फ़ाइल 50MB।",
-		"upload_limits": "अपलोड सीमा: {maxFiles} फ़ाइलें, कुल {maxSizeMB}MB।",
-		"drop_files": "फ़ाइलें यहां छोड़ें",
-		"drag_drop": "फ़ाइलें यहां खींचें और छोड़ें",
-		"or_browse": "या ब्राउज़ करने के लिए क्लिक करें",
+		"file_size_limit": "अधिकतम फ़ाइल आकार: प्रति फ़ाइल {maxMB}MB।",
+		"upload_limits": "फ़ाइलें या पूरे फ़ोल्डर अपलोड करें",
+		"drop_files": "फ़ाइलें या फ़ोल्डर यहां छोड़ें",
+		"drag_drop": "फ़ाइलें या फ़ोल्डर यहां खींचें और छोड़ें",
+		"or_browse": "या फ़ाइलें और फ़ोल्डर ब्राउज़ करने के लिए क्लिक करें",
 		"browse_files": "फ़ाइलें ब्राउज़ करें",
+		"browse_folder": "फ़ोल्डर ब्राउज़ करें",
 		"selected_files": "चयनित फ़ाइलें ({count})",
 		"total_size": "कुल आकार",
 		"clear_all": "सभी साफ करें",
@@ -394,13 +395,9 @@
 		"upload_error_desc": "फ़ाइलें अपलोड करने में त्रुटि",
 		"supported_file_types": "समर्थित फ़ाइल प्रकार",
 		"file_types_desc": "ये फ़ाइल प्रकार आपकी वर्तमान ETL सेवा कॉन्फ़िगरेशन के आधार पर समर्थित हैं।",
-		"max_files_exceeded": "फ़ाइल सीमा पार हो गई",
-		"max_files_exceeded_desc": "आप एक बार में अधिकतम {max} फ़ाइलें अपलोड कर सकते हैं।",
-		"max_size_exceeded": "आकार सीमा पार हो गई",
-		"max_size_exceeded_desc": "कुल फ़ाइल आकार {max}MB से अधिक नहीं हो सकता।",
-		"file_limit_reached": "अधिकतम फ़ाइलें पहुंच गई",
-		"file_limit_reached_desc": "और जोड़ने के लिए कुछ फ़ाइलें हटाएं (अधिकतम {max} फ़ाइलें)।",
-		"remaining_capacity": "{files} फ़ाइलें शेष • {sizeMB}MB उपलब्ध"
+		"file_too_large": "फ़ाइल बहुत बड़ी है",
+		"file_too_large_desc": "\"{name}\" प्रति फ़ाइल {maxMB}MB की सीमा से अधिक है।",
+		"no_supported_files_in_folder": "चयनित फ़ोल्डर में कोई समर्थित फ़ाइल प्रकार नहीं मिला।"
 	},
 	"add_webpage": {
 		"title": "क्रॉलिंग के लिए वेबपेज जोड़ें",
diff --git a/surfsense_web/messages/pt.json b/surfsense_web/messages/pt.json
index e26499f90..ec72ef0da 100644
--- a/surfsense_web/messages/pt.json
+++ b/surfsense_web/messages/pt.json
@@ -376,12 +376,13 @@
 	"upload_documents": {
 		"title": "Enviar documentos",
 		"subtitle": "Envie seus arquivos para torná-los pesquisáveis e acessíveis através de conversas com IA.",
-		"file_size_limit": "Tamanho máximo do arquivo: 50 MB por arquivo.",
-		"upload_limits": "Limite de envio: {maxFiles} arquivos, {maxSizeMB} MB no total.",
-		"drop_files": "Solte os arquivos aqui",
-		"drag_drop": "Arraste e solte arquivos aqui",
-		"or_browse": "ou clique para navegar",
+		"file_size_limit": "Tamanho máximo do arquivo: {maxMB} MB por arquivo.",
+		"upload_limits": "Envie arquivos ou pastas inteiras",
+		"drop_files": "Solte arquivos ou pastas aqui",
+		"drag_drop": "Arraste e solte arquivos ou pastas aqui",
+		"or_browse": "ou clique para navegar arquivos e pastas",
 		"browse_files": "Navegar arquivos",
+		"browse_folder": "Navegar pasta",
 		"selected_files": "Arquivos selecionados ({count})",
 		"total_size": "Tamanho total",
 		"clear_all": "Limpar tudo",
@@ -394,13 +395,9 @@
 		"upload_error_desc": "Erro ao enviar arquivos",
 		"supported_file_types": "Tipos de arquivo suportados",
 		"file_types_desc": "Estes tipos de arquivo são suportados com base na configuração atual do seu serviço ETL.",
-		"max_files_exceeded": "Limite de arquivos excedido",
-		"max_files_exceeded_desc": "Você pode enviar no máximo {max} arquivos de uma vez.",
-		"max_size_exceeded": "Limite de tamanho excedido",
-		"max_size_exceeded_desc": "O tamanho total dos arquivos não pode exceder {max} MB.",
-		"file_limit_reached": "Máximo de arquivos atingido",
-		"file_limit_reached_desc": "Remova alguns arquivos para adicionar mais (máximo {max} arquivos).",
-		"remaining_capacity": "{files} arquivos restantes • {sizeMB} MB disponíveis"
+		"file_too_large": "Arquivo muito grande",
+		"file_too_large_desc": "\"{name}\" excede o limite de {maxMB} MB por arquivo.",
+		"no_supported_files_in_folder": "Nenhum tipo de arquivo suportado encontrado na pasta selecionada."
 	},
 	"add_webpage": {
 		"title": "Adicionar páginas web para rastreamento",
diff --git a/surfsense_web/messages/zh.json b/surfsense_web/messages/zh.json
index 819432410..db634dfd9 100644
--- a/surfsense_web/messages/zh.json
+++ b/surfsense_web/messages/zh.json
@@ -360,12 +360,13 @@
 	"upload_documents": {
 		"title": "上传文档",
 		"subtitle": "上传您的文件，使其可通过 AI 对话进行搜索和访问。",
-		"file_size_limit": "最大文件大小：每个文件 50MB。",
-		"upload_limits": "上传限制：最多 {maxFiles} 个文件，总大小不超过 {maxSizeMB}MB。",
-		"drop_files": "放下文件到这里",
-		"drag_drop": "拖放文件到这里",
-		"or_browse": "或点击浏览",
+		"file_size_limit": "最大文件大小：每个文件 {maxMB}MB。",
+		"upload_limits": "上传文件或整个文件夹",
+		"drop_files": "将文件或文件夹放到此处",
+		"drag_drop": "将文件或文件夹拖放到此处",
+		"or_browse": "或点击浏览文件和文件夹",
 		"browse_files": "浏览文件",
+		"browse_folder": "浏览文件夹",
 		"selected_files": "已选择的文件 ({count})",
 		"total_size": "总大小",
 		"clear_all": "全部清除",
@@ -378,13 +379,9 @@
 		"upload_error_desc": "上传文件时出错",
 		"supported_file_types": "支持的文件类型",
 		"file_types_desc": "根据您当前的 ETL 服务配置支持这些文件类型。",
-		"max_files_exceeded": "超过文件数量限制",
-		"max_files_exceeded_desc": "一次最多只能上传 {max} 个文件。",
-		"max_size_exceeded": "超过文件大小限制",
-		"max_size_exceeded_desc": "文件总大小不能超过 {max}MB。",
-		"file_limit_reached": "已达到最大文件数量",
-		"file_limit_reached_desc": "移除一些文件以添加更多（最多 {max} 个文件）。",
-		"remaining_capacity": "剩余 {files} 个文件名额 • 可用 {sizeMB}MB"
+		"file_too_large": "文件过大",
+		"file_too_large_desc": "\"{name}\" 超过了每个文件 {maxMB}MB 的限制。",
+		"no_supported_files_in_folder": "所选文件夹中没有找到支持的文件类型。"
 	},
 	"add_webpage": {
 		"title": "添加网页爬取",