From 5d22349dc102e3e87b42e20a923ad79df1ecae51 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Sun, 5 Apr 2026 17:25:25 +0530
Subject: [PATCH 01/37] feat: implement ETL pipeline with file classification
 and extraction services

---
 .../app/etl_pipeline/__init__.py              |  0
 .../app/etl_pipeline/constants.py             | 39 ++++++++++
 .../app/etl_pipeline/etl_document.py          | 21 ++++++
 .../app/etl_pipeline/etl_pipeline_service.py  | 73 +++++++++++++++++++
 .../app/etl_pipeline/exceptions.py            |  6 ++
 .../app/etl_pipeline/file_classifier.py       | 49 +++++++++++++
 6 files changed, 188 insertions(+)
 create mode 100644 surfsense_backend/app/etl_pipeline/__init__.py
 create mode 100644 surfsense_backend/app/etl_pipeline/constants.py
 create mode 100644 surfsense_backend/app/etl_pipeline/etl_document.py
 create mode 100644 surfsense_backend/app/etl_pipeline/etl_pipeline_service.py
 create mode 100644 surfsense_backend/app/etl_pipeline/exceptions.py
 create mode 100644 surfsense_backend/app/etl_pipeline/file_classifier.py

diff --git a/surfsense_backend/app/etl_pipeline/__init__.py b/surfsense_backend/app/etl_pipeline/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/surfsense_backend/app/etl_pipeline/constants.py b/surfsense_backend/app/etl_pipeline/constants.py
new file mode 100644
index 000000000..f65759c13
--- /dev/null
+++ b/surfsense_backend/app/etl_pipeline/constants.py
@@ -0,0 +1,56 @@
+"""LlamaCloud retry settings and timeout helpers for the ETL pipeline."""
+
+import ssl
+
+import httpx
+
+# Retry policy for LlamaCloud uploads.
+LLAMACLOUD_MAX_RETRIES = 5
+LLAMACLOUD_BASE_DELAY = 10  # seconds (exponential backoff base)
+LLAMACLOUD_MAX_DELAY = 120  # seconds (cap on the delay between retries)
+# Errors treated as transient. OSError is the base class of several other
+# entries (ssl.SSLError, ConnectionError, TimeoutError), so it is the widest.
+LLAMACLOUD_RETRYABLE_EXCEPTIONS = (
+    ssl.SSLError,
+    httpx.ConnectError,
+    httpx.ConnectTimeout,
+    httpx.ReadError,
+    httpx.ReadTimeout,
+    httpx.WriteError,
+    httpx.WriteTimeout,
+    httpx.RemoteProtocolError,
+    httpx.LocalProtocolError,
+    ConnectionError,
+    ConnectionResetError,
+    TimeoutError,
+    OSError,
+)
+
+# Timeout calculation inputs; all durations are in seconds.
+UPLOAD_BYTES_PER_SECOND_SLOW = 100 * 1024  # 100 KB/s, a conservative upload rate
+MIN_UPLOAD_TIMEOUT = 120  # floor applied to every upload
+MAX_UPLOAD_TIMEOUT = 1800  # ceiling applied to very large uploads
+BASE_JOB_TIMEOUT = 600  # fixed part of every job timeout
+PER_PAGE_JOB_TIMEOUT = 60  # added per estimated page
+
+
+def calculate_upload_timeout(file_size_bytes: int) -> float:
+    """Return the upload timeout in seconds for a file of the given size.
+
+    The transfer time at ``UPLOAD_BYTES_PER_SECOND_SLOW`` is padded by 50%
+    and clamped to ``[MIN_UPLOAD_TIMEOUT, MAX_UPLOAD_TIMEOUT]``.
+    """
+    padded_transfer_time = (file_size_bytes / UPLOAD_BYTES_PER_SECOND_SLOW) * 1.5
+    capped = min(padded_transfer_time, MAX_UPLOAD_TIMEOUT)
+    return max(MIN_UPLOAD_TIMEOUT, capped)
+
+
+def calculate_job_timeout(estimated_pages: int, file_size_bytes: int) -> float:
+    """Return the job timeout in seconds: the larger of two estimates.
+
+    One estimate adds ``PER_PAGE_JOB_TIMEOUT`` per estimated page to the base;
+    the other adds 60 seconds per 10 MiB of file size to the base.
+    """
+    per_page_extra = estimated_pages * PER_PAGE_JOB_TIMEOUT
+    per_size_extra = (file_size_bytes / (10 * 1024 * 1024)) * 60
+    return max(BASE_JOB_TIMEOUT + per_page_extra, BASE_JOB_TIMEOUT + per_size_extra)
diff --git a/surfsense_backend/app/etl_pipeline/etl_document.py b/surfsense_backend/app/etl_pipeline/etl_document.py
new file mode 100644
index 000000000..350c3299f
--- /dev/null
+++ b/surfsense_backend/app/etl_pipeline/etl_document.py
@@ -0,0 +1,26 @@
+"""Request and result models exchanged with the ETL pipeline."""
+
+from pydantic import BaseModel, field_validator
+
+
+# Input to the pipeline: which file to extract and how large it is expected to be.
+class EtlRequest(BaseModel):
+    file_path: str  # location the parsers open on disk
+    filename: str  # name used to classify the file by extension
+    estimated_pages: int = 0  # page estimate forwarded to the LlamaCloud parser
+
+    @field_validator("filename")
+    @classmethod
+    def filename_must_not_be_empty(cls, v: str) -> str:
+        """Reject filenames that are empty or whitespace-only; return others as-is."""
+        if v.strip():
+            return v
+        raise ValueError("filename must not be empty")
+
+
+# Output of the pipeline: the extracted markdown plus how it was produced.
+class EtlResult(BaseModel):
+    markdown_content: str  # extracted text
+    etl_service: str  # label of the extractor that produced the content
+    actual_pages: int = 0  # NOTE(review): not populated by the visible pipeline code
+    content_type: str  # "plaintext", "direct_convert", "audio" or "document"
diff --git a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py
new file mode 100644
index 000000000..f382451df
--- /dev/null
+++ b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py
@@ -0,0 +1,78 @@
+"""Entry point that routes a file to the extractor matching its category."""
+
+from app.config import config as app_config
+from app.etl_pipeline.etl_document import EtlRequest, EtlResult
+from app.etl_pipeline.exceptions import EtlServiceUnavailableError
+from app.etl_pipeline.file_classifier import FileCategory, classify_file
+from app.etl_pipeline.parsers.audio import transcribe_audio
+from app.etl_pipeline.parsers.direct_convert import convert_file_directly
+from app.etl_pipeline.parsers.plaintext import read_plaintext
+
+
+class EtlPipelineService:
+    """Single pipeline for extracting markdown from files. All callers use this."""
+
+    async def extract(self, request: EtlRequest) -> EtlResult:
+        """Extract markdown from the file described by ``request``.
+
+        The filename's category selects the extractor: plaintext, direct
+        conversion and audio are handled here; everything else goes to the
+        configured document ETL service.
+        """
+        category = classify_file(request.filename)
+
+        if category == FileCategory.PLAINTEXT:
+            markdown = read_plaintext(request.file_path)
+            service_label, content_type = "PLAINTEXT", "plaintext"
+        elif category == FileCategory.DIRECT_CONVERT:
+            markdown = convert_file_directly(request.file_path, request.filename)
+            service_label, content_type = "DIRECT_CONVERT", "direct_convert"
+        elif category == FileCategory.AUDIO:
+            markdown = await transcribe_audio(request.file_path, request.filename)
+            service_label, content_type = "AUDIO", "audio"
+        else:
+            return await self._extract_document(request)
+
+        return EtlResult(
+            markdown_content=markdown,
+            etl_service=service_label,
+            content_type=content_type,
+        )
+
+    async def _extract_document(self, request: EtlRequest) -> EtlResult:
+        """Parse a document with the service named by ``ETL_SERVICE``.
+
+        Raises:
+            EtlServiceUnavailableError: If ``ETL_SERVICE`` is unset or is not
+                one of DOCLING, UNSTRUCTURED or LLAMACLOUD.
+        """
+        service_name = app_config.ETL_SERVICE
+        if not service_name:
+            raise EtlServiceUnavailableError(
+                "No ETL_SERVICE configured. "
+                "Set ETL_SERVICE to UNSTRUCTURED, LLAMACLOUD, or DOCLING in your .env"
+            )
+
+        # Parser modules are imported only in the branch that uses them.
+        if service_name == "DOCLING":
+            from app.etl_pipeline.parsers.docling import parse_with_docling
+
+            markdown = await parse_with_docling(request.file_path, request.filename)
+        elif service_name == "UNSTRUCTURED":
+            from app.etl_pipeline.parsers.unstructured import parse_with_unstructured
+
+            markdown = await parse_with_unstructured(request.file_path)
+        elif service_name == "LLAMACLOUD":
+            from app.etl_pipeline.parsers.llamacloud import parse_with_llamacloud
+
+            markdown = await parse_with_llamacloud(
+                request.file_path, request.estimated_pages
+            )
+        else:
+            raise EtlServiceUnavailableError(f"Unknown ETL_SERVICE: {service_name}")
+
+        return EtlResult(
+            markdown_content=markdown,
+            etl_service=service_name,
+            content_type="document",
+        )
diff --git a/surfsense_backend/app/etl_pipeline/exceptions.py b/surfsense_backend/app/etl_pipeline/exceptions.py
new file mode 100644
index 000000000..ac8fc0172
--- /dev/null
+++ b/surfsense_backend/app/etl_pipeline/exceptions.py
@@ -0,0 +1,6 @@
+class EtlParseError(Exception):
+    """Raised when an ETL parser fails to produce content."""
+
+
+class EtlServiceUnavailableError(Exception):
+    """Raised when ETL_SERVICE is unset or names an unrecognised service."""
diff --git a/surfsense_backend/app/etl_pipeline/file_classifier.py b/surfsense_backend/app/etl_pipeline/file_classifier.py
new file mode 100644
index 000000000..40c2d5aff
--- /dev/null
+++ b/surfsense_backend/app/etl_pipeline/file_classifier.py
@@ -0,0 +1,68 @@
+"""Extension-based classification of uploaded files."""
+
+from enum import Enum
+from pathlib import PurePosixPath
+
+
+# Lookup keys are lower-case and include the leading dot.
+PLAINTEXT_EXTENSIONS = frozenset(
+    {
+        ".md", ".markdown", ".txt", ".text",
+        ".json", ".jsonl", ".yaml", ".yml", ".toml", ".ini", ".cfg", ".conf", ".xml",
+        ".css", ".scss", ".less", ".sass",
+        ".py", ".pyw", ".pyi", ".pyx",
+        ".js", ".jsx", ".ts", ".tsx", ".mjs", ".cjs",
+        ".java", ".kt", ".kts", ".scala", ".groovy",
+        ".c", ".h", ".cpp", ".cxx", ".cc", ".hpp", ".hxx",
+        ".cs", ".fs", ".fsx",
+        ".go", ".rs", ".rb", ".php", ".pl", ".pm", ".lua", ".swift",
+        ".m", ".mm", ".r", ".jl",
+        ".sh", ".bash", ".zsh", ".fish", ".bat", ".cmd", ".ps1",
+        ".sql", ".graphql", ".gql",
+        ".env", ".gitignore", ".dockerignore", ".editorconfig",
+        ".makefile", ".cmake",
+        ".log", ".rst", ".tex", ".bib", ".org", ".adoc", ".asciidoc",
+        ".vue", ".svelte", ".astro",
+        ".tf", ".hcl", ".proto",
+    }
+)
+
+AUDIO_EXTENSIONS = frozenset(
+    {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm"}
+)
+
+DIRECT_CONVERT_EXTENSIONS = frozenset({".csv", ".tsv", ".html", ".htm"})
+
+
+class FileCategory(Enum):
+    """Processing route chosen for a file."""
+
+    PLAINTEXT = "plaintext"
+    AUDIO = "audio"
+    DIRECT_CONVERT = "direct_convert"
+    DOCUMENT = "document"
+
+
+def classify_file(filename: str) -> FileCategory:
+    """Classify a file into a processing category by its extension.
+
+    Matching is case-insensitive and uses the final suffix only. A dotfile
+    such as ``.gitignore`` has no suffix according to ``pathlib``, so its
+    whole name is used as the lookup key instead.
+
+    Returns:
+        The matching category, or ``FileCategory.DOCUMENT`` when the
+        extension is not in any of the known sets.
+    """
+    path = PurePosixPath(filename)
+    key = path.suffix
+    if not key and path.name.startswith("."):
+        key = path.name
+    key = key.lower()
+    if key in PLAINTEXT_EXTENSIONS:
+        return FileCategory.PLAINTEXT
+    if key in AUDIO_EXTENSIONS:
+        return FileCategory.AUDIO
+    if key in DIRECT_CONVERT_EXTENSIONS:
+        return FileCategory.DIRECT_CONVERT
+    return FileCategory.DOCUMENT

From 02fc6f1d1616de98a566d6925f96061a86a114db Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Sun, 5 Apr 2026 17:26:03 +0530
Subject: [PATCH 02/37] feat: add audio transcription functionality to ETL
 pipeline

---
 .../app/etl_pipeline/parsers/__init__.py      |  0
 .../app/etl_pipeline/parsers/audio.py         | 34 +++++++++++++++++++
 2 files changed, 34 insertions(+)
 create mode 100644 surfsense_backend/app/etl_pipeline/parsers/__init__.py
 create mode 100644 surfsense_backend/app/etl_pipeline/parsers/audio.py

diff --git a/surfsense_backend/app/etl_pipeline/parsers/__init__.py b/surfsense_backend/app/etl_pipeline/parsers/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/surfsense_backend/app/etl_pipeline/parsers/audio.py b/surfsense_backend/app/etl_pipeline/parsers/audio.py
new file mode 100644
index 000000000..cd49bafde
--- /dev/null
+++ b/surfsense_backend/app/etl_pipeline/parsers/audio.py
@@ -0,0 +1,46 @@
+"""Audio transcription for the ETL pipeline."""
+
+from litellm import atranscription
+
+from app.config import config as app_config
+
+
+async def transcribe_audio(file_path: str, filename: str) -> str:
+    """Transcribe an audio file and return it as a markdown document.
+
+    A ``STT_SERVICE`` value starting with ``local/`` selects the in-process
+    STT service; any other value is sent to ``litellm.atranscription``.
+
+    Args:
+        file_path: Path of the audio file to transcribe.
+        filename: Name shown in the heading of the returned markdown.
+
+    Returns:
+        Markdown with a heading naming the file, followed by the transcript.
+
+    Raises:
+        ValueError: If the transcription result contains no text.
+    """
+    configured_service = app_config.STT_SERVICE
+    use_local = bool(configured_service and configured_service.startswith("local/"))
+
+    if use_local:
+        from app.services.stt_service import stt_service
+
+        # NOTE(review): synchronous call inside a coroutine — confirm this is
+        # acceptable for the event loop it runs on.
+        outcome = stt_service.transcribe_file(file_path)
+    else:
+        request_args: dict = {
+            "model": configured_service,
+            "api_key": app_config.STT_SERVICE_API_KEY,
+        }
+        if app_config.STT_SERVICE_API_BASE:
+            request_args["api_base"] = app_config.STT_SERVICE_API_BASE
+        with open(file_path, "rb") as audio_file:
+            outcome = await atranscription(file=audio_file, **request_args)
+
+    transcript = outcome.get("text", "")
+    if not transcript:
+        raise ValueError("Transcription returned empty text")
+    return f"# Transcription of {filename}\n\n{transcript}"

From 35582c9389a9bc08f11cd603d8d9ed635e5a6218 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Sun, 5 Apr 2026 17:26:29 +0530
Subject: [PATCH 03/37] feat: add direct_convert module to ETL pipeline for
 file conversion

---
 surfsense_backend/app/etl_pipeline/parsers/direct_convert.py | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 surfsense_backend/app/etl_pipeline/parsers/direct_convert.py

diff --git a/surfsense_backend/app/etl_pipeline/parsers/direct_convert.py b/surfsense_backend/app/etl_pipeline/parsers/direct_convert.py
new file mode 100644
index 000000000..c9e6e8647
--- /dev/null
+++ b/surfsense_backend/app/etl_pipeline/parsers/direct_convert.py
@@ -0,0 +1,3 @@
+from app.tasks.document_processors._direct_converters import convert_file_directly
+
+__all__ = ["convert_file_directly"]

From 2824410be225e43d5b22335776ca009c8c1ae2d1 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Sun, 5 Apr 2026 17:26:42 +0530
Subject: [PATCH 04/37] feat: add plaintext parser to ETL pipeline for reading
 text files

---
 surfsense_backend/app/etl_pipeline/parsers/plaintext.py | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100644 surfsense_backend/app/etl_pipeline/parsers/plaintext.py

diff --git a/surfsense_backend/app/etl_pipeline/parsers/plaintext.py b/surfsense_backend/app/etl_pipeline/parsers/plaintext.py
new file mode 100644
index 000000000..24bfb71e5
--- /dev/null
+++ b/surfsense_backend/app/etl_pipeline/parsers/plaintext.py
@@ -0,0 +1,16 @@
+def read_plaintext(file_path: str) -> str:
+    """Read a text file as UTF-8 and return its content.
+
+    Undecodable bytes are replaced rather than raising.
+
+    Raises:
+        ValueError: If the decoded text contains a null character, which
+            suggests a binary file.
+    """
+    with open(file_path, encoding="utf-8", errors="replace") as handle:
+        text = handle.read()
+    if "\x00" not in text:
+        return text
+    raise ValueError(
+        f"File contains null bytes — likely a binary file opened as text: {file_path}"
+    )

From f40de6b6954c1ca286a022eebff7e994213d6f26 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Sun, 5 Apr 2026 17:27:24 +0530
Subject: [PATCH 05/37] feat: add parsers for Docling, LlamaCloud, and
 Unstructured to ETL pipeline

---
 .../app/etl_pipeline/parsers/docling.py       |  26 ++++
 .../app/etl_pipeline/parsers/llamacloud.py    | 129 ++++++++++++++++++
 .../app/etl_pipeline/parsers/unstructured.py  |  14 ++
 3 files changed, 169 insertions(+)
 create mode 100644 surfsense_backend/app/etl_pipeline/parsers/docling.py
 create mode 100644 surfsense_backend/app/etl_pipeline/parsers/llamacloud.py
 create mode 100644 surfsense_backend/app/etl_pipeline/parsers/unstructured.py

diff --git a/surfsense_backend/app/etl_pipeline/parsers/docling.py b/surfsense_backend/app/etl_pipeline/parsers/docling.py
new file mode 100644
index 000000000..df0498148
--- /dev/null
+++ b/surfsense_backend/app/etl_pipeline/parsers/docling.py
@@ -0,0 +1,36 @@
+"""Docling-backed document parser for the ETL pipeline."""
+
+import warnings
+from logging import ERROR, getLogger
+
+
+async def parse_with_docling(file_path: str, filename: str) -> str:
+    """Parse a document through the Docling service and return its content.
+
+    While parsing runs, selected pdfminer warnings are ignored and the
+    ``pdfminer`` logger is raised to ERROR; its previous level is restored
+    afterwards, even when parsing fails.
+    """
+    from app.services.docling_service import create_docling_service
+
+    service = create_docling_service()
+
+    noisy_logger = getLogger("pdfminer")
+    previous_level = noisy_logger.level
+    ignored_messages = (
+        ".*Cannot set gray non-stroke color.*",
+        ".*invalid float value.*",
+    )
+
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", category=UserWarning, module="pdfminer")
+        for pattern in ignored_messages:
+            warnings.filterwarnings("ignore", message=pattern)
+        noisy_logger.setLevel(ERROR)
+
+        try:
+            parsed = await service.process_document(file_path, filename)
+        finally:
+            noisy_logger.setLevel(previous_level)
+
+    return parsed["content"]
diff --git a/surfsense_backend/app/etl_pipeline/parsers/llamacloud.py b/surfsense_backend/app/etl_pipeline/parsers/llamacloud.py
new file mode 100644
index 000000000..5115aebea
--- /dev/null
+++ b/surfsense_backend/app/etl_pipeline/parsers/llamacloud.py
@@ -0,0 +1,149 @@
+"""LlamaCloud-backed document parser with retry for transient failures."""
+
+import asyncio
+import logging
+import os
+import random
+
+import httpx
+
+from app.config import config as app_config
+from app.etl_pipeline.constants import (
+    LLAMACLOUD_BASE_DELAY,
+    LLAMACLOUD_MAX_DELAY,
+    LLAMACLOUD_MAX_RETRIES,
+    LLAMACLOUD_RETRYABLE_EXCEPTIONS,
+    PER_PAGE_JOB_TIMEOUT,
+    calculate_job_timeout,
+    calculate_upload_timeout,
+)
+
+
+def _result_to_markdown(result) -> str:
+    """Convert the value returned by ``LlamaParse.aparse`` to one markdown string."""
+    if hasattr(result, "get_markdown_documents"):
+        markdown_docs = result.get_markdown_documents(split_by_page=False)
+        if markdown_docs and hasattr(markdown_docs[0], "text"):
+            return markdown_docs[0].text
+        # No usable combined document: fall back to the per-page markdown.
+        if hasattr(result, "pages") and result.pages:
+            return "\n\n".join(
+                page.md for page in result.pages if hasattr(page, "md") and page.md
+            )
+        return str(result)
+
+    if isinstance(result, list):
+        # NOTE(review): only the first element is returned when it has ``text``;
+        # confirm that later elements never carry additional content.
+        if result and hasattr(result[0], "text"):
+            return result[0].text
+        return "\n\n".join(
+            doc.page_content if hasattr(doc, "page_content") else str(doc)
+            for doc in result
+        )
+
+    return str(result)
+
+
+def _retry_delay(attempt: int) -> float:
+    """Return the capped exponential backoff for ``attempt``, jittered by 25%."""
+    base_delay = min(
+        LLAMACLOUD_BASE_DELAY * (2 ** (attempt - 1)),
+        LLAMACLOUD_MAX_DELAY,
+    )
+    jitter = base_delay * 0.25 * (2 * random.random() - 1)
+    return base_delay + jitter
+
+
+async def parse_with_llamacloud(file_path: str, estimated_pages: int) -> str:
+    """Parse a file with LlamaParse and return its markdown.
+
+    Timeouts are derived from the file size and the page estimate. Errors
+    listed in ``LLAMACLOUD_RETRYABLE_EXCEPTIONS`` are retried, for at most
+    ``LLAMACLOUD_MAX_RETRIES`` attempts, with jittered exponential backoff;
+    any other exception propagates immediately.
+
+    Args:
+        file_path: Path of the file to upload.
+        estimated_pages: Page estimate used to size the job timeout.
+
+    Raises:
+        Exception: The last retryable error once every attempt has failed.
+    """
+    from llama_cloud_services import LlamaParse
+    from llama_cloud_services.parse.utils import ResultType
+
+    file_size_bytes = os.path.getsize(file_path)
+    file_size_mb = file_size_bytes / (1024 * 1024)
+
+    upload_timeout = calculate_upload_timeout(file_size_bytes)
+    job_timeout = calculate_job_timeout(estimated_pages, file_size_bytes)
+
+    custom_timeout = httpx.Timeout(
+        connect=120.0,
+        read=upload_timeout,
+        write=upload_timeout,
+        pool=120.0,
+    )
+
+    logging.info(
+        f"LlamaCloud upload configured: file_size={file_size_mb:.1f}MB, "
+        f"pages={estimated_pages}, upload_timeout={upload_timeout:.0f}s, "
+        f"job_timeout={job_timeout:.0f}s"
+    )
+
+    last_exception = None
+    attempt_errors: list[str] = []
+
+    for attempt in range(1, LLAMACLOUD_MAX_RETRIES + 1):
+        try:
+            # A fresh client per attempt, closed when the attempt ends.
+            async with httpx.AsyncClient(timeout=custom_timeout) as custom_client:
+                parser = LlamaParse(
+                    api_key=app_config.LLAMA_CLOUD_API_KEY,
+                    num_workers=1,
+                    verbose=True,
+                    language="en",
+                    result_type=ResultType.MD,
+                    max_timeout=int(max(2000, job_timeout + upload_timeout)),
+                    job_timeout_in_seconds=job_timeout,
+                    job_timeout_extra_time_per_page_in_seconds=PER_PAGE_JOB_TIMEOUT,
+                    custom_client=custom_client,
+                )
+                result = await parser.aparse(file_path)
+
+                if attempt > 1:
+                    logging.info(
+                        f"LlamaCloud upload succeeded on attempt {attempt} after "
+                        f"{len(attempt_errors)} failures"
+                    )
+
+                return _result_to_markdown(result)
+
+        except LLAMACLOUD_RETRYABLE_EXCEPTIONS as e:
+            last_exception = e
+            error_type = type(e).__name__
+            error_msg = str(e)[:200]
+            attempt_errors.append(f"Attempt {attempt}: {error_type} - {error_msg}")
+
+            if attempt < LLAMACLOUD_MAX_RETRIES:
+                delay = _retry_delay(attempt)
+                logging.warning(
+                    f"LlamaCloud upload failed "
+                    f"(attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}): "
+                    f"{error_type}. File: {file_size_mb:.1f}MB. "
+                    f"Retrying in {delay:.0f}s..."
+                )
+                await asyncio.sleep(delay)
+            else:
+                logging.error(
+                    f"LlamaCloud upload failed after {LLAMACLOUD_MAX_RETRIES} "
+                    f"attempts. File size: {file_size_mb:.1f}MB, "
+                    f"Pages: {estimated_pages}. "
+                    f"Errors: {'; '.join(attempt_errors)}"
+                )
+
+    raise last_exception or RuntimeError(
+        f"LlamaCloud parsing failed after {LLAMACLOUD_MAX_RETRIES} retries. "
+        f"File size: {file_size_mb:.1f}MB"
+    )
diff --git a/surfsense_backend/app/etl_pipeline/parsers/unstructured.py b/surfsense_backend/app/etl_pipeline/parsers/unstructured.py
new file mode 100644
index 000000000..af8fb99b6
--- /dev/null
+++ b/surfsense_backend/app/etl_pipeline/parsers/unstructured.py
@@ -0,0 +1,20 @@
+async def parse_with_unstructured(file_path: str) -> str:
+    """Parse a file with the Unstructured loader and return the joined text.
+
+    Elements with empty ``page_content`` are dropped; the remaining ones are
+    joined with blank lines.
+    """
+    from langchain_unstructured import UnstructuredLoader
+
+    element_loader = UnstructuredLoader(
+        file_path,
+        mode="elements",
+        post_processors=[],
+        languages=["eng"],
+        include_orig_elements=False,
+        include_metadata=False,
+        strategy="auto",
+    )
+    elements = await element_loader.aload()
+    non_empty = [item.page_content for item in elements if item.page_content]
+    return "\n\n".join(non_empty)

From 1248363ca980916cd5e16df66dbcc3cd37a2e68f Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Sun, 5 Apr 2026 17:29:24 +0530
Subject: [PATCH 06/37] refactor: consolidate document processing logic and
 remove unused files and ETL strategies

---
 .../app/tasks/document_processors/__init__.py |  30 +--
 .../tasks/document_processors/_constants.py   |  74 -------
 .../app/tasks/document_processors/_etl.py     | 209 ------------------
 .../app/tasks/document_processors/_helpers.py |  25 ---
 .../app/tasks/document_processors/_save.py    |  81 -------
 5 files changed, 3 insertions(+), 416 deletions(-)
 delete mode 100644 surfsense_backend/app/tasks/document_processors/_constants.py
 delete mode 100644 surfsense_backend/app/tasks/document_processors/_etl.py

diff --git a/surfsense_backend/app/tasks/document_processors/__init__.py b/surfsense_backend/app/tasks/document_processors/__init__.py
index 2b5690d02..f82c10883 100644
--- a/surfsense_backend/app/tasks/document_processors/__init__.py
+++ b/surfsense_backend/app/tasks/document_processors/__init__.py
@@ -1,41 +1,17 @@
 """
 Document processors module for background tasks.
 
-This module provides a collection of document processors for different content types
-and sources. Each processor is responsible for handling a specific type of document
-processing task in the background.
-
-Available processors:
-- Extension processor: Handle documents from browser extension
-- Markdown processor: Process markdown files
-- File processors: Handle files using different ETL services (Unstructured, LlamaCloud, Docling)
-- YouTube processor: Process YouTube videos and extract transcripts
+Content extraction is handled by ``app.etl_pipeline.EtlPipelineService``.
+This package keeps orchestration (save, notify, page-limit) and
+non-ETL processors (extension, markdown, youtube).
 """
 
-# Extension processor
-# File processors (backward-compatible re-exports from _save)
-from ._save import (
-    add_received_file_document_using_docling,
-    add_received_file_document_using_llamacloud,
-    add_received_file_document_using_unstructured,
-)
 from .extension_processor import add_extension_received_document
-
-# Markdown processor
 from .markdown_processor import add_received_markdown_file_document
-
-# YouTube processor
 from .youtube_processor import add_youtube_video_document
 
 __all__ = [
-    # Extension processing
     "add_extension_received_document",
-    # File processing with different ETL services
-    "add_received_file_document_using_docling",
-    "add_received_file_document_using_llamacloud",
-    "add_received_file_document_using_unstructured",
-    # Markdown file processing
     "add_received_markdown_file_document",
-    # YouTube video processing
     "add_youtube_video_document",
 ]
diff --git a/surfsense_backend/app/tasks/document_processors/_constants.py b/surfsense_backend/app/tasks/document_processors/_constants.py
deleted file mode 100644
index f74d7acce..000000000
--- a/surfsense_backend/app/tasks/document_processors/_constants.py
+++ /dev/null
@@ -1,74 +0,0 @@
-"""
-Constants for file document processing.
-
-Centralizes file type classification, LlamaCloud retry configuration,
-and timeout calculation parameters.
-"""
-
-import ssl
-from enum import Enum
-
-import httpx
-
-# ---------------------------------------------------------------------------
-# File type classification
-# ---------------------------------------------------------------------------
-
-MARKDOWN_EXTENSIONS = (".md", ".markdown", ".txt")
-AUDIO_EXTENSIONS = (".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm")
-DIRECT_CONVERT_EXTENSIONS = (".csv", ".tsv", ".html", ".htm")
-
-
-class FileCategory(Enum):
-    MARKDOWN = "markdown"
-    AUDIO = "audio"
-    DIRECT_CONVERT = "direct_convert"
-    DOCUMENT = "document"
-
-
-def classify_file(filename: str) -> FileCategory:
-    """Classify a file by its extension into a processing category."""
-    lower = filename.lower()
-    if lower.endswith(MARKDOWN_EXTENSIONS):
-        return FileCategory.MARKDOWN
-    if lower.endswith(AUDIO_EXTENSIONS):
-        return FileCategory.AUDIO
-    if lower.endswith(DIRECT_CONVERT_EXTENSIONS):
-        return FileCategory.DIRECT_CONVERT
-    return FileCategory.DOCUMENT
-
-
-# ---------------------------------------------------------------------------
-# LlamaCloud retry configuration
-# ---------------------------------------------------------------------------
-
-LLAMACLOUD_MAX_RETRIES = 5
-LLAMACLOUD_BASE_DELAY = 10  # seconds (exponential backoff base)
-LLAMACLOUD_MAX_DELAY = 120  # max delay between retries (2 minutes)
-LLAMACLOUD_RETRYABLE_EXCEPTIONS = (
-    ssl.SSLError,
-    httpx.ConnectError,
-    httpx.ConnectTimeout,
-    httpx.ReadError,
-    httpx.ReadTimeout,
-    httpx.WriteError,
-    httpx.WriteTimeout,
-    httpx.RemoteProtocolError,
-    httpx.LocalProtocolError,
-    ConnectionError,
-    ConnectionResetError,
-    TimeoutError,
-    OSError,
-)
-
-# ---------------------------------------------------------------------------
-# Timeout calculation constants
-# ---------------------------------------------------------------------------
-
-UPLOAD_BYTES_PER_SECOND_SLOW = (
-    100 * 1024
-)  # 100 KB/s (conservative for slow connections)
-MIN_UPLOAD_TIMEOUT = 120  # Minimum 2 minutes for any file
-MAX_UPLOAD_TIMEOUT = 1800  # Maximum 30 minutes for very large files
-BASE_JOB_TIMEOUT = 600  # 10 minutes base for job processing
-PER_PAGE_JOB_TIMEOUT = 60  # 1 minute per page for processing
diff --git a/surfsense_backend/app/tasks/document_processors/_etl.py b/surfsense_backend/app/tasks/document_processors/_etl.py
deleted file mode 100644
index cc3a8b1ac..000000000
--- a/surfsense_backend/app/tasks/document_processors/_etl.py
+++ /dev/null
@@ -1,209 +0,0 @@
-"""
-ETL parsing strategies for different document processing services.
-
-Provides parse functions for Unstructured, LlamaCloud, and Docling, along with
-LlamaCloud retry logic and dynamic timeout calculations.
-"""
-
-import asyncio
-import logging
-import os
-import random
-import warnings
-from logging import ERROR, getLogger
-
-import httpx
-
-from app.config import config as app_config
-from app.db import Log
-from app.services.task_logging_service import TaskLoggingService
-
-from ._constants import (
-    LLAMACLOUD_BASE_DELAY,
-    LLAMACLOUD_MAX_DELAY,
-    LLAMACLOUD_MAX_RETRIES,
-    LLAMACLOUD_RETRYABLE_EXCEPTIONS,
-    PER_PAGE_JOB_TIMEOUT,
-)
-from ._helpers import calculate_job_timeout, calculate_upload_timeout
-
-# ---------------------------------------------------------------------------
-# LlamaCloud parsing with retry
-# ---------------------------------------------------------------------------
-
-
-async def parse_with_llamacloud_retry(
-    file_path: str,
-    estimated_pages: int,
-    task_logger: TaskLoggingService | None = None,
-    log_entry: Log | None = None,
-):
-    """
-    Parse a file with LlamaCloud with retry logic for transient SSL/connection errors.
-
-    Uses dynamic timeout calculations based on file size and page count to handle
-    very large files reliably.
-
-    Returns:
-        LlamaParse result object
-
-    Raises:
-        Exception: If all retries fail
-    """
-    from llama_cloud_services import LlamaParse
-    from llama_cloud_services.parse.utils import ResultType
-
-    file_size_bytes = os.path.getsize(file_path)
-    file_size_mb = file_size_bytes / (1024 * 1024)
-
-    upload_timeout = calculate_upload_timeout(file_size_bytes)
-    job_timeout = calculate_job_timeout(estimated_pages, file_size_bytes)
-
-    custom_timeout = httpx.Timeout(
-        connect=120.0,
-        read=upload_timeout,
-        write=upload_timeout,
-        pool=120.0,
-    )
-
-    logging.info(
-        f"LlamaCloud upload configured: file_size={file_size_mb:.1f}MB, "
-        f"pages={estimated_pages}, upload_timeout={upload_timeout:.0f}s, "
-        f"job_timeout={job_timeout:.0f}s"
-    )
-
-    last_exception = None
-    attempt_errors: list[str] = []
-
-    for attempt in range(1, LLAMACLOUD_MAX_RETRIES + 1):
-        try:
-            async with httpx.AsyncClient(timeout=custom_timeout) as custom_client:
-                parser = LlamaParse(
-                    api_key=app_config.LLAMA_CLOUD_API_KEY,
-                    num_workers=1,
-                    verbose=True,
-                    language="en",
-                    result_type=ResultType.MD,
-                    max_timeout=int(max(2000, job_timeout + upload_timeout)),
-                    job_timeout_in_seconds=job_timeout,
-                    job_timeout_extra_time_per_page_in_seconds=PER_PAGE_JOB_TIMEOUT,
-                    custom_client=custom_client,
-                )
-                result = await parser.aparse(file_path)
-
-                if attempt > 1:
-                    logging.info(
-                        f"LlamaCloud upload succeeded on attempt {attempt} after "
-                        f"{len(attempt_errors)} failures"
-                    )
-                return result
-
-        except LLAMACLOUD_RETRYABLE_EXCEPTIONS as e:
-            last_exception = e
-            error_type = type(e).__name__
-            error_msg = str(e)[:200]
-            attempt_errors.append(f"Attempt {attempt}: {error_type} - {error_msg}")
-
-            if attempt < LLAMACLOUD_MAX_RETRIES:
-                base_delay = min(
-                    LLAMACLOUD_BASE_DELAY * (2 ** (attempt - 1)),
-                    LLAMACLOUD_MAX_DELAY,
-                )
-                jitter = base_delay * 0.25 * (2 * random.random() - 1)
-                delay = base_delay + jitter
-
-                if task_logger and log_entry:
-                    await task_logger.log_task_progress(
-                        log_entry,
-                        f"LlamaCloud upload failed "
-                        f"(attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}), "
-                        f"retrying in {delay:.0f}s",
-                        {
-                            "error_type": error_type,
-                            "error_message": error_msg,
-                            "attempt": attempt,
-                            "retry_delay": delay,
-                            "file_size_mb": round(file_size_mb, 1),
-                            "upload_timeout": upload_timeout,
-                        },
-                    )
-                else:
-                    logging.warning(
-                        f"LlamaCloud upload failed "
-                        f"(attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}): "
-                        f"{error_type}. File: {file_size_mb:.1f}MB. "
-                        f"Retrying in {delay:.0f}s..."
-                    )
-
-                await asyncio.sleep(delay)
-            else:
-                logging.error(
-                    f"LlamaCloud upload failed after {LLAMACLOUD_MAX_RETRIES} "
-                    f"attempts. File size: {file_size_mb:.1f}MB, "
-                    f"Pages: {estimated_pages}. "
-                    f"Errors: {'; '.join(attempt_errors)}"
-                )
-
-        except Exception:
-            raise
-
-    raise last_exception or RuntimeError(
-        f"LlamaCloud parsing failed after {LLAMACLOUD_MAX_RETRIES} retries. "
-        f"File size: {file_size_mb:.1f}MB"
-    )
-
-
-# ---------------------------------------------------------------------------
-# Per-service parse functions
-# ---------------------------------------------------------------------------
-
-
-async def parse_with_unstructured(file_path: str):
-    """
-    Parse a file using the Unstructured ETL service.
-
-    Returns:
-        List of LangChain Document elements.
-    """
-    from langchain_unstructured import UnstructuredLoader
-
-    loader = UnstructuredLoader(
-        file_path,
-        mode="elements",
-        post_processors=[],
-        languages=["eng"],
-        include_orig_elements=False,
-        include_metadata=False,
-        strategy="auto",
-    )
-    return await loader.aload()
-
-
-async def parse_with_docling(file_path: str, filename: str) -> str:
-    """
-    Parse a file using the Docling ETL service (via the Docling service wrapper).
-
-    Returns:
-        Markdown content string.
-    """
-    from app.services.docling_service import create_docling_service
-
-    docling_service = create_docling_service()
-
-    pdfminer_logger = getLogger("pdfminer")
-    original_level = pdfminer_logger.level
-
-    with warnings.catch_warnings():
-        warnings.filterwarnings("ignore", category=UserWarning, module="pdfminer")
-        warnings.filterwarnings(
-            "ignore", message=".*Cannot set gray non-stroke color.*"
-        )
-        warnings.filterwarnings("ignore", message=".*invalid float value.*")
-        pdfminer_logger.setLevel(ERROR)
-
-        try:
-            result = await docling_service.process_document(file_path, filename)
-        finally:
-            pdfminer_logger.setLevel(original_level)
-
-    return result["content"]
diff --git a/surfsense_backend/app/tasks/document_processors/_helpers.py b/surfsense_backend/app/tasks/document_processors/_helpers.py
index 7ac05932c..9cd7b87c9 100644
--- a/surfsense_backend/app/tasks/document_processors/_helpers.py
+++ b/surfsense_backend/app/tasks/document_processors/_helpers.py
@@ -11,13 +11,6 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from app.db import Document, DocumentStatus, DocumentType
 from app.utils.document_converters import generate_unique_identifier_hash
 
-from ._constants import (
-    BASE_JOB_TIMEOUT,
-    MAX_UPLOAD_TIMEOUT,
-    MIN_UPLOAD_TIMEOUT,
-    PER_PAGE_JOB_TIMEOUT,
-    UPLOAD_BYTES_PER_SECOND_SLOW,
-)
 from .base import (
     check_document_by_unique_identifier,
     check_duplicate_document,
@@ -198,21 +191,3 @@ async def update_document_from_connector(
     if "connector_id" in connector:
         document.connector_id = connector["connector_id"]
     await session.commit()
-
-
-# ---------------------------------------------------------------------------
-# Timeout calculations
-# ---------------------------------------------------------------------------
-
-
-def calculate_upload_timeout(file_size_bytes: int) -> float:
-    """Calculate upload timeout based on file size (conservative for slow connections)."""
-    estimated_time = (file_size_bytes / UPLOAD_BYTES_PER_SECOND_SLOW) * 1.5
-    return max(MIN_UPLOAD_TIMEOUT, min(estimated_time, MAX_UPLOAD_TIMEOUT))
-
-
-def calculate_job_timeout(estimated_pages: int, file_size_bytes: int) -> float:
-    """Calculate job processing timeout based on page count and file size."""
-    page_based_timeout = BASE_JOB_TIMEOUT + (estimated_pages * PER_PAGE_JOB_TIMEOUT)
-    size_based_timeout = BASE_JOB_TIMEOUT + (file_size_bytes / (10 * 1024 * 1024)) * 60
-    return max(page_based_timeout, size_based_timeout)
diff --git a/surfsense_backend/app/tasks/document_processors/_save.py b/surfsense_backend/app/tasks/document_processors/_save.py
index 5088ad004..ae45f7a69 100644
--- a/surfsense_backend/app/tasks/document_processors/_save.py
+++ b/surfsense_backend/app/tasks/document_processors/_save.py
@@ -1,14 +1,9 @@
 """
 Unified document save/update logic for file processors.
-
-Replaces the three nearly-identical ``add_received_file_document_using_*``
-functions with a single ``save_file_document`` function plus thin wrappers
-for backward compatibility.
 """
 
 import logging
 
-from langchain_core.documents import Document as LangChainDocument
 from sqlalchemy.exc import SQLAlchemyError
 from sqlalchemy.ext.asyncio import AsyncSession
 
@@ -207,79 +202,3 @@ async def save_file_document(
         raise RuntimeError(
             f"Failed to process file document using {etl_service}: {e!s}"
         ) from e
-
-
-# ---------------------------------------------------------------------------
-# Backward-compatible wrapper functions
-# ---------------------------------------------------------------------------
-
-
-async def add_received_file_document_using_unstructured(
-    session: AsyncSession,
-    file_name: str,
-    unstructured_processed_elements: list[LangChainDocument],
-    search_space_id: int,
-    user_id: str,
-    connector: dict | None = None,
-    enable_summary: bool = True,
-) -> Document | None:
-    """Process and store a file document using the Unstructured service."""
-    from app.utils.document_converters import convert_document_to_markdown
-
-    markdown_content = await convert_document_to_markdown(
-        unstructured_processed_elements
-    )
-    return await save_file_document(
-        session,
-        file_name,
-        markdown_content,
-        search_space_id,
-        user_id,
-        "UNSTRUCTURED",
-        connector,
-        enable_summary,
-    )
-
-
-async def add_received_file_document_using_llamacloud(
-    session: AsyncSession,
-    file_name: str,
-    llamacloud_markdown_document: str,
-    search_space_id: int,
-    user_id: str,
-    connector: dict | None = None,
-    enable_summary: bool = True,
-) -> Document | None:
-    """Process and store document content parsed by LlamaCloud."""
-    return await save_file_document(
-        session,
-        file_name,
-        llamacloud_markdown_document,
-        search_space_id,
-        user_id,
-        "LLAMACLOUD",
-        connector,
-        enable_summary,
-    )
-
-
-async def add_received_file_document_using_docling(
-    session: AsyncSession,
-    file_name: str,
-    docling_markdown_document: str,
-    search_space_id: int,
-    user_id: str,
-    connector: dict | None = None,
-    enable_summary: bool = True,
-) -> Document | None:
-    """Process and store document content parsed by Docling."""
-    return await save_file_document(
-        session,
-        file_name,
-        docling_markdown_document,
-        search_space_id,
-        user_id,
-        "DOCLING",
-        connector,
-        enable_summary,
-    )

From 8224360afa532300ffcd3afb7f4ea2627b253e99 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Sun, 5 Apr 2026 17:30:29 +0530
Subject: [PATCH 07/37] refactor: unify file parsing logic across Dropbox,
 Google Drive, and OneDrive using the ETL pipeline

---
 .../connectors/dropbox/content_extractor.py   |   8 +-
 .../google_drive/content_extractor.py         | 102 ++--------------
 .../connectors/onedrive/content_extractor.py  | 110 ++----------------
 3 files changed, 21 insertions(+), 199 deletions(-)

diff --git a/surfsense_backend/app/connectors/dropbox/content_extractor.py b/surfsense_backend/app/connectors/dropbox/content_extractor.py
index e89893b14..8e947eee7 100644
--- a/surfsense_backend/app/connectors/dropbox/content_extractor.py
+++ b/surfsense_backend/app/connectors/dropbox/content_extractor.py
@@ -87,9 +87,13 @@ async def download_and_extract_content(
         if error:
             return None, metadata, error
 
-        from app.connectors.onedrive.content_extractor import _parse_file_to_markdown
+        from app.etl_pipeline.etl_document import EtlRequest
+        from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
 
-        markdown = await _parse_file_to_markdown(temp_file_path, file_name)
+        result = await EtlPipelineService().extract(
+            EtlRequest(file_path=temp_file_path, filename=file_name)
+        )
+        markdown = result.markdown_content
         return markdown, metadata, None
 
     except Exception as e:
diff --git a/surfsense_backend/app/connectors/google_drive/content_extractor.py b/surfsense_backend/app/connectors/google_drive/content_extractor.py
index 1e94133b4..0c559fee9 100644
--- a/surfsense_backend/app/connectors/google_drive/content_extractor.py
+++ b/surfsense_backend/app/connectors/google_drive/content_extractor.py
@@ -1,12 +1,9 @@
 """Content extraction for Google Drive files."""
 
-import asyncio
 import contextlib
 import logging
 import os
 import tempfile
-import threading
-import time
 from pathlib import Path
 from typing import Any
 
@@ -110,99 +107,14 @@ async def download_and_extract_content(
 
 
 async def _parse_file_to_markdown(file_path: str, filename: str) -> str:
-    """Parse a local file to markdown using the configured ETL service."""
-    lower = filename.lower()
+    """Parse a local file to markdown using the unified ETL pipeline."""
+    from app.etl_pipeline.etl_document import EtlRequest
+    from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
 
-    if lower.endswith((".md", ".markdown", ".txt")):
-        with open(file_path, encoding="utf-8") as f:
-            return f.read()
-
-    if lower.endswith((".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm")):
-        from litellm import atranscription
-
-        from app.config import config as app_config
-
-        stt_service_type = (
-            "local"
-            if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/")
-            else "external"
-        )
-        if stt_service_type == "local":
-            from app.services.stt_service import stt_service
-
-            t0 = time.monotonic()
-            logger.info(
-                f"[local-stt] START file={filename} thread={threading.current_thread().name}"
-            )
-            result = await asyncio.to_thread(stt_service.transcribe_file, file_path)
-            logger.info(
-                f"[local-stt] END file={filename} elapsed={time.monotonic() - t0:.2f}s"
-            )
-            text = result.get("text", "")
-        else:
-            with open(file_path, "rb") as audio_file:
-                kwargs: dict[str, Any] = {
-                    "model": app_config.STT_SERVICE,
-                    "file": audio_file,
-                    "api_key": app_config.STT_SERVICE_API_KEY,
-                }
-                if app_config.STT_SERVICE_API_BASE:
-                    kwargs["api_base"] = app_config.STT_SERVICE_API_BASE
-                resp = await atranscription(**kwargs)
-                text = resp.get("text", "")
-
-        if not text:
-            raise ValueError("Transcription returned empty text")
-        return f"# Transcription of {filename}\n\n{text}"
-
-    # Document files -- use configured ETL service
-    from app.config import config as app_config
-
-    if app_config.ETL_SERVICE == "UNSTRUCTURED":
-        from langchain_unstructured import UnstructuredLoader
-
-        from app.utils.document_converters import convert_document_to_markdown
-
-        loader = UnstructuredLoader(
-            file_path,
-            mode="elements",
-            post_processors=[],
-            languages=["eng"],
-            include_orig_elements=False,
-            include_metadata=False,
-            strategy="auto",
-        )
-        docs = await loader.aload()
-        return await convert_document_to_markdown(docs)
-
-    if app_config.ETL_SERVICE == "LLAMACLOUD":
-        from app.tasks.document_processors.file_processors import (
-            parse_with_llamacloud_retry,
-        )
-
-        result = await parse_with_llamacloud_retry(
-            file_path=file_path, estimated_pages=50
-        )
-        markdown_documents = await result.aget_markdown_documents(split_by_page=False)
-        if not markdown_documents:
-            raise RuntimeError(f"LlamaCloud returned no documents for {filename}")
-        return markdown_documents[0].text
-
-    if app_config.ETL_SERVICE == "DOCLING":
-        from docling.document_converter import DocumentConverter
-
-        converter = DocumentConverter()
-        t0 = time.monotonic()
-        logger.info(
-            f"[docling] START file={filename} thread={threading.current_thread().name}"
-        )
-        result = await asyncio.to_thread(converter.convert, file_path)
-        logger.info(
-            f"[docling] END file={filename} elapsed={time.monotonic() - t0:.2f}s"
-        )
-        return result.document.export_to_markdown()
-
-    raise RuntimeError(f"Unknown ETL_SERVICE: {app_config.ETL_SERVICE}")
+    result = await EtlPipelineService().extract(
+        EtlRequest(file_path=file_path, filename=filename)
+    )
+    return result.markdown_content
 
 
 async def download_and_process_file(
diff --git a/surfsense_backend/app/connectors/onedrive/content_extractor.py b/surfsense_backend/app/connectors/onedrive/content_extractor.py
index 8917ba1fd..2355993eb 100644
--- a/surfsense_backend/app/connectors/onedrive/content_extractor.py
+++ b/surfsense_backend/app/connectors/onedrive/content_extractor.py
@@ -1,16 +1,9 @@
-"""Content extraction for OneDrive files.
+"""Content extraction for OneDrive files."""
 
-Reuses the same ETL parsing logic as Google Drive since file parsing is
-extension-based, not provider-specific.
-"""
-
-import asyncio
 import contextlib
 import logging
 import os
 import tempfile
-import threading
-import time
 from pathlib import Path
 from typing import Any
 
@@ -84,98 +77,11 @@ async def download_and_extract_content(
 
 
 async def _parse_file_to_markdown(file_path: str, filename: str) -> str:
-    """Parse a local file to markdown using the configured ETL service.
+    """Parse a local file to markdown using the unified ETL pipeline."""
+    from app.etl_pipeline.etl_document import EtlRequest
+    from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
 
-    Same logic as Google Drive -- file parsing is extension-based.
-    """
-    lower = filename.lower()
-
-    if lower.endswith((".md", ".markdown", ".txt")):
-        with open(file_path, encoding="utf-8") as f:
-            return f.read()
-
-    if lower.endswith((".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm")):
-        from litellm import atranscription
-
-        from app.config import config as app_config
-
-        stt_service_type = (
-            "local"
-            if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/")
-            else "external"
-        )
-        if stt_service_type == "local":
-            from app.services.stt_service import stt_service
-
-            t0 = time.monotonic()
-            logger.info(
-                f"[local-stt] START file={filename} thread={threading.current_thread().name}"
-            )
-            result = await asyncio.to_thread(stt_service.transcribe_file, file_path)
-            logger.info(
-                f"[local-stt] END file={filename} elapsed={time.monotonic() - t0:.2f}s"
-            )
-            text = result.get("text", "")
-        else:
-            with open(file_path, "rb") as audio_file:
-                kwargs: dict[str, Any] = {
-                    "model": app_config.STT_SERVICE,
-                    "file": audio_file,
-                    "api_key": app_config.STT_SERVICE_API_KEY,
-                }
-                if app_config.STT_SERVICE_API_BASE:
-                    kwargs["api_base"] = app_config.STT_SERVICE_API_BASE
-                resp = await atranscription(**kwargs)
-                text = resp.get("text", "")
-
-        if not text:
-            raise ValueError("Transcription returned empty text")
-        return f"# Transcription of {filename}\n\n{text}"
-
-    from app.config import config as app_config
-
-    if app_config.ETL_SERVICE == "UNSTRUCTURED":
-        from langchain_unstructured import UnstructuredLoader
-
-        from app.utils.document_converters import convert_document_to_markdown
-
-        loader = UnstructuredLoader(
-            file_path,
-            mode="elements",
-            post_processors=[],
-            languages=["eng"],
-            include_orig_elements=False,
-            include_metadata=False,
-            strategy="auto",
-        )
-        docs = await loader.aload()
-        return await convert_document_to_markdown(docs)
-
-    if app_config.ETL_SERVICE == "LLAMACLOUD":
-        from app.tasks.document_processors.file_processors import (
-            parse_with_llamacloud_retry,
-        )
-
-        result = await parse_with_llamacloud_retry(
-            file_path=file_path, estimated_pages=50
-        )
-        markdown_documents = await result.aget_markdown_documents(split_by_page=False)
-        if not markdown_documents:
-            raise RuntimeError(f"LlamaCloud returned no documents for {filename}")
-        return markdown_documents[0].text
-
-    if app_config.ETL_SERVICE == "DOCLING":
-        from docling.document_converter import DocumentConverter
-
-        converter = DocumentConverter()
-        t0 = time.monotonic()
-        logger.info(
-            f"[docling] START file={filename} thread={threading.current_thread().name}"
-        )
-        result = await asyncio.to_thread(converter.convert, file_path)
-        logger.info(
-            f"[docling] END file={filename} elapsed={time.monotonic() - t0:.2f}s"
-        )
-        return result.document.export_to_markdown()
-
-    raise RuntimeError(f"Unknown ETL_SERVICE: {app_config.ETL_SERVICE}")
+    result = await EtlPipelineService().extract(
+        EtlRequest(file_path=file_path, filename=filename)
+    )
+    return result.markdown_content

From 87af012a60eee451e0af2311e3aa1547e6a6616e Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Sun, 5 Apr 2026 17:45:18 +0530
Subject: [PATCH 08/37] refactor: streamline file processing by integrating ETL
 pipeline for all file types and removing redundant functions

---
 .../local_folder_indexer.py                   | 178 +---
 .../document_processors/file_processors.py    | 785 +++---------------
 2 files changed, 123 insertions(+), 840 deletions(-)

diff --git a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
index acfbce0bf..749dbf731 100644
--- a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
@@ -44,132 +44,6 @@ from .base import (
     logger,
 )
 
-PLAINTEXT_EXTENSIONS = frozenset(
-    {
-        ".md",
-        ".markdown",
-        ".txt",
-        ".text",
-        ".json",
-        ".jsonl",
-        ".yaml",
-        ".yml",
-        ".toml",
-        ".ini",
-        ".cfg",
-        ".conf",
-        ".xml",
-        ".css",
-        ".scss",
-        ".less",
-        ".sass",
-        ".py",
-        ".pyw",
-        ".pyi",
-        ".pyx",
-        ".js",
-        ".jsx",
-        ".ts",
-        ".tsx",
-        ".mjs",
-        ".cjs",
-        ".java",
-        ".kt",
-        ".kts",
-        ".scala",
-        ".groovy",
-        ".c",
-        ".h",
-        ".cpp",
-        ".cxx",
-        ".cc",
-        ".hpp",
-        ".hxx",
-        ".cs",
-        ".fs",
-        ".fsx",
-        ".go",
-        ".rs",
-        ".rb",
-        ".php",
-        ".pl",
-        ".pm",
-        ".lua",
-        ".swift",
-        ".m",
-        ".mm",
-        ".r",
-        ".R",
-        ".jl",
-        ".sh",
-        ".bash",
-        ".zsh",
-        ".fish",
-        ".bat",
-        ".cmd",
-        ".ps1",
-        ".sql",
-        ".graphql",
-        ".gql",
-        ".env",
-        ".gitignore",
-        ".dockerignore",
-        ".editorconfig",
-        ".makefile",
-        ".cmake",
-        ".log",
-        ".rst",
-        ".tex",
-        ".bib",
-        ".org",
-        ".adoc",
-        ".asciidoc",
-        ".vue",
-        ".svelte",
-        ".astro",
-        ".tf",
-        ".hcl",
-        ".proto",
-    }
-)
-
-AUDIO_EXTENSIONS = frozenset(
-    {
-        ".mp3",
-        ".mp4",
-        ".mpeg",
-        ".mpga",
-        ".m4a",
-        ".wav",
-        ".webm",
-    }
-)
-
-
-DIRECT_CONVERT_EXTENSIONS = frozenset({".csv", ".tsv", ".html", ".htm"})
-
-
-def _is_plaintext_file(filename: str) -> bool:
-    return Path(filename).suffix.lower() in PLAINTEXT_EXTENSIONS
-
-
-def _is_audio_file(filename: str) -> bool:
-    return Path(filename).suffix.lower() in AUDIO_EXTENSIONS
-
-
-def _is_direct_convert_file(filename: str) -> bool:
-    return Path(filename).suffix.lower() in DIRECT_CONVERT_EXTENSIONS
-
-
-def _needs_etl(filename: str) -> bool:
-    """File is not plaintext, not audio, and not direct-convert — requires ETL."""
-    return (
-        not _is_plaintext_file(filename)
-        and not _is_audio_file(filename)
-        and not _is_direct_convert_file(filename)
-    )
-
-
 HeartbeatCallbackType = Callable[[int], Awaitable[None]]
 
 
@@ -278,57 +152,21 @@ def scan_folder(
     return files
 
 
-def _read_plaintext_file(file_path: str) -> str:
-    """Read a plaintext/text-based file as UTF-8."""
-    with open(file_path, encoding="utf-8", errors="replace") as f:
-        content = f.read()
-    if "\x00" in content:
-        raise ValueError(
-            f"File contains null bytes — likely a binary file opened as text: {file_path}"
-        )
-    return content
 
 
 async def _read_file_content(file_path: str, filename: str) -> str:
-    """Read file content, using ETL for binary formats.
+    """Read file content via the unified ETL pipeline.
 
-    Plaintext files are read directly. Audio and document files (PDF, DOCX, etc.)
-    are routed through the configured ETL service (same as Google Drive / OneDrive).
-
-    Raises ValueError if the file cannot be parsed (e.g. no ETL service configured
-    for a binary file).
+    All file types (plaintext, audio, direct-convert, document) are handled
+    by ``EtlPipelineService``.
     """
-    if _is_plaintext_file(filename):
-        return _read_plaintext_file(file_path)
+    from app.etl_pipeline.etl_document import EtlRequest
+    from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
 
-    if _is_direct_convert_file(filename):
-        from app.tasks.document_processors._direct_converters import (
-            convert_file_directly,
-        )
-
-        return convert_file_directly(file_path, filename)
-
-    if _is_audio_file(filename):
-        etl_service = config.ETL_SERVICE if hasattr(config, "ETL_SERVICE") else None
-        stt_service_val = config.STT_SERVICE if hasattr(config, "STT_SERVICE") else None
-        if not stt_service_val and not etl_service:
-            raise ValueError(
-                f"No STT_SERVICE configured — cannot transcribe audio file: {filename}"
-            )
-
-    if _needs_etl(filename):
-        etl_service = getattr(config, "ETL_SERVICE", None)
-        if not etl_service:
-            raise ValueError(
-                f"No ETL_SERVICE configured — cannot parse binary file: {filename}. "
-                f"Set ETL_SERVICE to UNSTRUCTURED, LLAMACLOUD, or DOCLING in your .env"
-            )
-
-    from app.connectors.onedrive.content_extractor import (
-        _parse_file_to_markdown,
+    result = await EtlPipelineService().extract(
+        EtlRequest(file_path=file_path, filename=filename)
     )
-
-    return await _parse_file_to_markdown(file_path, filename)
+    return result.markdown_content
 
 
 def _content_hash(content: str, search_space_id: int) -> str:
diff --git a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py
index 0c1cad52d..f54a963ad 100644
--- a/surfsense_backend/app/tasks/document_processors/file_processors.py
+++ b/surfsense_backend/app/tasks/document_processors/file_processors.py
@@ -1,14 +1,8 @@
 """
 File document processors orchestrating content extraction and indexing.
 
-This module is the public entry point for file processing.  It delegates to
-specialised sub-modules that each own a single concern:
-
-- ``_constants``          — file type classification and configuration constants
-- ``_helpers``            — document deduplication, migration, connector helpers
-- ``_direct_converters``  — lossless file-to-markdown for csv/tsv/html
-- ``_etl``               — ETL parsing strategies (Unstructured, LlamaCloud, Docling)
-- ``_save``              — unified document creation / update logic
+Delegates content extraction to ``app.etl_pipeline.EtlPipelineService`` and
+keeps only orchestration concerns (notifications, logging, page limits, saving).
 """
 
 from __future__ import annotations
@@ -17,38 +11,19 @@ import contextlib
 import logging
 import os
 from dataclasses import dataclass, field
-from logging import ERROR, getLogger
 
 from fastapi import HTTPException
 from sqlalchemy.ext.asyncio import AsyncSession
 
-from app.config import config as app_config
 from app.db import Document, Log, Notification
 from app.services.notification_service import NotificationService
 from app.services.task_logging_service import TaskLoggingService
 
-from ._constants import FileCategory, classify_file
-from ._direct_converters import convert_file_directly
-from ._etl import (
-    parse_with_docling,
-    parse_with_llamacloud_retry,
-    parse_with_unstructured,
-)
 from ._helpers import update_document_from_connector
-from ._save import (
-    add_received_file_document_using_docling,
-    add_received_file_document_using_llamacloud,
-    add_received_file_document_using_unstructured,
-    save_file_document,
-)
+from ._save import save_file_document
 from .markdown_processor import add_received_markdown_file_document
 
-# Re-export public API so existing ``from file_processors import …`` keeps working.
 __all__ = [
-    "add_received_file_document_using_docling",
-    "add_received_file_document_using_llamacloud",
-    "add_received_file_document_using_unstructured",
-    "parse_with_llamacloud_retry",
     "process_file_in_background",
     "process_file_in_background_with_document",
     "save_file_document",
@@ -142,35 +117,31 @@ async def _log_page_divergence(
 # ===================================================================
 
 
-async def _process_markdown_upload(ctx: _ProcessingContext) -> Document | None:
-    """Read a markdown / text file and create or update a document."""
-    await _notify(ctx, "parsing", "Reading file")
+async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | None:
+    """Extract content from a non-document file (plaintext/direct_convert/audio) via the unified ETL pipeline."""
+    from app.etl_pipeline.etl_document import EtlRequest
+    from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
+
+    await _notify(ctx, "parsing", "Processing file")
     await ctx.task_logger.log_task_progress(
         ctx.log_entry,
-        f"Processing markdown/text file: {ctx.filename}",
-        {"file_type": "markdown", "processing_stage": "reading_file"},
+        f"Processing file: {ctx.filename}",
+        {"processing_stage": "extracting"},
     )
 
-    with open(ctx.file_path, encoding="utf-8") as f:
-        markdown_content = f.read()
+    etl_result = await EtlPipelineService().extract(
+        EtlRequest(file_path=ctx.file_path, filename=ctx.filename)
+    )
 
     with contextlib.suppress(Exception):
         os.unlink(ctx.file_path)
 
     await _notify(ctx, "chunking")
-    await ctx.task_logger.log_task_progress(
-        ctx.log_entry,
-        f"Creating document from markdown content: {ctx.filename}",
-        {
-            "processing_stage": "creating_document",
-            "content_length": len(markdown_content),
-        },
-    )
 
     result = await add_received_markdown_file_document(
         ctx.session,
         ctx.filename,
-        markdown_content,
+        etl_result.markdown_content,
         ctx.search_space_id,
         ctx.user_id,
         ctx.connector,
@@ -181,179 +152,19 @@ async def _process_markdown_upload(ctx: _ProcessingContext) -> Document | None:
     if result:
         await ctx.task_logger.log_task_success(
             ctx.log_entry,
-            f"Successfully processed markdown file: {ctx.filename}",
+            f"Successfully processed file: {ctx.filename}",
             {
                 "document_id": result.id,
                 "content_hash": result.content_hash,
-                "file_type": "markdown",
+                "file_type": etl_result.content_type,
+                "etl_service": etl_result.etl_service,
             },
         )
     else:
         await ctx.task_logger.log_task_success(
             ctx.log_entry,
-            f"Markdown file already exists (duplicate): {ctx.filename}",
-            {"duplicate_detected": True, "file_type": "markdown"},
-        )
-    return result
-
-
-async def _process_direct_convert_upload(ctx: _ProcessingContext) -> Document | None:
-    """Convert a text-based file (csv/tsv/html) to markdown without ETL."""
-    await _notify(ctx, "parsing", "Converting file")
-    await ctx.task_logger.log_task_progress(
-        ctx.log_entry,
-        f"Direct-converting file to markdown: {ctx.filename}",
-        {"file_type": "direct_convert", "processing_stage": "converting"},
-    )
-
-    markdown_content = convert_file_directly(ctx.file_path, ctx.filename)
-
-    with contextlib.suppress(Exception):
-        os.unlink(ctx.file_path)
-
-    await _notify(ctx, "chunking")
-    await ctx.task_logger.log_task_progress(
-        ctx.log_entry,
-        f"Creating document from converted content: {ctx.filename}",
-        {
-            "processing_stage": "creating_document",
-            "content_length": len(markdown_content),
-        },
-    )
-
-    result = await add_received_markdown_file_document(
-        ctx.session,
-        ctx.filename,
-        markdown_content,
-        ctx.search_space_id,
-        ctx.user_id,
-        ctx.connector,
-    )
-    if ctx.connector:
-        await update_document_from_connector(result, ctx.connector, ctx.session)
-
-    if result:
-        await ctx.task_logger.log_task_success(
-            ctx.log_entry,
-            f"Successfully direct-converted file: {ctx.filename}",
-            {
-                "document_id": result.id,
-                "content_hash": result.content_hash,
-                "file_type": "direct_convert",
-            },
-        )
-    else:
-        await ctx.task_logger.log_task_success(
-            ctx.log_entry,
-            f"Direct-converted file already exists (duplicate): {ctx.filename}",
-            {"duplicate_detected": True, "file_type": "direct_convert"},
-        )
-    return result
-
-
-async def _process_audio_upload(ctx: _ProcessingContext) -> Document | None:
-    """Transcribe an audio file and create or update a document."""
-    await _notify(ctx, "parsing", "Transcribing audio")
-    await ctx.task_logger.log_task_progress(
-        ctx.log_entry,
-        f"Processing audio file for transcription: {ctx.filename}",
-        {"file_type": "audio", "processing_stage": "starting_transcription"},
-    )
-
-    stt_service_type = (
-        "local"
-        if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/")
-        else "external"
-    )
-
-    if stt_service_type == "local":
-        from app.services.stt_service import stt_service
-
-        try:
-            stt_result = stt_service.transcribe_file(ctx.file_path)
-            transcribed_text = stt_result.get("text", "")
-            if not transcribed_text:
-                raise ValueError("Transcription returned empty text")
-            transcribed_text = (
-                f"# Transcription of {ctx.filename}\n\n{transcribed_text}"
-            )
-        except Exception as e:
-            raise HTTPException(
-                status_code=422,
-                detail=f"Failed to transcribe audio file {ctx.filename}: {e!s}",
-            ) from e
-
-        await ctx.task_logger.log_task_progress(
-            ctx.log_entry,
-            f"Local STT transcription completed: {ctx.filename}",
-            {
-                "processing_stage": "local_transcription_complete",
-                "language": stt_result.get("language"),
-                "confidence": stt_result.get("language_probability"),
-                "duration": stt_result.get("duration"),
-            },
-        )
-    else:
-        from litellm import atranscription
-
-        with open(ctx.file_path, "rb") as audio_file:
-            transcription_kwargs: dict = {
-                "model": app_config.STT_SERVICE,
-                "file": audio_file,
-                "api_key": app_config.STT_SERVICE_API_KEY,
-            }
-            if app_config.STT_SERVICE_API_BASE:
-                transcription_kwargs["api_base"] = app_config.STT_SERVICE_API_BASE
-
-            transcription_response = await atranscription(**transcription_kwargs)
-            transcribed_text = transcription_response.get("text", "")
-            if not transcribed_text:
-                raise ValueError("Transcription returned empty text")
-
-        transcribed_text = f"# Transcription of {ctx.filename}\n\n{transcribed_text}"
-
-    await ctx.task_logger.log_task_progress(
-        ctx.log_entry,
-        f"Transcription completed, creating document: {ctx.filename}",
-        {
-            "processing_stage": "transcription_complete",
-            "transcript_length": len(transcribed_text),
-        },
-    )
-
-    await _notify(ctx, "chunking")
-
-    with contextlib.suppress(Exception):
-        os.unlink(ctx.file_path)
-
-    result = await add_received_markdown_file_document(
-        ctx.session,
-        ctx.filename,
-        transcribed_text,
-        ctx.search_space_id,
-        ctx.user_id,
-        ctx.connector,
-    )
-    if ctx.connector:
-        await update_document_from_connector(result, ctx.connector, ctx.session)
-
-    if result:
-        await ctx.task_logger.log_task_success(
-            ctx.log_entry,
-            f"Successfully transcribed and processed audio file: {ctx.filename}",
-            {
-                "document_id": result.id,
-                "content_hash": result.content_hash,
-                "file_type": "audio",
-                "transcript_length": len(transcribed_text),
-                "stt_service": stt_service_type,
-            },
-        )
-    else:
-        await ctx.task_logger.log_task_success(
-            ctx.log_entry,
-            f"Audio file transcript already exists (duplicate): {ctx.filename}",
-            {"duplicate_detected": True, "file_type": "audio"},
+            f"File already exists (duplicate): {ctx.filename}",
+            {"duplicate_detected": True, "file_type": etl_result.content_type},
         )
     return result
 
@@ -363,279 +174,10 @@ async def _process_audio_upload(ctx: _ProcessingContext) -> Document | None:
 # ---------------------------------------------------------------------------
 
 
-async def _etl_unstructured(
-    ctx: _ProcessingContext,
-    page_limit_service,
-    estimated_pages: int,
-) -> Document | None:
-    """Parse and save via the Unstructured ETL service."""
-    await _notify(ctx, "parsing", "Extracting content")
-    await ctx.task_logger.log_task_progress(
-        ctx.log_entry,
-        f"Processing file with Unstructured ETL: {ctx.filename}",
-        {
-            "file_type": "document",
-            "etl_service": "UNSTRUCTURED",
-            "processing_stage": "loading",
-        },
-    )
-
-    docs = await parse_with_unstructured(ctx.file_path)
-
-    await _notify(ctx, "chunking", chunks_count=len(docs))
-    await ctx.task_logger.log_task_progress(
-        ctx.log_entry,
-        f"Unstructured ETL completed, creating document: {ctx.filename}",
-        {"processing_stage": "etl_complete", "elements_count": len(docs)},
-    )
-
-    actual_pages = page_limit_service.estimate_pages_from_elements(docs)
-    final_pages = max(estimated_pages, actual_pages)
-    await _log_page_divergence(
-        ctx.task_logger,
-        ctx.log_entry,
-        ctx.filename,
-        estimated_pages,
-        actual_pages,
-        final_pages,
-    )
-
-    with contextlib.suppress(Exception):
-        os.unlink(ctx.file_path)
-
-    result = await add_received_file_document_using_unstructured(
-        ctx.session,
-        ctx.filename,
-        docs,
-        ctx.search_space_id,
-        ctx.user_id,
-        ctx.connector,
-        enable_summary=ctx.enable_summary,
-    )
-    if ctx.connector:
-        await update_document_from_connector(result, ctx.connector, ctx.session)
-
-    if result:
-        await page_limit_service.update_page_usage(
-            ctx.user_id, final_pages, allow_exceed=True
-        )
-        await ctx.task_logger.log_task_success(
-            ctx.log_entry,
-            f"Successfully processed file with Unstructured: {ctx.filename}",
-            {
-                "document_id": result.id,
-                "content_hash": result.content_hash,
-                "file_type": "document",
-                "etl_service": "UNSTRUCTURED",
-                "pages_processed": final_pages,
-            },
-        )
-    else:
-        await ctx.task_logger.log_task_success(
-            ctx.log_entry,
-            f"Document already exists (duplicate): {ctx.filename}",
-            {
-                "duplicate_detected": True,
-                "file_type": "document",
-                "etl_service": "UNSTRUCTURED",
-            },
-        )
-    return result
-
-
-async def _etl_llamacloud(
-    ctx: _ProcessingContext,
-    page_limit_service,
-    estimated_pages: int,
-) -> Document | None:
-    """Parse and save via the LlamaCloud ETL service."""
-    await _notify(ctx, "parsing", "Extracting content")
-    await ctx.task_logger.log_task_progress(
-        ctx.log_entry,
-        f"Processing file with LlamaCloud ETL: {ctx.filename}",
-        {
-            "file_type": "document",
-            "etl_service": "LLAMACLOUD",
-            "processing_stage": "parsing",
-            "estimated_pages": estimated_pages,
-        },
-    )
-
-    raw_result = await parse_with_llamacloud_retry(
-        file_path=ctx.file_path,
-        estimated_pages=estimated_pages,
-        task_logger=ctx.task_logger,
-        log_entry=ctx.log_entry,
-    )
-
-    with contextlib.suppress(Exception):
-        os.unlink(ctx.file_path)
-
-    markdown_documents = await raw_result.aget_markdown_documents(split_by_page=False)
-
-    await _notify(ctx, "chunking", chunks_count=len(markdown_documents))
-    await ctx.task_logger.log_task_progress(
-        ctx.log_entry,
-        f"LlamaCloud parsing completed, creating documents: {ctx.filename}",
-        {
-            "processing_stage": "parsing_complete",
-            "documents_count": len(markdown_documents),
-        },
-    )
-
-    if not markdown_documents:
-        await ctx.task_logger.log_task_failure(
-            ctx.log_entry,
-            f"LlamaCloud parsing returned no documents: {ctx.filename}",
-            "ETL service returned empty document list",
-            {"error_type": "EmptyDocumentList", "etl_service": "LLAMACLOUD"},
-        )
-        raise ValueError(f"LlamaCloud parsing returned no documents for {ctx.filename}")
-
-    actual_pages = page_limit_service.estimate_pages_from_markdown(markdown_documents)
-    final_pages = max(estimated_pages, actual_pages)
-    await _log_page_divergence(
-        ctx.task_logger,
-        ctx.log_entry,
-        ctx.filename,
-        estimated_pages,
-        actual_pages,
-        final_pages,
-    )
-
-    any_created = False
-    last_doc: Document | None = None
-
-    for doc in markdown_documents:
-        doc_result = await add_received_file_document_using_llamacloud(
-            ctx.session,
-            ctx.filename,
-            llamacloud_markdown_document=doc.text,
-            search_space_id=ctx.search_space_id,
-            user_id=ctx.user_id,
-            connector=ctx.connector,
-            enable_summary=ctx.enable_summary,
-        )
-        if doc_result:
-            any_created = True
-            last_doc = doc_result
-
-    if any_created:
-        await page_limit_service.update_page_usage(
-            ctx.user_id, final_pages, allow_exceed=True
-        )
-        if ctx.connector:
-            await update_document_from_connector(last_doc, ctx.connector, ctx.session)
-        await ctx.task_logger.log_task_success(
-            ctx.log_entry,
-            f"Successfully processed file with LlamaCloud: {ctx.filename}",
-            {
-                "document_id": last_doc.id,
-                "content_hash": last_doc.content_hash,
-                "file_type": "document",
-                "etl_service": "LLAMACLOUD",
-                "pages_processed": final_pages,
-                "documents_count": len(markdown_documents),
-            },
-        )
-        return last_doc
-
-    await ctx.task_logger.log_task_success(
-        ctx.log_entry,
-        f"Document already exists (duplicate): {ctx.filename}",
-        {
-            "duplicate_detected": True,
-            "file_type": "document",
-            "etl_service": "LLAMACLOUD",
-            "documents_count": len(markdown_documents),
-        },
-    )
-    return None
-
-
-async def _etl_docling(
-    ctx: _ProcessingContext,
-    page_limit_service,
-    estimated_pages: int,
-) -> Document | None:
-    """Parse and save via the Docling ETL service."""
-    await _notify(ctx, "parsing", "Extracting content")
-    await ctx.task_logger.log_task_progress(
-        ctx.log_entry,
-        f"Processing file with Docling ETL: {ctx.filename}",
-        {
-            "file_type": "document",
-            "etl_service": "DOCLING",
-            "processing_stage": "parsing",
-        },
-    )
-
-    content = await parse_with_docling(ctx.file_path, ctx.filename)
-
-    with contextlib.suppress(Exception):
-        os.unlink(ctx.file_path)
-
-    await ctx.task_logger.log_task_progress(
-        ctx.log_entry,
-        f"Docling parsing completed, creating document: {ctx.filename}",
-        {"processing_stage": "parsing_complete", "content_length": len(content)},
-    )
-
-    actual_pages = page_limit_service.estimate_pages_from_content_length(len(content))
-    final_pages = max(estimated_pages, actual_pages)
-    await _log_page_divergence(
-        ctx.task_logger,
-        ctx.log_entry,
-        ctx.filename,
-        estimated_pages,
-        actual_pages,
-        final_pages,
-    )
-
-    await _notify(ctx, "chunking")
-
-    result = await add_received_file_document_using_docling(
-        ctx.session,
-        ctx.filename,
-        docling_markdown_document=content,
-        search_space_id=ctx.search_space_id,
-        user_id=ctx.user_id,
-        connector=ctx.connector,
-        enable_summary=ctx.enable_summary,
-    )
-
-    if result:
-        await page_limit_service.update_page_usage(
-            ctx.user_id, final_pages, allow_exceed=True
-        )
-        if ctx.connector:
-            await update_document_from_connector(result, ctx.connector, ctx.session)
-        await ctx.task_logger.log_task_success(
-            ctx.log_entry,
-            f"Successfully processed file with Docling: {ctx.filename}",
-            {
-                "document_id": result.id,
-                "content_hash": result.content_hash,
-                "file_type": "document",
-                "etl_service": "DOCLING",
-                "pages_processed": final_pages,
-            },
-        )
-    else:
-        await ctx.task_logger.log_task_success(
-            ctx.log_entry,
-            f"Document already exists (duplicate): {ctx.filename}",
-            {
-                "duplicate_detected": True,
-                "file_type": "document",
-                "etl_service": "DOCLING",
-            },
-        )
-    return result
-
-
 async def _process_document_upload(ctx: _ProcessingContext) -> Document | None:
-    """Route a document file to the configured ETL service."""
+    """Route a document file to the configured ETL service via the unified pipeline."""
+    from app.etl_pipeline.etl_document import EtlRequest
+    from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
     from app.services.page_limit_service import PageLimitExceededError, PageLimitService
 
     page_limit_service = PageLimitService(ctx.session)
@@ -665,16 +207,60 @@ async def _process_document_upload(ctx: _ProcessingContext) -> Document | None:
             os.unlink(ctx.file_path)
         raise HTTPException(status_code=403, detail=str(e)) from e
 
-    etl_dispatch = {
-        "UNSTRUCTURED": _etl_unstructured,
-        "LLAMACLOUD": _etl_llamacloud,
-        "DOCLING": _etl_docling,
-    }
-    handler = etl_dispatch.get(app_config.ETL_SERVICE)
-    if handler is None:
-        raise RuntimeError(f"Unknown ETL_SERVICE: {app_config.ETL_SERVICE}")
+    await _notify(ctx, "parsing", "Extracting content")
 
-    return await handler(ctx, page_limit_service, estimated_pages)
+    etl_result = await EtlPipelineService().extract(
+        EtlRequest(
+            file_path=ctx.file_path,
+            filename=ctx.filename,
+            estimated_pages=estimated_pages,
+        )
+    )
+
+    with contextlib.suppress(Exception):
+        os.unlink(ctx.file_path)
+
+    await _notify(ctx, "chunking")
+
+    result = await save_file_document(
+        ctx.session,
+        ctx.filename,
+        etl_result.markdown_content,
+        ctx.search_space_id,
+        ctx.user_id,
+        etl_result.etl_service,
+        ctx.connector,
+        enable_summary=ctx.enable_summary,
+    )
+
+    if result:
+        await page_limit_service.update_page_usage(
+            ctx.user_id, estimated_pages, allow_exceed=True
+        )
+        if ctx.connector:
+            await update_document_from_connector(result, ctx.connector, ctx.session)
+        await ctx.task_logger.log_task_success(
+            ctx.log_entry,
+            f"Successfully processed file: {ctx.filename}",
+            {
+                "document_id": result.id,
+                "content_hash": result.content_hash,
+                "file_type": "document",
+                "etl_service": etl_result.etl_service,
+                "pages_processed": estimated_pages,
+            },
+        )
+    else:
+        await ctx.task_logger.log_task_success(
+            ctx.log_entry,
+            f"Document already exists (duplicate): {ctx.filename}",
+            {
+                "duplicate_detected": True,
+                "file_type": "document",
+                "etl_service": etl_result.etl_service,
+            },
+        )
+    return result
 
 
 # ===================================================================
@@ -706,15 +292,14 @@ async def process_file_in_background(
     )
 
     try:
-        category = classify_file(filename)
+        from app.etl_pipeline.file_classifier import FileCategory as EtlFileCategory
+        from app.etl_pipeline.file_classifier import classify_file as etl_classify
 
-        if category == FileCategory.MARKDOWN:
-            return await _process_markdown_upload(ctx)
-        if category == FileCategory.DIRECT_CONVERT:
-            return await _process_direct_convert_upload(ctx)
-        if category == FileCategory.AUDIO:
-            return await _process_audio_upload(ctx)
-        return await _process_document_upload(ctx)
+        category = etl_classify(filename)
+
+        if category == EtlFileCategory.DOCUMENT:
+            return await _process_document_upload(ctx)
+        return await _process_non_document_upload(ctx)
 
     except Exception as e:
         await session.rollback()
@@ -758,201 +343,61 @@ async def _extract_file_content(
     Returns:
         Tuple of (markdown_content, etl_service_name).
     """
-    category = classify_file(filename)
+    from app.etl_pipeline.etl_document import EtlRequest
+    from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
+    from app.etl_pipeline.file_classifier import FileCategory
+    from app.etl_pipeline.file_classifier import classify_file as etl_classify
 
-    if category == FileCategory.MARKDOWN:
-        if notification:
-            await NotificationService.document_processing.notify_processing_progress(
-                session,
-                notification,
-                stage="parsing",
-                stage_message="Reading file",
-            )
-        await task_logger.log_task_progress(
-            log_entry,
-            f"Processing markdown/text file: {filename}",
-            {"file_type": "markdown", "processing_stage": "reading_file"},
-        )
-        with open(file_path, encoding="utf-8") as f:
-            content = f.read()
-        with contextlib.suppress(Exception):
-            os.unlink(file_path)
-        return content, "MARKDOWN"
-
-    if category == FileCategory.DIRECT_CONVERT:
-        if notification:
-            await NotificationService.document_processing.notify_processing_progress(
-                session,
-                notification,
-                stage="parsing",
-                stage_message="Converting file",
-            )
-        await task_logger.log_task_progress(
-            log_entry,
-            f"Direct-converting file to markdown: {filename}",
-            {"file_type": "direct_convert", "processing_stage": "converting"},
-        )
-        content = convert_file_directly(file_path, filename)
-        with contextlib.suppress(Exception):
-            os.unlink(file_path)
-        return content, "DIRECT_CONVERT"
-
-    if category == FileCategory.AUDIO:
-        if notification:
-            await NotificationService.document_processing.notify_processing_progress(
-                session,
-                notification,
-                stage="parsing",
-                stage_message="Transcribing audio",
-            )
-        await task_logger.log_task_progress(
-            log_entry,
-            f"Processing audio file for transcription: {filename}",
-            {"file_type": "audio", "processing_stage": "starting_transcription"},
-        )
-        transcribed_text = await _transcribe_audio(file_path, filename)
-        with contextlib.suppress(Exception):
-            os.unlink(file_path)
-        return transcribed_text, "AUDIO_TRANSCRIPTION"
-
-    # Document file — use ETL service
-    return await _extract_document_content(
-        file_path,
-        filename,
-        session,
-        user_id,
-        task_logger,
-        log_entry,
-        notification,
-    )
-
-
-async def _transcribe_audio(file_path: str, filename: str) -> str:
-    """Transcribe an audio file and return formatted markdown text."""
-    stt_service_type = (
-        "local"
-        if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/")
-        else "external"
-    )
-
-    if stt_service_type == "local":
-        from app.services.stt_service import stt_service
-
-        result = stt_service.transcribe_file(file_path)
-        text = result.get("text", "")
-        if not text:
-            raise ValueError("Transcription returned empty text")
-    else:
-        from litellm import atranscription
-
-        with open(file_path, "rb") as audio_file:
-            kwargs: dict = {
-                "model": app_config.STT_SERVICE,
-                "file": audio_file,
-                "api_key": app_config.STT_SERVICE_API_KEY,
-            }
-            if app_config.STT_SERVICE_API_BASE:
-                kwargs["api_base"] = app_config.STT_SERVICE_API_BASE
-            response = await atranscription(**kwargs)
-            text = response.get("text", "")
-            if not text:
-                raise ValueError("Transcription returned empty text")
-
-    return f"# Transcription of {filename}\n\n{text}"
-
-
-async def _extract_document_content(
-    file_path: str,
-    filename: str,
-    session: AsyncSession,
-    user_id: str,
-    task_logger: TaskLoggingService,
-    log_entry: Log,
-    notification: Notification | None,
-) -> tuple[str, str]:
-    """
-    Parse a document file via the configured ETL service.
-
-    Returns:
-        Tuple of (markdown_content, etl_service_name).
-    """
-    from app.services.page_limit_service import PageLimitService
-
-    page_limit_service = PageLimitService(session)
-
-    try:
-        estimated_pages = page_limit_service.estimate_pages_before_processing(file_path)
-    except Exception:
-        file_size = os.path.getsize(file_path)
-        estimated_pages = max(1, file_size // (80 * 1024))
-
-    await page_limit_service.check_page_limit(user_id, estimated_pages)
-
-    etl_service = app_config.ETL_SERVICE
-    markdown_content: str | None = None
+    category = etl_classify(filename)
+    estimated_pages = 0
 
     if notification:
+        stage_messages = {
+            FileCategory.PLAINTEXT: "Reading file",
+            FileCategory.DIRECT_CONVERT: "Converting file",
+            FileCategory.AUDIO: "Transcribing audio",
+            FileCategory.DOCUMENT: "Extracting content",
+        }
         await NotificationService.document_processing.notify_processing_progress(
             session,
             notification,
             stage="parsing",
-            stage_message="Extracting content",
+            stage_message=stage_messages.get(category, "Processing"),
         )
 
-    if etl_service == "UNSTRUCTURED":
-        from app.utils.document_converters import convert_document_to_markdown
+    await task_logger.log_task_progress(
+        log_entry,
+        f"Processing {category.value} file: {filename}",
+        {"file_type": category.value, "processing_stage": "extracting"},
+    )
 
-        docs = await parse_with_unstructured(file_path)
-        markdown_content = await convert_document_to_markdown(docs)
-        actual_pages = page_limit_service.estimate_pages_from_elements(docs)
-        final_pages = max(estimated_pages, actual_pages)
-        await page_limit_service.update_page_usage(
-            user_id, final_pages, allow_exceed=True
-        )
+    if category == FileCategory.DOCUMENT:
+        from app.services.page_limit_service import PageLimitService
 
-    elif etl_service == "LLAMACLOUD":
-        raw_result = await parse_with_llamacloud_retry(
+        page_limit_service = PageLimitService(session)
+        estimated_pages = _estimate_pages_safe(page_limit_service, file_path)
+        await page_limit_service.check_page_limit(user_id, estimated_pages)
+
+    result = await EtlPipelineService().extract(
+        EtlRequest(
             file_path=file_path,
+            filename=filename,
             estimated_pages=estimated_pages,
-            task_logger=task_logger,
-            log_entry=log_entry,
         )
-        markdown_documents = await raw_result.aget_markdown_documents(
-            split_by_page=False
-        )
-        if not markdown_documents:
-            raise RuntimeError(f"LlamaCloud parsing returned no documents: {filename}")
-        markdown_content = markdown_documents[0].text
+    )
+
+    if category == FileCategory.DOCUMENT:
         await page_limit_service.update_page_usage(
             user_id, estimated_pages, allow_exceed=True
         )
 
-    elif etl_service == "DOCLING":
-        getLogger("docling.pipeline.base_pipeline").setLevel(ERROR)
-        getLogger("docling.document_converter").setLevel(ERROR)
-        getLogger("docling_core.transforms.chunker.hierarchical_chunker").setLevel(
-            ERROR
-        )
-
-        from docling.document_converter import DocumentConverter
-
-        converter = DocumentConverter()
-        result = converter.convert(file_path)
-        markdown_content = result.document.export_to_markdown()
-        await page_limit_service.update_page_usage(
-            user_id, estimated_pages, allow_exceed=True
-        )
-
-    else:
-        raise RuntimeError(f"Unknown ETL_SERVICE: {etl_service}")
-
     with contextlib.suppress(Exception):
         os.unlink(file_path)
 
-    if not markdown_content:
+    if not result.markdown_content:
         raise RuntimeError(f"Failed to extract content from file: {filename}")
 
-    return markdown_content, etl_service
+    return result.markdown_content, result.etl_service
 
 
 async def process_file_in_background_with_document(

From f8913adaa30eadd5407c8286c726bda783fe44a9 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Sun, 5 Apr 2026 17:46:04 +0530
Subject: [PATCH 09/37] test: add unit tests for content extraction from cloud
 connectors and ETL pipeline functionality

---
 .../test_content_extraction.py                | 244 ++++++++++++++
 .../tests/unit/etl_pipeline/conftest.py       |  29 ++
 .../etl_pipeline/test_etl_pipeline_service.py | 309 ++++++++++++++++++
 3 files changed, 582 insertions(+)
 create mode 100644 surfsense_backend/tests/unit/connector_indexers/test_content_extraction.py
 create mode 100644 surfsense_backend/tests/unit/etl_pipeline/conftest.py
 create mode 100644 surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py

diff --git a/surfsense_backend/tests/unit/connector_indexers/test_content_extraction.py b/surfsense_backend/tests/unit/connector_indexers/test_content_extraction.py
new file mode 100644
index 000000000..49f9a217a
--- /dev/null
+++ b/surfsense_backend/tests/unit/connector_indexers/test_content_extraction.py
@@ -0,0 +1,244 @@
+"""Tests that each cloud connector's download_and_extract_content correctly
+produces markdown from a real file via the unified ETL pipeline.
+
+Only the cloud client is mocked (system boundary).  The ETL pipeline runs for
+real so we know the full path from "cloud gives us bytes" to "we get markdown
+back" actually works.
+"""
+
+import os
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+
+pytestmark = pytest.mark.unit
+
+_TXT_CONTENT = "Hello from the cloud connector test."
+_CSV_CONTENT = "name,age\nAlice,30\nBob,25\n"
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+async def _write_file(dest_path: str, content: str) -> None:
+    """Stand in for a cloud download: write *content* to *dest_path* as UTF-8 text."""
+    with open(dest_path, "w", encoding="utf-8") as f:
+        f.write(content)
+
+
+def _make_download_side_effect(content: str):
+    """Build an async mock side effect that writes *content* to its last
+    positional argument (the destination path) and returns ``None`` (success)."""
+
+    async def _side_effect(*args):
+        dest_path = args[-1]  # assumes the path is passed last, positionally
+        await _write_file(dest_path, content)
+        return None
+
+    return _side_effect
+
+
+# ===================================================================
+# Google Drive
+# ===================================================================
+
+class TestGoogleDriveContentExtraction:
+
+    async def test_txt_file_returns_markdown(self):
+        from app.connectors.google_drive.content_extractor import (
+            download_and_extract_content,
+        )
+
+        client = MagicMock()
+        client.download_file_to_disk = AsyncMock(
+            side_effect=_make_download_side_effect(_TXT_CONTENT),
+        )
+
+        file = {"id": "f1", "name": "notes.txt", "mimeType": "text/plain"}
+
+        markdown, metadata, error = await download_and_extract_content(client, file)
+
+        assert error is None
+        assert _TXT_CONTENT in markdown
+        assert metadata["google_drive_file_id"] == "f1"
+        assert metadata["google_drive_file_name"] == "notes.txt"
+
+    async def test_csv_file_returns_markdown_table(self):
+        from app.connectors.google_drive.content_extractor import (
+            download_and_extract_content,
+        )
+
+        client = MagicMock()
+        client.download_file_to_disk = AsyncMock(
+            side_effect=_make_download_side_effect(_CSV_CONTENT),
+        )
+
+        file = {"id": "f2", "name": "data.csv", "mimeType": "text/csv"}
+
+        markdown, metadata, error = await download_and_extract_content(client, file)
+
+        assert error is None
+        assert "Alice" in markdown
+        assert "Bob" in markdown
+        assert "|" in markdown
+
+    async def test_download_error_returns_error_message(self):
+        from app.connectors.google_drive.content_extractor import (
+            download_and_extract_content,
+        )
+
+        client = MagicMock()
+        client.download_file_to_disk = AsyncMock(return_value="Network timeout")
+
+        file = {"id": "f3", "name": "doc.txt", "mimeType": "text/plain"}
+
+        markdown, metadata, error = await download_and_extract_content(client, file)
+
+        assert markdown is None
+        assert error == "Network timeout"
+
+
+# ===================================================================
+# OneDrive
+# ===================================================================
+
+class TestOneDriveContentExtraction:
+
+    async def test_txt_file_returns_markdown(self):
+        from app.connectors.onedrive.content_extractor import (
+            download_and_extract_content,
+        )
+
+        client = MagicMock()
+        client.download_file_to_disk = AsyncMock(
+            side_effect=_make_download_side_effect(_TXT_CONTENT),
+        )
+
+        file = {
+            "id": "od-1",
+            "name": "report.txt",
+            "file": {"mimeType": "text/plain"},
+        }
+
+        markdown, metadata, error = await download_and_extract_content(client, file)
+
+        assert error is None
+        assert _TXT_CONTENT in markdown
+        assert metadata["onedrive_file_id"] == "od-1"
+        assert metadata["onedrive_file_name"] == "report.txt"
+
+    async def test_csv_file_returns_markdown_table(self):
+        from app.connectors.onedrive.content_extractor import (
+            download_and_extract_content,
+        )
+
+        client = MagicMock()
+        client.download_file_to_disk = AsyncMock(
+            side_effect=_make_download_side_effect(_CSV_CONTENT),
+        )
+
+        file = {
+            "id": "od-2",
+            "name": "data.csv",
+            "file": {"mimeType": "text/csv"},
+        }
+
+        markdown, metadata, error = await download_and_extract_content(client, file)
+
+        assert error is None
+        assert "Alice" in markdown
+        assert "|" in markdown
+
+    async def test_download_error_returns_error_message(self):
+        from app.connectors.onedrive.content_extractor import (
+            download_and_extract_content,
+        )
+
+        client = MagicMock()
+        client.download_file_to_disk = AsyncMock(return_value="403 Forbidden")
+
+        file = {
+            "id": "od-3",
+            "name": "secret.txt",
+            "file": {"mimeType": "text/plain"},
+        }
+
+        markdown, metadata, error = await download_and_extract_content(client, file)
+
+        assert markdown is None
+        assert error == "403 Forbidden"
+
+
+# ===================================================================
+# Dropbox
+# ===================================================================
+
+class TestDropboxContentExtraction:
+
+    async def test_txt_file_returns_markdown(self):
+        from app.connectors.dropbox.content_extractor import (
+            download_and_extract_content,
+        )
+
+        client = MagicMock()
+        client.download_file_to_disk = AsyncMock(
+            side_effect=_make_download_side_effect(_TXT_CONTENT),
+        )
+
+        file = {
+            "id": "dbx-1",
+            "name": "memo.txt",
+            ".tag": "file",
+            "path_lower": "/memo.txt",
+        }
+
+        markdown, metadata, error = await download_and_extract_content(client, file)
+
+        assert error is None
+        assert _TXT_CONTENT in markdown
+        assert metadata["dropbox_file_id"] == "dbx-1"
+        assert metadata["dropbox_file_name"] == "memo.txt"
+
+    async def test_csv_file_returns_markdown_table(self):
+        from app.connectors.dropbox.content_extractor import (
+            download_and_extract_content,
+        )
+
+        client = MagicMock()
+        client.download_file_to_disk = AsyncMock(
+            side_effect=_make_download_side_effect(_CSV_CONTENT),
+        )
+
+        file = {
+            "id": "dbx-2",
+            "name": "data.csv",
+            ".tag": "file",
+            "path_lower": "/data.csv",
+        }
+
+        markdown, metadata, error = await download_and_extract_content(client, file)
+
+        assert error is None
+        assert "Alice" in markdown
+        assert "|" in markdown
+
+    async def test_download_error_returns_error_message(self):
+        from app.connectors.dropbox.content_extractor import (
+            download_and_extract_content,
+        )
+
+        client = MagicMock()
+        client.download_file_to_disk = AsyncMock(return_value="Rate limited")
+
+        file = {
+            "id": "dbx-3",
+            "name": "big.txt",
+            ".tag": "file",
+            "path_lower": "/big.txt",
+        }
+
+        markdown, metadata, error = await download_and_extract_content(client, file)
+
+        assert markdown is None
+        assert error == "Rate limited"
diff --git a/surfsense_backend/tests/unit/etl_pipeline/conftest.py b/surfsense_backend/tests/unit/etl_pipeline/conftest.py
new file mode 100644
index 000000000..6059caa01
--- /dev/null
+++ b/surfsense_backend/tests/unit/etl_pipeline/conftest.py
@@ -0,0 +1,29 @@
+"""Pre-register the etl_pipeline package to avoid circular imports during unit tests."""
+
+import sys
+import types
+from pathlib import Path
+
+_BACKEND = Path(__file__).resolve().parents[3]
+
+
+def _stub_package(dotted: str, fs_dir: Path) -> None:  # register a bare package stub
+    if dotted not in sys.modules:  # leave an already-registered module untouched
+        mod = types.ModuleType(dotted)
+        mod.__path__ = [str(fs_dir)]  # submodule imports search this directory
+        mod.__package__ = dotted
+        sys.modules[dotted] = mod
+
+    parts = dotted.split(".")
+    if len(parts) > 1:  # dotted name has a parent package
+        parent_dotted = ".".join(parts[:-1])
+        parent = sys.modules.get(parent_dotted)
+        if parent is not None:
+            setattr(parent, parts[-1], sys.modules[dotted])  # parent.child lookup
+
+
+_stub_package("app", _BACKEND / "app")
+_stub_package("app.etl_pipeline", _BACKEND / "app" / "etl_pipeline")
+_stub_package(
+    "app.etl_pipeline.parsers", _BACKEND / "app" / "etl_pipeline" / "parsers"
+)
diff --git a/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py b/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py
new file mode 100644
index 000000000..0d31507ca
--- /dev/null
+++ b/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py
@@ -0,0 +1,309 @@
+"""Tests for EtlPipelineService -- the unified ETL pipeline public interface."""
+
+import pytest
+
+from app.etl_pipeline.etl_document import EtlRequest
+from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
+
+pytestmark = pytest.mark.unit
+
+
+async def test_extract_txt_file_returns_markdown(tmp_path):
+    """Tracer bullet: .txt content comes back verbatim, tagged as PLAINTEXT."""
+    source = tmp_path / "hello.txt"
+    source.write_text("Hello, world!", encoding="utf-8")
+    request = EtlRequest(file_path=str(source), filename="hello.txt")
+
+    pipeline = EtlPipelineService()
+    result = await pipeline.extract(request)
+
+    # The payload is unchanged and both classification fields say plaintext.
+    assert result.markdown_content == "Hello, world!"
+    assert result.etl_service == "PLAINTEXT"
+    assert result.content_type == "plaintext"
+
+
+async def test_extract_md_file(tmp_path):
+    """Markdown source (.md) is returned unchanged and tagged as PLAINTEXT."""
+    body = "# Title\n\nBody text."
+    md_path = tmp_path / "readme.md"
+    md_path.write_text(body, encoding="utf-8")
+    request = EtlRequest(file_path=str(md_path), filename="readme.md")
+
+    result = await EtlPipelineService().extract(request)
+
+    assert result.markdown_content == body
+    assert result.etl_service == "PLAINTEXT"
+    assert result.content_type == "plaintext"
+
+
+async def test_extract_markdown_file(tmp_path):
+    """A .markdown file is classified as PLAINTEXT and returned unchanged."""
+    md_file = tmp_path / "notes.markdown"
+    md_file.write_text("Some notes.", encoding="utf-8")
+    request = EtlRequest(file_path=str(md_file), filename="notes.markdown")
+
+    result = await EtlPipelineService().extract(request)
+
+    assert result.markdown_content == "Some notes."
+    assert result.etl_service == "PLAINTEXT"
+    assert result.content_type == "plaintext"
+
+
+async def test_extract_python_file(tmp_path):
+    """Source code (.py) is returned verbatim and tagged as PLAINTEXT."""
+    code = "print('hello')"
+    script = tmp_path / "script.py"
+    script.write_text(code, encoding="utf-8")
+    request = EtlRequest(file_path=str(script), filename="script.py")
+
+    result = await EtlPipelineService().extract(request)
+
+    assert result.markdown_content == code
+    assert result.etl_service == "PLAINTEXT"
+    assert result.content_type == "plaintext"
+
+
+async def test_extract_js_file(tmp_path):
+    """A .js source code file is classified as PLAINTEXT and returned verbatim."""
+    js_file = tmp_path / "app.js"
+    js_file.write_text("console.log('hi');", encoding="utf-8")
+    request = EtlRequest(file_path=str(js_file), filename="app.js")
+
+    result = await EtlPipelineService().extract(request)
+
+    assert result.markdown_content == "console.log('hi');"
+    assert result.etl_service == "PLAINTEXT"
+    assert result.content_type == "plaintext"
+
+
+async def test_extract_csv_returns_markdown_table(tmp_path):
+    """CSV rows come back as a markdown table, tagged as DIRECT_CONVERT."""
+    csv_path = tmp_path / "data.csv"
+    csv_path.write_text("name,age\nAlice,30\nBob,25\n", encoding="utf-8")
+    request = EtlRequest(file_path=str(csv_path), filename="data.csv")
+
+    result = await EtlPipelineService().extract(request)
+    table = result.markdown_content
+
+    assert "| name | age |" in table
+    assert "| Alice | 30 |" in table
+    assert result.etl_service == "DIRECT_CONVERT"
+    assert result.content_type == "direct_convert"
+
+
+async def test_extract_tsv_returns_markdown_table(tmp_path):
+    """A .tsv file is converted to a markdown table via DIRECT_CONVERT."""
+    tsv_file = tmp_path / "data.tsv"
+    tsv_file.write_text("x\ty\n1\t2\n", encoding="utf-8")
+    request = EtlRequest(file_path=str(tsv_file), filename="data.tsv")
+
+    result = await EtlPipelineService().extract(request)
+
+    assert "| x | y |" in result.markdown_content
+    assert result.etl_service == "DIRECT_CONVERT"
+    assert result.content_type == "direct_convert"
+
+
+async def test_extract_html_returns_markdown(tmp_path):
+    """An .html file is converted to markdown via DIRECT_CONVERT."""
+    html_file = tmp_path / "page.html"
+    html_file.write_text("<h1>Title</h1><p>Body</p>", encoding="utf-8")
+    request = EtlRequest(file_path=str(html_file), filename="page.html")
+
+    result = await EtlPipelineService().extract(request)
+
+    assert "Title" in result.markdown_content
+    assert "Body" in result.markdown_content
+    assert result.etl_service == "DIRECT_CONVERT"
+    assert result.content_type == "direct_convert"
+
+
+async def test_extract_mp3_returns_transcription(tmp_path, mocker):
+    """An .mp3 file is transcribed through the audio parser's mocked ``atranscription``."""
+    audio_file = tmp_path / "recording.mp3"
+    audio_file.write_bytes(b"\x00" * 100)  # placeholder bytes; transcription is mocked
+
+    mocker.patch("app.config.config.STT_SERVICE", "openai/whisper-1")
+    mocker.patch("app.config.config.STT_SERVICE_API_KEY", "fake-key")
+    mocker.patch("app.config.config.STT_SERVICE_API_BASE", None)
+
+    mock_transcription = mocker.patch(
+        "app.etl_pipeline.parsers.audio.atranscription",
+        return_value={"text": "Hello from audio"},
+    )
+
+    result = await EtlPipelineService().extract(
+        EtlRequest(file_path=str(audio_file), filename="recording.mp3")
+    )
+
+    assert "Hello from audio" in result.markdown_content
+    assert result.etl_service == "AUDIO"
+    assert result.content_type == "audio"
+    mock_transcription.assert_called_once()  # exactly one transcription request
+
+
+# ---------------------------------------------------------------------------
+# Slice 7 – DOCLING document parsing
+# ---------------------------------------------------------------------------
+
+
+async def test_extract_pdf_with_docling(tmp_path, mocker):
+    """With ETL_SERVICE=DOCLING, a .pdf is parsed by the (mocked) Docling service."""
+    pdf_path = tmp_path / "report.pdf"
+    pdf_path.write_bytes(b"%PDF-1.4 fake")
+
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    # AsyncMock stand-in: process_document resolves to canned content.
+    docling_stub = mocker.AsyncMock()
+    docling_stub.process_document.return_value = {"content": "# Parsed PDF"}
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=docling_stub,
+    )
+
+    request = EtlRequest(file_path=str(pdf_path), filename="report.pdf")
+    result = await EtlPipelineService().extract(request)
+
+    assert result.markdown_content == "# Parsed PDF"
+    assert result.etl_service == "DOCLING"
+    assert result.content_type == "document"
+
+
+# ---------------------------------------------------------------------------
+# Slice 8 – UNSTRUCTURED document parsing
+# ---------------------------------------------------------------------------
+
+
+async def test_extract_pdf_with_unstructured(tmp_path, mocker):
+    """A .pdf file with ETL_SERVICE=UNSTRUCTURED returns parsed markdown."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    mocker.patch("app.config.config.ETL_SERVICE", "UNSTRUCTURED")
+
+    class FakeDoc:  # minimal stand-in exposing only ``page_content``
+        def __init__(self, text):
+            self.page_content = text
+
+    fake_loader_instance = mocker.AsyncMock()
+    fake_loader_instance.aload.return_value = [  # two docs, both must reach the output
+        FakeDoc("Page 1 content"),
+        FakeDoc("Page 2 content"),
+    ]
+    mocker.patch(
+        "langchain_unstructured.UnstructuredLoader",
+        return_value=fake_loader_instance,
+    )
+
+    result = await EtlPipelineService().extract(
+        EtlRequest(file_path=str(pdf_file), filename="report.pdf")
+    )
+
+    assert "Page 1 content" in result.markdown_content
+    assert "Page 2 content" in result.markdown_content
+    assert result.etl_service == "UNSTRUCTURED"
+    assert result.content_type == "document"
+
+
+# ---------------------------------------------------------------------------
+# Slice 9 – LLAMACLOUD document parsing
+# ---------------------------------------------------------------------------
+
+
+async def test_extract_pdf_with_llamacloud(tmp_path, mocker):
+    """A .pdf file with ETL_SERVICE=LLAMACLOUD returns parsed markdown."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake content " * 10)
+
+    mocker.patch("app.config.config.ETL_SERVICE", "LLAMACLOUD")
+    # create=True lets the patch add the attribute even if config does not define it.
+    mocker.patch("app.config.config.LLAMA_CLOUD_API_KEY", "fake-key", create=True)
+
+    class FakeDoc:  # stand-in for one markdown document; only ``text`` is provided
+        text = "# LlamaCloud parsed"
+
+    class FakeJobResult:  # stand-in for the object returned by ``aparse``
+        pages = []
+
+        def get_markdown_documents(self, split_by_page=True):
+            return [FakeDoc()]
+
+    fake_parser = mocker.AsyncMock()
+    fake_parser.aparse.return_value = FakeJobResult()
+    mocker.patch(
+        "llama_cloud_services.LlamaParse",
+        return_value=fake_parser,
+    )
+    mocker.patch(
+        "llama_cloud_services.parse.utils.ResultType",
+        mocker.MagicMock(MD="md"),
+    )
+
+    result = await EtlPipelineService().extract(
+        EtlRequest(
+            file_path=str(pdf_file), filename="report.pdf", estimated_pages=5
+        )
+    )
+
+    assert result.markdown_content == "# LlamaCloud parsed"
+    assert result.etl_service == "LLAMACLOUD"
+    assert result.content_type == "document"
+
+
+# ---------------------------------------------------------------------------
+# Slice 10 – unknown extension falls through to document ETL
+# ---------------------------------------------------------------------------
+
+
+async def test_unknown_extension_uses_document_etl(tmp_path, mocker):
+    """An unknown extension (e.g. .docx) is handled by the configured document ETL."""
+    docx_file = tmp_path / "doc.docx"
+    docx_file.write_bytes(b"PK fake docx")
+
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    fake_docling = mocker.AsyncMock()
+    fake_docling.process_document.return_value = {"content": "Docx content"}
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=fake_docling,
+    )
+
+    request = EtlRequest(file_path=str(docx_file), filename="doc.docx")
+    result = await EtlPipelineService().extract(request)
+
+    assert result.markdown_content == "Docx content"
+    assert result.etl_service == "DOCLING"
+    assert result.content_type == "document"
+
+
+# ---------------------------------------------------------------------------
+# Slice 11 – EtlRequest validation
+# ---------------------------------------------------------------------------
+
+
+def test_etl_request_requires_filename():
+    """EtlRequest rejects an empty filename with a validation (ValueError) error."""
+    with pytest.raises(ValueError):
+        EtlRequest(file_path="/tmp/some.txt", filename="")
+
+
+# ---------------------------------------------------------------------------
+# Slice 12 – unknown ETL_SERVICE raises EtlServiceUnavailableError
+# ---------------------------------------------------------------------------
+
+
+async def test_unknown_etl_service_raises(tmp_path, mocker):
+    """Extraction raises EtlServiceUnavailableError for an unrecognised ETL_SERVICE."""
+    from app.etl_pipeline.exceptions import EtlServiceUnavailableError
+
+    pdf_path = tmp_path / "report.pdf"
+    pdf_path.write_bytes(b"%PDF fake")
+    request = EtlRequest(file_path=str(pdf_path), filename="report.pdf")
+
+    mocker.patch("app.config.config.ETL_SERVICE", "NONEXISTENT")
+
+    # The error message must name the misconfigured setting.
+    with pytest.raises(EtlServiceUnavailableError, match="Unknown ETL_SERVICE"):
+        await EtlPipelineService().extract(request)

From c6e94188eb83b8b7989b393ea1fe23756273faf2 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Sun, 5 Apr 2026 18:23:32 +0530
Subject: [PATCH 10/37] refactor: remove destructive text classes from
 DocumentNode and enhance CreateSearchSpaceDialog with select-none and
 select-text classes

---
 surfsense_web/components/documents/DocumentNode.tsx         | 2 --
 .../layout/ui/dialogs/CreateSearchSpaceDialog.tsx           | 6 +++---
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/surfsense_web/components/documents/DocumentNode.tsx b/surfsense_web/components/documents/DocumentNode.tsx
index 919f904d4..33ce2bf26 100644
--- a/surfsense_web/components/documents/DocumentNode.tsx
+++ b/surfsense_web/components/documents/DocumentNode.tsx
@@ -260,7 +260,6 @@ export const DocumentNode = React.memo(function DocumentNode({
 								</DropdownMenuItem>
 							)}
 							<DropdownMenuItem
-								className="text-destructive focus:text-destructive"
 								disabled={isProcessing}
 								onClick={() => onDelete(doc)}
 							>
@@ -306,7 +305,6 @@ export const DocumentNode = React.memo(function DocumentNode({
 						</ContextMenuItem>
 					)}
 					<ContextMenuItem
-						className="text-destructive focus:text-destructive"
 						disabled={isProcessing}
 						onClick={() => onDelete(doc)}
 					>
diff --git a/surfsense_web/components/layout/ui/dialogs/CreateSearchSpaceDialog.tsx b/surfsense_web/components/layout/ui/dialogs/CreateSearchSpaceDialog.tsx
index e39bee679..d8ec767d7 100644
--- a/surfsense_web/components/layout/ui/dialogs/CreateSearchSpaceDialog.tsx
+++ b/surfsense_web/components/layout/ui/dialogs/CreateSearchSpaceDialog.tsx
@@ -82,7 +82,7 @@ export function CreateSearchSpaceDialog({ open, onOpenChange }: CreateSearchSpac
 
 	return (
 		<Dialog open={open} onOpenChange={handleOpenChange}>
-			<DialogContent className="max-w-[90vw] sm:max-w-sm p-4 sm:p-5 data-[state=open]:animate-none data-[state=closed]:animate-none">
+			<DialogContent className="max-w-[90vw] sm:max-w-sm p-4 sm:p-5 select-none data-[state=open]:animate-none data-[state=closed]:animate-none">
 				<DialogHeader className="space-y-2 pb-2">
 					<div className="flex items-center gap-2 sm:gap-3">
 						<div className="flex-1 min-w-0">
@@ -107,7 +107,7 @@ export function CreateSearchSpaceDialog({ open, onOpenChange }: CreateSearchSpac
 											placeholder={t("name_placeholder")}
 											{...field}
 											autoFocus
-											className="text-sm h-9 sm:h-10"
+											className="text-sm h-9 sm:h-10 select-text"
 										/>
 									</FormControl>
 									<FormMessage />
@@ -130,7 +130,7 @@ export function CreateSearchSpaceDialog({ open, onOpenChange }: CreateSearchSpac
 										<Input
 											placeholder={t("description_placeholder")}
 											{...field}
-											className="text-sm h-9 sm:h-10"
+											className="text-sm h-9 sm:h-10 select-text"
 										/>
 									</FormControl>
 									<FormMessage />

From 1f162f52c3199bc336ae1a3ba3a981c2f43e66cb Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Sun, 5 Apr 2026 22:50:36 +0530
Subject: [PATCH 11/37] feat: add tooltip functionality to DocumentNode for
 title overflow handling and refactor ChatShareButton by removing unnecessary
 Tooltip wrapper

---
 .../components/documents/DocumentNode.tsx     | 25 ++++++++++++++++++-
 .../components/new-chat/chat-share-button.tsx | 25 ++++++++-----------
 2 files changed, 34 insertions(+), 16 deletions(-)

diff --git a/surfsense_web/components/documents/DocumentNode.tsx b/surfsense_web/components/documents/DocumentNode.tsx
index 33ce2bf26..92a211277 100644
--- a/surfsense_web/components/documents/DocumentNode.tsx
+++ b/surfsense_web/components/documents/DocumentNode.tsx
@@ -12,6 +12,7 @@ import {
 	Trash2,
 } from "lucide-react";
 import React, { useCallback, useRef, useState } from "react";
+import { useIsMobile } from "@/hooks/use-mobile";
 import { useDrag } from "react-dnd";
 import { getDocumentTypeIcon } from "@/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentTypeIcon";
 import { ExportContextItems, ExportDropdownItems } from "@/components/shared/ExportMenuItems";
@@ -106,7 +107,10 @@ export const DocumentNode = React.memo(function DocumentNode({
 	const isProcessing = statusState === "pending" || statusState === "processing";
 	const [dropdownOpen, setDropdownOpen] = useState(false);
 	const [exporting, setExporting] = useState<string | null>(null);
+	const [titleTooltipOpen, setTitleTooltipOpen] = useState(false);
 	const rowRef = useRef<HTMLDivElement>(null);
+	const titleRef = useRef<HTMLSpanElement>(null);
+	const isMobile = useIsMobile();
 
 	const handleExport = useCallback(
 		(format: string) => {
@@ -118,6 +122,18 @@ export const DocumentNode = React.memo(function DocumentNode({
 		[doc, onExport]
 	);
 
+	const handleTitleTooltipOpenChange = useCallback(
+		(open: boolean) => {
+			if (isMobile) return;
+			if (open && titleRef.current) {
+				setTitleTooltipOpen(titleRef.current.scrollWidth > titleRef.current.clientWidth);
+			} else {
+				setTitleTooltipOpen(false);
+			}
+		},
+		[isMobile]
+	);
+
 	const attachRef = useCallback(
 		(node: HTMLDivElement | null) => {
 			(rowRef as React.MutableRefObject<HTMLDivElement | null>).current = node;
@@ -197,7 +213,14 @@ export const DocumentNode = React.memo(function DocumentNode({
 						);
 					})()}
 
-					<span className="flex-1 min-w-0 truncate">{doc.title}</span>
+					<Tooltip delayDuration={600} open={titleTooltipOpen} onOpenChange={handleTitleTooltipOpenChange}>
+						<TooltipTrigger asChild>
+							<span ref={titleRef} className="flex-1 min-w-0 truncate">{doc.title}</span>
+						</TooltipTrigger>
+						<TooltipContent side="bottom" className="max-w-xs break-words">
+							{doc.title}
+						</TooltipContent>
+					</Tooltip>
 
 					{getDocumentTypeIcon(
 						doc.document_type as DocumentTypeEnum,
diff --git a/surfsense_web/components/new-chat/chat-share-button.tsx b/surfsense_web/components/new-chat/chat-share-button.tsx
index 82e8c6a78..4fc35aba1 100644
--- a/surfsense_web/components/new-chat/chat-share-button.tsx
+++ b/surfsense_web/components/new-chat/chat-share-button.tsx
@@ -163,21 +163,16 @@ export function ChatShareButton({ thread, onVisibilityChange, className }: ChatS
 			)}
 
 			<Popover open={open} onOpenChange={setOpen}>
-				<Tooltip>
-					<TooltipTrigger asChild>
-						<PopoverTrigger asChild>
-							<Button
-								variant="outline"
-								size="icon"
-								className="h-8 w-8 md:w-auto md:px-3 md:gap-2 relative bg-muted hover:bg-muted/80 border-0 select-none"
-							>
-								<CurrentIcon className="h-4 w-4" />
-								<span className="hidden md:inline text-sm">{buttonLabel}</span>
-							</Button>
-						</PopoverTrigger>
-					</TooltipTrigger>
-					<TooltipContent>Share settings</TooltipContent>
-				</Tooltip>
+				<PopoverTrigger asChild>
+					<Button
+						variant="outline"
+						size="icon"
+						className="h-8 w-8 md:w-auto md:px-3 md:gap-2 relative bg-muted hover:bg-muted/80 border-0 select-none"
+					>
+						<CurrentIcon className="h-4 w-4" />
+						<span className="hidden md:inline text-sm">{buttonLabel}</span>
+					</Button>
+				</PopoverTrigger>
 
 				<PopoverContent
 					className="w-[280px] md:w-[320px] p-0 rounded-lg shadow-lg border-border/60 dark:bg-neutral-900 dark:border dark:border-white/5 select-none"

From c9e5fe9cdb32a456d58af0a665e86d26ffea85aa Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Sun, 5 Apr 2026 23:02:17 +0530
Subject: [PATCH 12/37] refactor: update icon usage in CommentActions and
 enhance Tooltip component for mobile responsiveness

---
 .../comment-item/comment-actions.tsx          |   4 +-
 .../components/documents/DocumentNode.tsx     |  20 +--
 .../ui/sidebar/AllPrivateChatsSidebar.tsx     |  36 ++---
 .../ui/sidebar/AllSharedChatsSidebar.tsx      |  36 ++---
 .../layout/ui/sidebar/InboxSidebar.tsx        | 123 ++++++------------
 surfsense_web/components/ui/tooltip.tsx       |  32 ++++-
 6 files changed, 113 insertions(+), 138 deletions(-)

diff --git a/surfsense_web/components/chat-comments/comment-item/comment-actions.tsx b/surfsense_web/components/chat-comments/comment-item/comment-actions.tsx
index 5c0e27779..564a6ba84 100644
--- a/surfsense_web/components/chat-comments/comment-item/comment-actions.tsx
+++ b/surfsense_web/components/chat-comments/comment-item/comment-actions.tsx
@@ -1,6 +1,6 @@
 "use client";
 
-import { MoreHorizontal, Pencil, Trash2 } from "lucide-react";
+import { MoreHorizontal, PenLine, Trash2 } from "lucide-react";
 import { Button } from "@/components/ui/button";
 import {
 	DropdownMenu,
@@ -29,7 +29,7 @@ export function CommentActions({ canEdit, canDelete, onEdit, onDelete }: Comment
 			<DropdownMenuContent align="end">
 				{canEdit && (
 					<DropdownMenuItem onClick={onEdit}>
-						<Pencil className="mr-2 size-4" />
+						<PenLine className="mr-2 size-4" />
 						Edit
 					</DropdownMenuItem>
 				)}
diff --git a/surfsense_web/components/documents/DocumentNode.tsx b/surfsense_web/components/documents/DocumentNode.tsx
index 92a211277..fe796b5be 100644
--- a/surfsense_web/components/documents/DocumentNode.tsx
+++ b/surfsense_web/components/documents/DocumentNode.tsx
@@ -12,7 +12,6 @@ import {
 	Trash2,
 } from "lucide-react";
 import React, { useCallback, useRef, useState } from "react";
-import { useIsMobile } from "@/hooks/use-mobile";
 import { useDrag } from "react-dnd";
 import { getDocumentTypeIcon } from "@/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentTypeIcon";
 import { ExportContextItems, ExportDropdownItems } from "@/components/shared/ExportMenuItems";
@@ -110,7 +109,6 @@ export const DocumentNode = React.memo(function DocumentNode({
 	const [titleTooltipOpen, setTitleTooltipOpen] = useState(false);
 	const rowRef = useRef<HTMLDivElement>(null);
 	const titleRef = useRef<HTMLSpanElement>(null);
-	const isMobile = useIsMobile();
 
 	const handleExport = useCallback(
 		(format: string) => {
@@ -122,17 +120,13 @@ export const DocumentNode = React.memo(function DocumentNode({
 		[doc, onExport]
 	);
 
-	const handleTitleTooltipOpenChange = useCallback(
-		(open: boolean) => {
-			if (isMobile) return;
-			if (open && titleRef.current) {
-				setTitleTooltipOpen(titleRef.current.scrollWidth > titleRef.current.clientWidth);
-			} else {
-				setTitleTooltipOpen(false);
-			}
-		},
-		[isMobile]
-	);
+	const handleTitleTooltipOpenChange = useCallback((open: boolean) => {
+		// Only allow the tooltip to open when the title is visually truncated,
+		// i.e. its content is wider than the box it is rendered in.
+		const el = titleRef.current;
+		const isTruncated = el !== null && el.scrollWidth > el.clientWidth;
+		setTitleTooltipOpen(open && isTruncated);
+	}, []);
 
 	const attachRef = useCallback(
 		(node: HTMLDivElement | null) => {
diff --git a/surfsense_web/components/layout/ui/sidebar/AllPrivateChatsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/AllPrivateChatsSidebar.tsx
index 57c011f01..46b03a172 100644
--- a/surfsense_web/components/layout/ui/sidebar/AllPrivateChatsSidebar.tsx
+++ b/surfsense_web/components/layout/ui/sidebar/AllPrivateChatsSidebar.tsx
@@ -375,24 +375,24 @@ export function AllPrivateChatsSidebarContent({
 											<span className="truncate">{thread.title || "New Chat"}</span>
 										</button>
 									) : (
-										<Tooltip>
-											<TooltipTrigger asChild>
-												<button
-													type="button"
-													onClick={() => handleThreadClick(thread.id)}
-													disabled={isBusy}
-													className="flex items-center gap-2 flex-1 min-w-0 text-left overflow-hidden"
-												>
-													<span className="truncate">{thread.title || "New Chat"}</span>
-												</button>
-											</TooltipTrigger>
-											<TooltipContent side="bottom" align="start">
-												<p>
-													{t("updated") || "Updated"}:{" "}
-													{format(new Date(thread.updatedAt), "MMM d, yyyy 'at' h:mm a")}
-												</p>
-											</TooltipContent>
-										</Tooltip>
+									<Tooltip delayDuration={600}>
+										<TooltipTrigger asChild>
+											<button
+												type="button"
+												onClick={() => handleThreadClick(thread.id)}
+												disabled={isBusy}
+												className="flex items-center gap-2 flex-1 min-w-0 text-left overflow-hidden"
+											>
+												<span className="truncate">{thread.title || "New Chat"}</span>
+											</button>
+										</TooltipTrigger>
+										<TooltipContent side="bottom" align="start">
+											<p>
+												{t("updated") || "Updated"}:{" "}
+												{format(new Date(thread.updatedAt), "MMM d, yyyy 'at' h:mm a")}
+											</p>
+										</TooltipContent>
+									</Tooltip>
 									)}
 
 									<DropdownMenu
diff --git a/surfsense_web/components/layout/ui/sidebar/AllSharedChatsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/AllSharedChatsSidebar.tsx
index 4a59df440..9cc1da1e4 100644
--- a/surfsense_web/components/layout/ui/sidebar/AllSharedChatsSidebar.tsx
+++ b/surfsense_web/components/layout/ui/sidebar/AllSharedChatsSidebar.tsx
@@ -375,24 +375,24 @@ export function AllSharedChatsSidebarContent({
 											<span className="truncate">{thread.title || "New Chat"}</span>
 										</button>
 									) : (
-										<Tooltip>
-											<TooltipTrigger asChild>
-												<button
-													type="button"
-													onClick={() => handleThreadClick(thread.id)}
-													disabled={isBusy}
-													className="flex items-center gap-2 flex-1 min-w-0 text-left overflow-hidden"
-												>
-													<span className="truncate">{thread.title || "New Chat"}</span>
-												</button>
-											</TooltipTrigger>
-											<TooltipContent side="bottom" align="start">
-												<p>
-													{t("updated") || "Updated"}:{" "}
-													{format(new Date(thread.updatedAt), "MMM d, yyyy 'at' h:mm a")}
-												</p>
-											</TooltipContent>
-										</Tooltip>
+									<Tooltip delayDuration={600}>
+										<TooltipTrigger asChild>
+											<button
+												type="button"
+												onClick={() => handleThreadClick(thread.id)}
+												disabled={isBusy}
+												className="flex items-center gap-2 flex-1 min-w-0 text-left overflow-hidden"
+											>
+												<span className="truncate">{thread.title || "New Chat"}</span>
+											</button>
+										</TooltipTrigger>
+										<TooltipContent side="bottom" align="start">
+											<p>
+												{t("updated") || "Updated"}:{" "}
+												{format(new Date(thread.updatedAt), "MMM d, yyyy 'at' h:mm a")}
+											</p>
+										</TooltipContent>
+									</Tooltip>
 									)}
 
 									<DropdownMenu
diff --git a/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx
index 72400a589..51ecedd94 100644
--- a/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx
+++ b/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx
@@ -779,36 +779,23 @@ export function InboxSidebarContent({
 								</DropdownMenuContent>
 							</DropdownMenu>
 						)}
-						{isMobile ? (
-							<Button
-								variant="ghost"
-								size="icon"
-								className="h-7 w-7 rounded-full"
-								onClick={handleMarkAllAsRead}
-								disabled={totalUnreadCount === 0}
-							>
-								<CheckCheck className="h-4 w-4 text-muted-foreground" />
-								<span className="sr-only">{t("mark_all_read") || "Mark all as read"}</span>
-							</Button>
-						) : (
-							<Tooltip>
-								<TooltipTrigger asChild>
-									<Button
-										variant="ghost"
-										size="icon"
-										className="h-7 w-7 rounded-full"
-										onClick={handleMarkAllAsRead}
-										disabled={totalUnreadCount === 0}
-									>
-										<CheckCheck className="h-4 w-4 text-muted-foreground" />
-										<span className="sr-only">{t("mark_all_read") || "Mark all as read"}</span>
-									</Button>
-								</TooltipTrigger>
-								<TooltipContent className="z-80">
-									{t("mark_all_read") || "Mark all as read"}
-								</TooltipContent>
-							</Tooltip>
-						)}
+						<Tooltip>
+							<TooltipTrigger asChild>
+								<Button
+									variant="ghost"
+									size="icon"
+									className="h-7 w-7 rounded-full"
+									onClick={handleMarkAllAsRead}
+									disabled={totalUnreadCount === 0}
+								>
+									<CheckCheck className="h-4 w-4 text-muted-foreground" />
+									<span className="sr-only">{t("mark_all_read") || "Mark all as read"}</span>
+								</Button>
+							</TooltipTrigger>
+							<TooltipContent className="z-80">
+								{t("mark_all_read") || "Mark all as read"}
+							</TooltipContent>
+						</Tooltip>
 					</div>
 				</div>
 
@@ -921,61 +908,27 @@ export function InboxSidebarContent({
 									)}
 									style={{ contentVisibility: "auto", containIntrinsicSize: "0 80px" }}
 								>
-									{isMobile ? (
-										<button
-											type="button"
-											onClick={() => handleItemClick(item)}
-											disabled={isMarkingAsRead}
-											className="flex items-center gap-3 flex-1 min-w-0 text-left overflow-hidden"
-										>
-											<div className="shrink-0">{getStatusIcon(item)}</div>
-											<div className="flex-1 min-w-0 overflow-hidden">
-												<p
-													className={cn(
-														"text-xs font-medium line-clamp-2",
-														!item.read && "font-semibold"
-													)}
-												>
-													{item.title}
-												</p>
-												<p className="text-[11px] text-muted-foreground line-clamp-2 mt-0.5">
-													{convertRenderedToDisplay(item.message)}
-												</p>
-											</div>
-										</button>
-									) : (
-										<Tooltip>
-											<TooltipTrigger asChild>
-												<button
-													type="button"
-													onClick={() => handleItemClick(item)}
-													disabled={isMarkingAsRead}
-													className="flex items-center gap-3 flex-1 min-w-0 text-left overflow-hidden"
-												>
-													<div className="shrink-0">{getStatusIcon(item)}</div>
-													<div className="flex-1 min-w-0 overflow-hidden">
-														<p
-															className={cn(
-																"text-xs font-medium line-clamp-2",
-																!item.read && "font-semibold"
-															)}
-														>
-															{item.title}
-														</p>
-														<p className="text-[11px] text-muted-foreground line-clamp-2 mt-0.5">
-															{convertRenderedToDisplay(item.message)}
-														</p>
-													</div>
-												</button>
-											</TooltipTrigger>
-											<TooltipContent side="bottom" align="start" className="max-w-[250px]">
-												<p className="font-medium">{item.title}</p>
-												<p className="text-muted-foreground mt-1">
-													{convertRenderedToDisplay(item.message)}
-												</p>
-											</TooltipContent>
-										</Tooltip>
-									)}
+									<button
+										type="button"
+										onClick={() => handleItemClick(item)}
+										disabled={isMarkingAsRead}
+										className="flex items-center gap-3 flex-1 min-w-0 text-left overflow-hidden"
+									>
+										<div className="shrink-0">{getStatusIcon(item)}</div>
+										<div className="flex-1 min-w-0 overflow-hidden">
+											<p
+												className={cn(
+													"text-xs font-medium line-clamp-2",
+													!item.read && "font-semibold"
+												)}
+											>
+												{item.title}
+											</p>
+											<p className="text-[11px] text-muted-foreground line-clamp-2 mt-0.5">
+												{convertRenderedToDisplay(item.message)}
+											</p>
+										</div>
+									</button>
 
 									<div className="flex items-center justify-end gap-1.5 shrink-0 w-10">
 										<span className="text-[10px] text-muted-foreground">
diff --git a/surfsense_web/components/ui/tooltip.tsx b/surfsense_web/components/ui/tooltip.tsx
index 2fc85aae4..fb6dd17e7 100644
--- a/surfsense_web/components/ui/tooltip.tsx
+++ b/surfsense_web/components/ui/tooltip.tsx
@@ -2,9 +2,26 @@
 
 import * as TooltipPrimitive from "@radix-ui/react-tooltip";
 import type * as React from "react";
+import { useEffect, useState } from "react";
 
 import { cn } from "@/lib/utils";
 
+const MOBILE_BREAKPOINT = 768;
+
+function useIsTouchDevice() {
+	// Despite the name, this tracks viewport width (< MOBILE_BREAKPOINT), not touch input.
+	const [isTouch, setIsTouch] = useState(false);
+	useEffect(() => {
+		const query = window.matchMedia(`(max-width: ${MOBILE_BREAKPOINT - 1}px)`);
+		const sync = () => setIsTouch(query.matches);
+		sync();
+		query.addEventListener("change", sync);
+		return () => query.removeEventListener("change", sync);
+	}, []);
+
+	return isTouch;
+}
+
 function TooltipProvider({
 	delayDuration = 0,
 	disableHoverableContent = true,
@@ -20,10 +37,21 @@ function TooltipProvider({
 	);
 }
 
-function Tooltip({ ...props }: React.ComponentProps<typeof TooltipPrimitive.Root>) {
+function Tooltip({
+	open,
+	onOpenChange,
+	...props
+}: React.ComponentProps<typeof TooltipPrimitive.Root>) {
+	const isMobile = useIsTouchDevice();
+
 	return (
 		<TooltipProvider>
-			<TooltipPrimitive.Root data-slot="tooltip" {...props} />
+			<TooltipPrimitive.Root
+				data-slot="tooltip"
+				open={isMobile ? false : open}
+				onOpenChange={isMobile ? undefined : onOpenChange}
+				{...props}
+			/>
 		</TooltipProvider>
 	);
 }

From 7fa1810d5061ef7ad5896e69c1cd4bbdbfc90c59 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Sun, 5 Apr 2026 23:14:54 +0530
Subject: [PATCH 13/37] refactor: simplify CommentComposer button layout and
 update placeholder text in CommentItem

---
 .../comment-composer/comment-composer.tsx     | 26 ++++++++-----------
 .../comment-item/comment-actions.tsx          |  4 +--
 .../comment-item/comment-item.tsx             |  2 +-
 3 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/surfsense_web/components/chat-comments/comment-composer/comment-composer.tsx b/surfsense_web/components/chat-comments/comment-composer/comment-composer.tsx
index 3d6ea384b..e14022f5c 100644
--- a/surfsense_web/components/chat-comments/comment-composer/comment-composer.tsx
+++ b/surfsense_web/components/chat-comments/comment-composer/comment-composer.tsx
@@ -1,6 +1,6 @@
 "use client";
 
-import { ArrowUp, Send, X } from "lucide-react";
+import { ArrowUp } from "lucide-react";
 import { useCallback, useEffect, useRef, useState } from "react";
 import { Button } from "@/components/ui/button";
 import { Popover, PopoverAnchor, PopoverContent } from "@/components/ui/popover";
@@ -300,16 +300,15 @@ export function CommentComposer({
 
 			<div className={cn("flex items-center gap-2", !compact && "justify-end")}>
 				{onCancel && (
-					<Button
-						type="button"
-						variant="ghost"
-						size="sm"
-						onClick={onCancel}
-						disabled={isSubmitting}
-					>
-						<X className="mr-1 size-4" />
-						Cancel
-					</Button>
+				<Button
+					type="button"
+					variant="ghost"
+					size="sm"
+					onClick={onCancel}
+					disabled={isSubmitting}
+				>
+					Cancel
+				</Button>
 				)}
 				<Button
 					type="button"
@@ -321,10 +320,7 @@ export function CommentComposer({
 					{compact ? (
 						<ArrowUp className="size-4" />
 					) : (
-						<>
-							<Send className="mr-1 size-4" />
-							{submitLabel}
-						</>
+						submitLabel
 					)}
 				</Button>
 			</div>
diff --git a/surfsense_web/components/chat-comments/comment-item/comment-actions.tsx b/surfsense_web/components/chat-comments/comment-item/comment-actions.tsx
index 564a6ba84..9638ac01c 100644
--- a/surfsense_web/components/chat-comments/comment-item/comment-actions.tsx
+++ b/surfsense_web/components/chat-comments/comment-item/comment-actions.tsx
@@ -21,9 +21,9 @@ export function CommentActions({ canEdit, canDelete, onEdit, onDelete }: Comment
 				<Button
 					variant="ghost"
 					size="icon"
-					className="size-7 opacity-100 md:opacity-0 md:group-hover:opacity-100 transition-opacity"
+					className="size-7 text-muted-foreground opacity-100 md:opacity-0 md:group-hover:opacity-100 transition-opacity"
 				>
-					<MoreHorizontal className="size-4 text-muted-foreground" />
+					<MoreHorizontal className="size-4" />
 				</Button>
 			</DropdownMenuTrigger>
 			<DropdownMenuContent align="end">
diff --git a/surfsense_web/components/chat-comments/comment-item/comment-item.tsx b/surfsense_web/components/chat-comments/comment-item/comment-item.tsx
index 4996fe01b..eb374ba49 100644
--- a/surfsense_web/components/chat-comments/comment-item/comment-item.tsx
+++ b/surfsense_web/components/chat-comments/comment-item/comment-item.tsx
@@ -198,7 +198,7 @@ export function CommentItem({
 						<CommentComposer
 							members={members}
 							membersLoading={membersLoading}
-							placeholder="Edit your comment..."
+							placeholder="Edit your comment"
 							submitLabel="Save"
 							isSubmitting={isSubmitting}
 							onSubmit={handleEditSubmit}

From 742548847a520204ec9dcdaa95b25c26336501f5 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Mon, 6 Apr 2026 12:14:17 +0530
Subject: [PATCH 14/37] refactor: optimize navigation items in
 LayoutDataProvider, enhance button layout in InboxSidebar with tooltip
 support, full width in PageUsageDisplay

---
 .../assistant-ui/connector-popup.tsx          |  2 +-
 .../assistant-ui/document-upload-popup.tsx    |  2 +-
 .../layout/providers/LayoutDataProvider.tsx   | 52 +++++++------
 .../layout/ui/right-panel/RightPanel.tsx      |  2 +-
 .../layout/ui/sidebar/InboxSidebar.tsx        | 76 ++++++++++++++-----
 .../layout/ui/sidebar/PageUsageDisplay.tsx    |  4 +-
 6 files changed, 87 insertions(+), 51 deletions(-)

diff --git a/surfsense_web/components/assistant-ui/connector-popup.tsx b/surfsense_web/components/assistant-ui/connector-popup.tsx
index fe7b29f28..c41e986d4 100644
--- a/surfsense_web/components/assistant-ui/connector-popup.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup.tsx
@@ -216,7 +216,7 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
 					onPointerDownOutside={(e) => {
 						if (pickerOpen) e.preventDefault();
 					}}
-					className="max-w-3xl w-[95vw] sm:w-full h-[75vh] sm:h-[85vh] flex flex-col p-0 gap-0 overflow-hidden border border-border ring-0 dark:ring-0 bg-muted dark:bg-muted text-foreground [&>button]:right-4 sm:[&>button]:right-12 [&>button]:top-6 sm:[&>button]:top-10 [&>button]:opacity-80 hover:[&>button]:opacity-100 [&>button_svg]:size-5 select-none"
+					className="max-w-3xl w-[95vw] sm:w-full h-[75vh] sm:h-[85vh] flex flex-col p-0 gap-0 overflow-hidden border border-border ring-0 dark:ring-0 bg-muted dark:bg-muted text-foreground [&>button]:right-4 sm:[&>button]:right-12 [&>button]:top-6 sm:[&>button]:top-10 [&>button]:opacity-80 [&>button]:hover:opacity-100 [&>button]:hover:bg-foreground/10 [&>button>svg]:size-5 select-none"
 				>
 					<DialogTitle className="sr-only">Manage Connectors</DialogTitle>
 					{/* YouTube Crawler View - shown when adding YouTube videos */}
diff --git a/surfsense_web/components/assistant-ui/document-upload-popup.tsx b/surfsense_web/components/assistant-ui/document-upload-popup.tsx
index 59d73e651..fdd4131e5 100644
--- a/surfsense_web/components/assistant-ui/document-upload-popup.tsx
+++ b/surfsense_web/components/assistant-ui/document-upload-popup.tsx
@@ -125,7 +125,7 @@ const DocumentUploadPopupContent: FC<{
 				onPointerDownOutside={(e) => e.preventDefault()}
 				onInteractOutside={(e) => e.preventDefault()}
 				onEscapeKeyDown={(e) => e.preventDefault()}
-				className="select-none max-w-2xl w-[95vw] sm:w-[640px] h-[min(440px,75dvh)] sm:h-[min(500px,80vh)] flex flex-col p-0 gap-0 overflow-hidden border border-border ring-0 bg-muted dark:bg-muted text-foreground [&>button]:right-3 sm:[&>button]:right-6 [&>button]:top-3 sm:[&>button]:top-5 [&>button]:opacity-80 hover:[&>button]:opacity-100 [&>button]:z-[100] [&>button_svg]:size-4 sm:[&>button_svg]:size-5"
+				className="select-none max-w-2xl w-[95vw] sm:w-[640px] h-[min(440px,75dvh)] sm:h-[min(500px,80vh)] flex flex-col p-0 gap-0 overflow-hidden border border-border ring-0 bg-muted dark:bg-muted text-foreground [&>button]:right-3 sm:[&>button]:right-6 [&>button]:top-3 sm:[&>button]:top-5 [&>button]:opacity-80 [&>button]:hover:opacity-100 [&>button]:hover:bg-foreground/10 [&>button]:z-[100] [&>button>svg]:size-4 sm:[&>button>svg]:size-5"
 			>
 				<DialogTitle className="sr-only">Upload Document</DialogTitle>
 
diff --git a/surfsense_web/components/layout/providers/LayoutDataProvider.tsx b/surfsense_web/components/layout/providers/LayoutDataProvider.tsx
index 6138b67fb..74c3c64de 100644
--- a/surfsense_web/components/layout/providers/LayoutDataProvider.tsx
+++ b/surfsense_web/components/layout/providers/LayoutDataProvider.tsx
@@ -347,35 +347,37 @@ export function LayoutDataProvider({ searchSpaceId, children }: LayoutDataProvid
 
 	// Navigation items
 	const navItems: NavItem[] = useMemo(
-		() => [
-			{
-				title: "Inbox",
-				url: "#inbox",
-				icon: Inbox,
-				isActive: isInboxSidebarOpen,
-				badge: totalUnreadCount > 0 ? formatInboxCount(totalUnreadCount) : undefined,
-			},
-			{
-				title: "Documents",
-				url: "#documents",
-				icon: SquareLibrary,
-				isActive: isMobile
-					? isDocumentsSidebarOpen
-					: isDocumentsSidebarOpen && !isRightPanelCollapsed,
-			},
-			{
-				title: "Announcements",
-				url: "#announcements",
-				icon: Megaphone,
-				isActive: isAnnouncementsSidebarOpen,
-				badge: announcementUnreadCount > 0 ? formatInboxCount(announcementUnreadCount) : undefined,
-			},
-		],
+		() =>
+			(
+				[
+					{
+						title: "Inbox",
+						url: "#inbox",
+						icon: Inbox,
+						isActive: isInboxSidebarOpen,
+						badge: totalUnreadCount > 0 ? formatInboxCount(totalUnreadCount) : undefined,
+					},
+					isMobile
+						? {
+								title: "Documents",
+								url: "#documents",
+								icon: SquareLibrary,
+								isActive: isDocumentsSidebarOpen,
+							}
+						: null,
+					{
+						title: "Announcements",
+						url: "#announcements",
+						icon: Megaphone,
+						isActive: isAnnouncementsSidebarOpen,
+						badge: announcementUnreadCount > 0 ? formatInboxCount(announcementUnreadCount) : undefined,
+					},
+				] as (NavItem | null)[]
+			).filter((item): item is NavItem => item !== null),
 		[
 			isMobile,
 			isInboxSidebarOpen,
 			isDocumentsSidebarOpen,
-			isRightPanelCollapsed,
 			totalUnreadCount,
 			isAnnouncementsSidebarOpen,
 			announcementUnreadCount,
diff --git a/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx b/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx
index 717f5a459..59683b6dc 100644
--- a/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx
+++ b/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx
@@ -78,7 +78,7 @@ export function RightPanelExpandButton() {
 	if (!collapsed || !hasContent) return null;
 
 	return (
-		<div className="flex shrink-0 items-center px-1">
+		<div className="flex shrink-0 items-center px-0.5">
 			<Tooltip>
 				<TooltipTrigger asChild>
 					<Button
diff --git a/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx
index 51ecedd94..0b8953ae0 100644
--- a/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx
+++ b/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx
@@ -908,27 +908,61 @@ export function InboxSidebarContent({
 									)}
 									style={{ contentVisibility: "auto", containIntrinsicSize: "0 80px" }}
 								>
-									<button
-										type="button"
-										onClick={() => handleItemClick(item)}
-										disabled={isMarkingAsRead}
-										className="flex items-center gap-3 flex-1 min-w-0 text-left overflow-hidden"
-									>
-										<div className="shrink-0">{getStatusIcon(item)}</div>
-										<div className="flex-1 min-w-0 overflow-hidden">
-											<p
-												className={cn(
-													"text-xs font-medium line-clamp-2",
-													!item.read && "font-semibold"
-												)}
-											>
-												{item.title}
-											</p>
-											<p className="text-[11px] text-muted-foreground line-clamp-2 mt-0.5">
-												{convertRenderedToDisplay(item.message)}
-											</p>
-										</div>
-									</button>
+									{activeTab === "status" ? (
+										<Tooltip delayDuration={600}>
+											<TooltipTrigger asChild>
+												<button
+													type="button"
+													onClick={() => handleItemClick(item)}
+													disabled={isMarkingAsRead}
+													className="flex items-center gap-3 flex-1 min-w-0 text-left overflow-hidden"
+												>
+													<div className="shrink-0">{getStatusIcon(item)}</div>
+													<div className="flex-1 min-w-0 overflow-hidden">
+														<p
+															className={cn(
+																"text-xs font-medium line-clamp-2",
+																!item.read && "font-semibold"
+															)}
+														>
+															{item.title}
+														</p>
+														<p className="text-[11px] text-muted-foreground line-clamp-2 mt-0.5">
+															{convertRenderedToDisplay(item.message)}
+														</p>
+													</div>
+												</button>
+											</TooltipTrigger>
+											<TooltipContent side="bottom" align="start" className="max-w-[250px]">
+												<p className="font-medium">{item.title}</p>
+												<p className="text-muted-foreground mt-1">
+													{convertRenderedToDisplay(item.message)}
+												</p>
+											</TooltipContent>
+										</Tooltip>
+									) : (
+										<button
+											type="button"
+											onClick={() => handleItemClick(item)}
+											disabled={isMarkingAsRead}
+											className="flex items-center gap-3 flex-1 min-w-0 text-left overflow-hidden"
+										>
+											<div className="shrink-0">{getStatusIcon(item)}</div>
+											<div className="flex-1 min-w-0 overflow-hidden">
+												<p
+													className={cn(
+														"text-xs font-medium line-clamp-2",
+														!item.read && "font-semibold"
+													)}
+												>
+													{item.title}
+												</p>
+												<p className="text-[11px] text-muted-foreground line-clamp-2 mt-0.5">
+													{convertRenderedToDisplay(item.message)}
+												</p>
+											</div>
+										</button>
+									)}
 
 									<div className="flex items-center justify-end gap-1.5 shrink-0 w-10">
 										<span className="text-[10px] text-muted-foreground">
diff --git a/surfsense_web/components/layout/ui/sidebar/PageUsageDisplay.tsx b/surfsense_web/components/layout/ui/sidebar/PageUsageDisplay.tsx
index 73347e304..7dd46e484 100644
--- a/surfsense_web/components/layout/ui/sidebar/PageUsageDisplay.tsx
+++ b/surfsense_web/components/layout/ui/sidebar/PageUsageDisplay.tsx
@@ -35,7 +35,7 @@ export function PageUsageDisplay({ pagesUsed, pagesLimit }: PageUsageDisplayProp
 				<Progress value={usagePercentage} className="h-1.5" />
 				<Link
 					href={`/dashboard/${searchSpaceId}/more-pages`}
-					className="group flex w-full items-center justify-between rounded-md px-1.5 py-1 -mx-1.5 transition-colors hover:bg-accent"
+					className="group flex w-[calc(100%+0.75rem)] items-center justify-between rounded-md px-1.5 py-1 -mx-1.5 transition-colors hover:bg-accent"
 				>
 					<span className="flex items-center gap-1.5 text-xs text-muted-foreground group-hover:text-accent-foreground">
 						<Zap className="h-3 w-3 shrink-0" />
@@ -48,7 +48,7 @@ export function PageUsageDisplay({ pagesUsed, pagesLimit }: PageUsageDisplayProp
 				{pageBuyingEnabled && (
 					<Link
 						href={`/dashboard/${searchSpaceId}/buy-pages`}
-						className="group flex w-full items-center justify-between rounded-md px-1.5 py-1 -mx-1.5 transition-colors hover:bg-accent"
+						className="group flex w-[calc(100%+0.75rem)] items-center justify-between rounded-md px-1.5 py-1 -mx-1.5 transition-colors hover:bg-accent"
 					>
 						<span className="flex items-center gap-1.5 text-xs text-muted-foreground group-hover:text-accent-foreground">
 							<CreditCard className="h-3 w-3 shrink-0" />

From 46c15c11dadfced3759f3c04afeb4eef41dc482a Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Mon, 6 Apr 2026 12:29:55 +0530
Subject: [PATCH 15/37] refactor: update layout and styling in
 DocumentUploadPopup for improved visual hierarchy and spacing

---
 .../components/assistant-ui/document-upload-popup.tsx     | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/surfsense_web/components/assistant-ui/document-upload-popup.tsx b/surfsense_web/components/assistant-ui/document-upload-popup.tsx
index fdd4131e5..7b0409345 100644
--- a/surfsense_web/components/assistant-ui/document-upload-popup.tsx
+++ b/surfsense_web/components/assistant-ui/document-upload-popup.tsx
@@ -125,18 +125,18 @@ const DocumentUploadPopupContent: FC<{
 				onPointerDownOutside={(e) => e.preventDefault()}
 				onInteractOutside={(e) => e.preventDefault()}
 				onEscapeKeyDown={(e) => e.preventDefault()}
-				className="select-none max-w-2xl w-[95vw] sm:w-[640px] h-[min(440px,75dvh)] sm:h-[min(500px,80vh)] flex flex-col p-0 gap-0 overflow-hidden border border-border ring-0 bg-muted dark:bg-muted text-foreground [&>button]:right-3 sm:[&>button]:right-6 [&>button]:top-3 sm:[&>button]:top-5 [&>button]:opacity-80 [&>button]:hover:opacity-100 [&>button]:hover:bg-foreground/10 [&>button]:z-[100] [&>button>svg]:size-4 sm:[&>button>svg]:size-5"
+				className="select-none max-w-2xl w-[95vw] sm:w-[640px] h-[min(440px,75dvh)] sm:h-[min(520px,80vh)] flex flex-col p-0 gap-0 overflow-hidden border border-border ring-0 bg-muted dark:bg-muted text-foreground [&>button]:right-3 sm:[&>button]:right-6 [&>button]:top-5 sm:[&>button]:top-8 [&>button]:opacity-80 [&>button]:hover:opacity-100 [&>button]:hover:bg-foreground/10 [&>button]:z-[100] [&>button>svg]:size-4 sm:[&>button>svg]:size-5"
 			>
 				<DialogTitle className="sr-only">Upload Document</DialogTitle>
 
 				<div className="flex-1 min-h-0 overflow-y-auto overscroll-contain">
-					<div className="sticky top-0 z-20 bg-muted px-4 sm:px-6 pt-4 sm:pt-5 pb-10">
+					<div className="sticky top-0 z-20 bg-muted px-4 sm:px-6 pt-6 sm:pt-8 pb-10">
 						<div className="flex items-center gap-2 mb-1 pr-8 sm:pr-0">
-							<h2 className="text-base sm:text-lg font-semibold tracking-tight">
+							<h2 className="text-xl sm:text-3xl font-semibold tracking-tight">
 								Upload Documents
 							</h2>
 						</div>
-						<p className="text-xs sm:text-sm text-muted-foreground line-clamp-1">
+						<p className="text-xs sm:text-base text-muted-foreground/80 line-clamp-1">
 							Upload and sync your documents to your search space
 						</p>
 					</div>

From 02323e7b55da78c3de0ec71dd20904f6bed49c16 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Mon, 6 Apr 2026 12:56:29 +0530
Subject: [PATCH 16/37] refactor: enhance DocumentsFilters component with
 ToggleGroup for folder creation and improve search functionality

---
 .../(manage)/components/DocumentsFilters.tsx  | 232 +++++++++---------
 .../components/new-chat/model-selector.tsx    |   7 +-
 .../components/shared/image-config-dialog.tsx |   2 +-
 .../components/shared/model-config-dialog.tsx |   2 +-
 4 files changed, 124 insertions(+), 119 deletions(-)

diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx
index b85af13b7..b043c61e9 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx
@@ -8,6 +8,7 @@ import { Button } from "@/components/ui/button";
 import { Checkbox } from "@/components/ui/checkbox";
 import { Input } from "@/components/ui/input";
 import { Popover, PopoverContent, PopoverTrigger } from "@/components/ui/popover";
+import { ToggleGroup, ToggleGroupItem } from "@/components/ui/toggle-group";
 import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
 import type { DocumentTypeEnum } from "@/contracts/types/document.types";
 import { getDocumentTypeIcon, getDocumentTypeLabel } from "./DocumentTypeIcon";
@@ -63,109 +64,129 @@ export function DocumentsFilters({
 	return (
 		<div className="flex select-none">
 			<div className="flex items-center gap-2 w-full">
-				{/* Type Filter */}
-				<Popover>
-					<PopoverTrigger asChild>
-						<Button
-							variant="outline"
-							size="icon"
-							className="h-9 w-9 shrink-0 border-dashed border-sidebar-border text-sidebar-foreground/60 hover:text-sidebar-foreground hover:border-sidebar-border bg-sidebar"
-						>
-							<ListFilter size={14} />
-							{activeTypes.length > 0 && (
-								<span className="absolute -top-1 -right-1 flex h-4 w-4 items-center justify-center rounded-full bg-primary text-[9px] font-medium text-primary-foreground">
-									{activeTypes.length}
-								</span>
-							)}
-						</Button>
-					</PopoverTrigger>
-					<PopoverContent className="w-56 md:w-52 !p-0 overflow-hidden" align="end">
-						<div>
-							{/* Search input */}
-							<div className="p-2">
-								<div className="relative">
-									<Search className="absolute left-0.5 top-1/2 -translate-y-1/2 h-4 w-4 text-muted-foreground" />
-									<Input
-										placeholder="Search types"
-										value={typeSearchQuery}
-										onChange={(e) => setTypeSearchQuery(e.target.value)}
-										className="h-6 pl-6 text-sm bg-transparent border-0 shadow-none"
-									/>
-								</div>
-							</div>
+				{/* Filter + New Folder Toggle Group */}
+				<ToggleGroup type="multiple" variant="outline" value={[]}>
+					{onCreateFolder && (
+						<Tooltip>
+							<TooltipTrigger asChild>
+								<ToggleGroupItem
+									value="folder"
+									className="h-9 w-9 shrink-0 border-sidebar-border text-sidebar-foreground/60 hover:text-sidebar-foreground hover:border-sidebar-border bg-sidebar"
+									onClick={(e) => {
+										e.preventDefault();
+										onCreateFolder();
+									}}
+								>
+									<FolderPlus size={14} />
+								</ToggleGroupItem>
+							</TooltipTrigger>
+							<TooltipContent>New folder</TooltipContent>
+						</Tooltip>
+					)}
 
-							<div
-								className="max-h-[300px] overflow-y-auto overflow-x-hidden py-1.5 px-1.5"
-								onScroll={handleScroll}
-								style={{
-									maskImage: `linear-gradient(to bottom, ${scrollPos === "top" ? "black" : "transparent"}, black 16px, black calc(100% - 16px), ${scrollPos === "bottom" ? "black" : "transparent"})`,
-									WebkitMaskImage: `linear-gradient(to bottom, ${scrollPos === "top" ? "black" : "transparent"}, black 16px, black calc(100% - 16px), ${scrollPos === "bottom" ? "black" : "transparent"})`,
-								}}
-							>
-								{filteredTypes.length === 0 ? (
-									<div className="py-6 text-center text-sm text-muted-foreground">
-										No types found
+					<Popover>
+						<Tooltip>
+							<TooltipTrigger asChild>
+								<PopoverTrigger asChild>
+									<ToggleGroupItem
+										value="filter"
+										className="relative h-9 w-9 shrink-0 border-sidebar-border text-sidebar-foreground/60 hover:text-sidebar-foreground hover:border-sidebar-border bg-sidebar"
+									>
+										<ListFilter size={14} />
+										{activeTypes.length > 0 && (
+											<span className="absolute -top-1 -right-1 flex h-4 w-4 items-center justify-center rounded-full bg-primary text-[9px] font-medium text-primary-foreground">
+												{activeTypes.length}
+											</span>
+										)}
+									</ToggleGroupItem>
+								</PopoverTrigger>
+							</TooltipTrigger>
+							<TooltipContent>Filter by type</TooltipContent>
+						</Tooltip>
+						<PopoverContent className="w-56 md:w-52 !p-0 overflow-hidden" align="start">
+							<div>
+								<div className="p-2">
+									<div className="relative">
+										<Search className="absolute left-0.5 top-1/2 -translate-y-1/2 h-4 w-4 text-muted-foreground" />
+										<Input
+											placeholder="Search types"
+											value={typeSearchQuery}
+											onChange={(e) => setTypeSearchQuery(e.target.value)}
+											className="h-6 pl-6 text-sm bg-transparent border-0 shadow-none"
+										/>
 									</div>
-								) : (
-									filteredTypes.map((value: DocumentTypeEnum, i) => (
-										<div
-											role="option"
-											aria-selected={activeTypes.includes(value)}
-											tabIndex={0}
-											key={value}
-											className="flex w-full items-center gap-2.5 py-2 px-3 rounded-md hover:bg-neutral-200 dark:hover:bg-neutral-700 transition-colors cursor-pointer text-left"
-											onClick={() => onToggleType(value, !activeTypes.includes(value))}
-											onKeyDown={(e) => {
-												if (e.key === "Enter" || e.key === " ") {
-													e.preventDefault();
-													onToggleType(value, !activeTypes.includes(value));
-												}
+								</div>
+
+								<div
+									className="max-h-[300px] overflow-y-auto overflow-x-hidden py-1.5 px-1.5"
+									onScroll={handleScroll}
+									style={{
+										maskImage: `linear-gradient(to bottom, ${scrollPos === "top" ? "black" : "transparent"}, black 16px, black calc(100% - 16px), ${scrollPos === "bottom" ? "black" : "transparent"})`,
+										WebkitMaskImage: `linear-gradient(to bottom, ${scrollPos === "top" ? "black" : "transparent"}, black 16px, black calc(100% - 16px), ${scrollPos === "bottom" ? "black" : "transparent"})`,
+									}}
+								>
+									{filteredTypes.length === 0 ? (
+										<div className="py-6 text-center text-sm text-muted-foreground">
+											No types found
+										</div>
+									) : (
+										filteredTypes.map((value: DocumentTypeEnum, i) => (
+											<div
+												role="option"
+												aria-selected={activeTypes.includes(value)}
+												tabIndex={0}
+												key={value}
+												className="flex w-full items-center gap-2.5 py-2 px-3 rounded-md hover:bg-neutral-200 dark:hover:bg-neutral-700 transition-colors cursor-pointer text-left"
+												onClick={() => onToggleType(value, !activeTypes.includes(value))}
+												onKeyDown={(e) => {
+													if (e.key === "Enter" || e.key === " ") {
+														e.preventDefault();
+														onToggleType(value, !activeTypes.includes(value));
+													}
+												}}
+											>
+												<div className="flex h-7 w-7 shrink-0 items-center justify-center rounded-md bg-muted/50 text-foreground/80">
+													{getDocumentTypeIcon(value, "h-4 w-4")}
+												</div>
+												<div className="flex flex-col min-w-0 flex-1 gap-0.5">
+													<span className="text-[13px] font-medium text-foreground truncate leading-tight">
+														{getDocumentTypeLabel(value)}
+													</span>
+													<span className="text-[11px] text-muted-foreground leading-tight">
+														{typeCounts.get(value)} document
+														{(typeCounts.get(value) ?? 0) !== 1 ? "s" : ""}
+													</span>
+												</div>
+												<Checkbox
+													id={`${id}-${i}`}
+													checked={activeTypes.includes(value)}
+													onCheckedChange={(checked: boolean) => onToggleType(value, !!checked)}
+													className="h-4 w-4 shrink-0 rounded border-muted-foreground/30 data-[state=checked]:bg-primary data-[state=checked]:border-primary"
+												/>
+											</div>
+										))
+									)}
+								</div>
+								{activeTypes.length > 0 && (
+									<div className="px-3 pt-1.5 pb-1.5 border-t border-border dark:border-neutral-700">
+										<Button
+											variant="ghost"
+											size="sm"
+											className="w-full h-7 text-[11px] text-muted-foreground hover:text-foreground hover:bg-neutral-200 dark:hover:bg-neutral-700"
+											onClick={() => {
+												activeTypes.forEach((t) => {
+													onToggleType(t, false);
+												});
 											}}
 										>
-											{/* Icon */}
-											<div className="flex h-7 w-7 shrink-0 items-center justify-center rounded-md bg-muted/50 text-foreground/80">
-												{getDocumentTypeIcon(value, "h-4 w-4")}
-											</div>
-											{/* Text content */}
-											<div className="flex flex-col min-w-0 flex-1 gap-0.5">
-												<span className="text-[13px] font-medium text-foreground truncate leading-tight">
-													{getDocumentTypeLabel(value)}
-												</span>
-												<span className="text-[11px] text-muted-foreground leading-tight">
-													{typeCounts.get(value)} document
-													{(typeCounts.get(value) ?? 0) !== 1 ? "s" : ""}
-												</span>
-											</div>
-											{/* Checkbox */}
-											<Checkbox
-												id={`${id}-${i}`}
-												checked={activeTypes.includes(value)}
-												onCheckedChange={(checked: boolean) => onToggleType(value, !!checked)}
-												className="h-4 w-4 shrink-0 rounded border-muted-foreground/30 data-[state=checked]:bg-primary data-[state=checked]:border-primary"
-											/>
-										</div>
-									))
+											Clear filters
+										</Button>
+									</div>
 								)}
 							</div>
-							{activeTypes.length > 0 && (
-								<div className="px-3 pt-1.5 pb-1.5 border-t border-border dark:border-neutral-700">
-									<Button
-										variant="ghost"
-										size="sm"
-										className="w-full h-7 text-[11px] text-muted-foreground hover:text-foreground hover:bg-neutral-200 dark:hover:bg-neutral-700"
-										onClick={() => {
-											activeTypes.forEach((t) => {
-												onToggleType(t, false);
-											});
-										}}
-									>
-										Clear filters
-									</Button>
-								</div>
-							)}
-						</div>
-					</PopoverContent>
-				</Popover>
+						</PopoverContent>
+					</Popover>
+				</ToggleGroup>
 
 				{/* Search Input */}
 				<div className="relative flex-1 min-w-0">
@@ -197,23 +218,6 @@ export function DocumentsFilters({
 					)}
 				</div>
 
-				{/* New Folder Button */}
-				{onCreateFolder && (
-					<Tooltip>
-						<TooltipTrigger asChild>
-							<Button
-								variant="outline"
-								size="icon"
-								className="h-9 w-9 shrink-0 border-dashed border-sidebar-border text-sidebar-foreground/60 hover:text-sidebar-foreground hover:border-sidebar-border bg-sidebar"
-								onClick={onCreateFolder}
-							>
-								<FolderPlus size={14} />
-							</Button>
-						</TooltipTrigger>
-						<TooltipContent>New folder</TooltipContent>
-					</Tooltip>
-				)}
-
 				{/* Upload Button */}
 				<Button
 					data-joyride="upload-button"
diff --git a/surfsense_web/components/new-chat/model-selector.tsx b/surfsense_web/components/new-chat/model-selector.tsx
index b207d82b4..39f88f794 100644
--- a/surfsense_web/components/new-chat/model-selector.tsx
+++ b/surfsense_web/components/new-chat/model-selector.tsx
@@ -1,7 +1,7 @@
 "use client";
 
 import { useAtomValue } from "jotai";
-import { Bot, Check, ChevronDown, Edit3, ImageIcon, Plus, Zap } from "lucide-react";
+import { Bot, Check, ChevronDown, Edit3, ImageIcon, Plus, Search, Zap } from "lucide-react";
 import { type UIEvent, useCallback, useMemo, useState } from "react";
 import { toast } from "sonner";
 import {
@@ -344,7 +344,7 @@ export function ModelSelector({
 							>
 								<CommandEmpty className="py-8 text-center">
 									<div className="flex flex-col items-center gap-2">
-										<Bot className="size-8 text-muted-foreground" />
+										<Search className="size-8 text-muted-foreground" />
 										<p className="text-sm text-muted-foreground">No models found</p>
 										<p className="text-xs text-muted-foreground/60">Try a different search term</p>
 									</div>
@@ -531,8 +531,9 @@ export function ModelSelector({
 							>
 								<CommandEmpty className="py-8 text-center">
 									<div className="flex flex-col items-center gap-2">
-										<ImageIcon className="size-8 text-muted-foreground" />
+										<Search className="size-8 text-muted-foreground" />
 										<p className="text-sm text-muted-foreground">No image models found</p>
+										<p className="text-xs text-muted-foreground/60">Try a different search term</p>
 									</div>
 								</CommandEmpty>
 
diff --git a/surfsense_web/components/shared/image-config-dialog.tsx b/surfsense_web/components/shared/image-config-dialog.tsx
index 1cfbf8842..2ae53ccca 100644
--- a/surfsense_web/components/shared/image-config-dialog.tsx
+++ b/surfsense_web/components/shared/image-config-dialog.tsx
@@ -433,7 +433,7 @@ export function ImageConfigDialog({
 							className="relative text-sm h-9 min-w-[120px]"
 						>
 							<span className={isSubmitting ? "opacity-0" : ""}>
-								{mode === "edit" ? "Save Changes" : "Create & Use"}
+								{mode === "edit" ? "Save Changes" : "Add Model"}
 							</span>
 							{isSubmitting && <Spinner size="sm" className="absolute" />}
 						</Button>
diff --git a/surfsense_web/components/shared/model-config-dialog.tsx b/surfsense_web/components/shared/model-config-dialog.tsx
index 84ba821fc..4d2373b49 100644
--- a/surfsense_web/components/shared/model-config-dialog.tsx
+++ b/surfsense_web/components/shared/model-config-dialog.tsx
@@ -312,7 +312,7 @@ export function ModelConfigDialog({
 							className="relative text-sm h-9 min-w-[120px]"
 						>
 							<span className={isSubmitting ? "opacity-0" : ""}>
-								{mode === "edit" ? "Save Changes" : "Create & Use"}
+								{mode === "edit" ? "Save Changes" : "Add Model"}
 							</span>
 							{isSubmitting && <Spinner size="sm" className="absolute" />}
 						</Button>

From 8259fab254871b064e37239562fdd1104eb10bd9 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Mon, 6 Apr 2026 13:27:49 +0530
Subject: [PATCH 17/37] refactor: update connector tabs to include search
 feedback and improve icon usage for better user experience

---
 .../connector-popup/tabs/active-connectors-tab.tsx | 14 +++++++++++---
 .../connector-popup/tabs/all-connectors-tab.tsx    | 13 +++++++++++++
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/surfsense_web/components/assistant-ui/connector-popup/tabs/active-connectors-tab.tsx b/surfsense_web/components/assistant-ui/connector-popup/tabs/active-connectors-tab.tsx
index ccf1476dd..55fc99150 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/tabs/active-connectors-tab.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/tabs/active-connectors-tab.tsx
@@ -1,6 +1,6 @@
 "use client";
 
-import { Cable } from "lucide-react";
+import { Search, Unplug } from "lucide-react";
 import type { FC } from "react";
 import { getDocumentTypeLabel } from "@/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentTypeIcon";
 import { Button } from "@/components/ui/button";
@@ -134,9 +134,17 @@ export const ActiveConnectorsTab: FC<ActiveConnectorsTabProps> = ({
 	const hasActiveConnectors =
 		filteredOAuthConnectorTypes.length > 0 || filteredNonOAuthConnectors.length > 0;
 
+	const hasFilteredResults = hasActiveConnectors || standaloneDocuments.length > 0;
+
 	return (
 		<TabsContent value="active" className="m-0">
-			{hasSources ? (
+			{hasSources && !hasFilteredResults && searchQuery ? (
+				<div className="flex flex-col items-center justify-center py-20 text-center">
+					<Search className="size-8 text-muted-foreground mb-3" />
+					<p className="text-sm text-muted-foreground">No connectors found</p>
+					<p className="text-xs text-muted-foreground/60 mt-1">Try a different search term</p>
+				</div>
+			) : hasSources ? (
 				<div className="space-y-6">
 					{/* Active Connectors Section */}
 					{hasActiveConnectors && (
@@ -302,7 +310,7 @@ export const ActiveConnectorsTab: FC<ActiveConnectorsTabProps> = ({
 			) : (
 				<div className="flex flex-col items-center justify-center py-20 text-center">
 					<div className="flex h-16 w-16 items-center justify-center rounded-full bg-muted mb-4">
-						<Cable className="size-8 text-muted-foreground" />
+						<Unplug className="size-8 text-muted-foreground" />
 					</div>
 					<h4 className="text-lg font-semibold">No active sources</h4>
 					<p className="text-sm text-muted-foreground mt-1 max-w-[280px]">
diff --git a/surfsense_web/components/assistant-ui/connector-popup/tabs/all-connectors-tab.tsx b/surfsense_web/components/assistant-ui/connector-popup/tabs/all-connectors-tab.tsx
index 3e8aad620..0afc192da 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/tabs/all-connectors-tab.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/tabs/all-connectors-tab.tsx
@@ -1,5 +1,6 @@
 "use client";
 
+import { Search } from "lucide-react";
 import type { FC } from "react";
 import { EnumConnectorName } from "@/contracts/enums/connector";
 import type { SearchSourceConnector } from "@/contracts/types/connector.types";
@@ -287,6 +288,18 @@ export const AllConnectorsTab: FC<AllConnectorsTabProps> = ({
 		moreIntegrationsOther.length > 0 ||
 		moreIntegrationsCrawlers.length > 0;
 
+	const hasAnyResults = hasDocumentFileConnectors || hasMoreIntegrations;
+
+	if (!hasAnyResults && searchQuery) {
+		return (
+			<div className="flex flex-col items-center justify-center py-20 text-center">
+				<Search className="size-8 text-muted-foreground mb-3" />
+				<p className="text-sm text-muted-foreground">No connectors found</p>
+				<p className="text-xs text-muted-foreground/60 mt-1">Try a different search term</p>
+			</div>
+		);
+	}
+
 	return (
 		<div className="space-y-8">
 			{/* Document/Files Connectors */}

From 3251f0e98d700373cdefe3eeb03220916ec1d28c Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Mon, 6 Apr 2026 13:56:28 +0530
Subject: [PATCH 18/37] refactor: remove childCount prop from FolderNode and
 optimize FolderTreeView by eliminating unnecessary child count calculations

---
 surfsense_web/components/documents/FolderNode.tsx   |  8 --------
 .../components/documents/FolderTreeView.tsx         | 13 +------------
 .../components/layout/ui/right-panel/RightPanel.tsx |  5 ++---
 3 files changed, 3 insertions(+), 23 deletions(-)

diff --git a/surfsense_web/components/documents/FolderNode.tsx b/surfsense_web/components/documents/FolderNode.tsx
index 88cc76c69..2ec430871 100644
--- a/surfsense_web/components/documents/FolderNode.tsx
+++ b/surfsense_web/components/documents/FolderNode.tsx
@@ -56,7 +56,6 @@ interface FolderNodeProps {
 	depth: number;
 	isExpanded: boolean;
 	isRenaming: boolean;
-	childCount: number;
 	selectionState: FolderSelectionState;
 	processingState: "idle" | "processing" | "failed";
 	onToggleSelect: (folderId: number, selectAll: boolean) => void;
@@ -101,7 +100,6 @@ export const FolderNode = React.memo(function FolderNode({
 	depth,
 	isExpanded,
 	isRenaming,
-	childCount,
 	selectionState,
 	processingState,
 	onToggleSelect,
@@ -336,12 +334,6 @@ export const FolderNode = React.memo(function FolderNode({
 						<span className="flex-1 min-w-0 truncate">{folder.name}</span>
 					)}
 
-					{!isRenaming && childCount > 0 && (
-						<span className="shrink-0 text-[10px] text-muted-foreground tabular-nums">
-							{childCount}
-						</span>
-					)}
-
 					{!isRenaming && (
 						<DropdownMenu>
 							<DropdownMenuTrigger asChild>
diff --git a/surfsense_web/components/documents/FolderTreeView.tsx b/surfsense_web/components/documents/FolderTreeView.tsx
index 1df007c0b..d9c74e0db 100644
--- a/surfsense_web/components/documents/FolderTreeView.tsx
+++ b/surfsense_web/components/documents/FolderTreeView.tsx
@@ -86,16 +86,6 @@ export function FolderTreeView({
 
 	const docsByFolder = useMemo(() => groupBy(documents, (d) => d.folderId ?? "root"), [documents]);
 
-	const folderChildCounts = useMemo(() => {
-		const counts: Record<number, number> = {};
-		for (const f of folders) {
-			const children = foldersByParent[f.id] ?? [];
-			const docs = docsByFolder[f.id] ?? [];
-			counts[f.id] = children.length + docs.length;
-		}
-		return counts;
-	}, [folders, foldersByParent, docsByFolder]);
-
 	const [openContextMenuId, setOpenContextMenuId] = useState<string | null>(null);
 
 	// Single subscription for rename state — derived boolean passed to each FolderNode
@@ -226,8 +216,7 @@ export function FolderTreeView({
 					depth={depth}
 					isExpanded={isExpanded}
 					isRenaming={renamingFolderId === f.id}
-					childCount={folderChildCounts[f.id] ?? 0}
-					selectionState={folderSelectionStates[f.id] ?? "none"}
+				selectionState={folderSelectionStates[f.id] ?? "none"}
 					processingState={folderProcessingStates[f.id] ?? "idle"}
 					onToggleSelect={onToggleFolderSelect}
 					onToggleExpand={onToggleExpand}
diff --git a/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx b/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx
index 59683b6dc..83b7d5d1f 100644
--- a/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx
+++ b/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx
@@ -10,7 +10,6 @@ import { documentsSidebarOpenAtom } from "@/atoms/documents/ui.atoms";
 import { closeEditorPanelAtom, editorPanelAtom } from "@/atoms/editor/editor-panel.atom";
 import { rightPanelCollapsedAtom, rightPanelTabAtom } from "@/atoms/layout/right-panel.atom";
 import { Button } from "@/components/ui/button";
-import { Skeleton } from "@/components/ui/skeleton";
 import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
 import { DocumentsSidebar } from "../sidebar";
 
@@ -27,7 +26,7 @@ const HitlEditPanelContent = dynamic(
 		import("@/components/hitl-edit-panel/hitl-edit-panel").then((m) => ({
 			default: m.HitlEditPanelContent,
 		})),
-	{ ssr: false, loading: () => <Skeleton className="h-96 w-full" /> }
+	{ ssr: false, loading: () => null }
 );
 
 const ReportPanelContent = dynamic(
@@ -35,7 +34,7 @@ const ReportPanelContent = dynamic(
 		import("@/components/report-panel/report-panel").then((m) => ({
 			default: m.ReportPanelContent,
 		})),
-	{ ssr: false, loading: () => <Skeleton className="h-96 w-full" /> }
+	{ ssr: false, loading: () => null }
 );
 
 interface RightPanelProps {

From be7e73e615a850e4adee13aff6d91654abefdf51 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Mon, 6 Apr 2026 14:41:53 +0530
Subject: [PATCH 19/37] refactor: enhance DocumentsFilters and FolderTreeView
 components for improved filter handling and search functionality

---
 .../(manage)/components/DocumentsFilters.tsx  | 22 +++----------------
 .../components/documents/FolderTreeView.tsx   | 17 +++++++++-----
 .../layout/ui/right-panel/RightPanel.tsx      |  2 +-
 .../layout/ui/sidebar/DocumentsSidebar.tsx    |  5 +++--
 .../contracts/enums/connectorIcons.tsx        |  2 +-
 5 files changed, 20 insertions(+), 28 deletions(-)

diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx
index b043c61e9..a795b61c7 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx
@@ -65,7 +65,7 @@ export function DocumentsFilters({
 		<div className="flex select-none">
 			<div className="flex items-center gap-2 w-full">
 				{/* Filter + New Folder Toggle Group */}
-				<ToggleGroup type="multiple" variant="outline" value={[]}>
+				<ToggleGroup type="multiple" variant="outline" value={[]} className="overflow-visible">
 					{onCreateFolder && (
 						<Tooltip>
 							<TooltipTrigger asChild>
@@ -90,11 +90,11 @@ export function DocumentsFilters({
 								<PopoverTrigger asChild>
 									<ToggleGroupItem
 										value="filter"
-										className="relative h-9 w-9 shrink-0 border-sidebar-border text-sidebar-foreground/60 hover:text-sidebar-foreground hover:border-sidebar-border bg-sidebar"
+										className="relative h-9 w-9 shrink-0 border-sidebar-border text-sidebar-foreground/60 hover:text-sidebar-foreground hover:border-sidebar-border bg-sidebar overflow-visible"
 									>
 										<ListFilter size={14} />
 										{activeTypes.length > 0 && (
-											<span className="absolute -top-1 -right-1 flex h-4 w-4 items-center justify-center rounded-full bg-primary text-[9px] font-medium text-primary-foreground">
+											<span className="absolute -top-1 -right-1 flex h-4 w-4 items-center justify-center rounded-full bg-sidebar-border text-[9px] font-medium text-sidebar-foreground">
 												{activeTypes.length}
 											</span>
 										)}
@@ -167,22 +167,6 @@ export function DocumentsFilters({
 										))
 									)}
 								</div>
-								{activeTypes.length > 0 && (
-									<div className="px-3 pt-1.5 pb-1.5 border-t border-border dark:border-neutral-700">
-										<Button
-											variant="ghost"
-											size="sm"
-											className="w-full h-7 text-[11px] text-muted-foreground hover:text-foreground hover:bg-neutral-200 dark:hover:bg-neutral-700"
-											onClick={() => {
-												activeTypes.forEach((t) => {
-													onToggleType(t, false);
-												});
-											}}
-										>
-											Clear filters
-										</Button>
-									</div>
-								)}
 							</div>
 						</PopoverContent>
 					</Popover>
diff --git a/surfsense_web/components/documents/FolderTreeView.tsx b/surfsense_web/components/documents/FolderTreeView.tsx
index d9c74e0db..371d00f42 100644
--- a/surfsense_web/components/documents/FolderTreeView.tsx
+++ b/surfsense_web/components/documents/FolderTreeView.tsx
@@ -96,14 +96,21 @@ export function FolderTreeView({
 	);
 	const handleCancelRename = useCallback(() => setRenamingFolderId(null), [setRenamingFolderId]);
 
+	const effectiveActiveTypes = useMemo(() => {
+		if (activeTypes.includes("FILE" as DocumentTypeEnum) && !activeTypes.includes("LOCAL_FOLDER_FILE" as DocumentTypeEnum)) {
+			return [...activeTypes, "LOCAL_FOLDER_FILE" as DocumentTypeEnum];
+		}
+		return activeTypes;
+	}, [activeTypes]);
+
 	const hasDescendantMatch = useMemo(() => {
-		if (activeTypes.length === 0 && !searchQuery) return null;
+		if (effectiveActiveTypes.length === 0 && !searchQuery) return null;
 		const match: Record<number, boolean> = {};
 
 		function check(folderId: number): boolean {
 			if (match[folderId] !== undefined) return match[folderId];
 			const childDocs = (docsByFolder[folderId] ?? []).some(
-				(d) => activeTypes.length === 0 || activeTypes.includes(d.document_type as DocumentTypeEnum)
+				(d) => effectiveActiveTypes.length === 0 || effectiveActiveTypes.includes(d.document_type as DocumentTypeEnum)
 			);
 			if (childDocs) {
 				match[folderId] = true;
@@ -124,7 +131,7 @@ export function FolderTreeView({
 			check(f.id);
 		}
 		return match;
-	}, [folders, docsByFolder, foldersByParent, activeTypes, searchQuery]);
+	}, [folders, docsByFolder, foldersByParent, effectiveActiveTypes, searchQuery]);
 
 	const folderSelectionStates = useMemo(() => {
 		const states: Record<number, FolderSelectionState> = {};
@@ -194,7 +201,7 @@ export function FolderTreeView({
 			? childFolders.filter((f) => hasDescendantMatch[f.id])
 			: childFolders;
 		const childDocs = (docsByFolder[key] ?? []).filter(
-			(d) => activeTypes.length === 0 || activeTypes.includes(d.document_type as DocumentTypeEnum)
+			(d) => effectiveActiveTypes.length === 0 || effectiveActiveTypes.includes(d.document_type as DocumentTypeEnum)
 		);
 
 		const nodes: React.ReactNode[] = [];
@@ -278,7 +285,7 @@ export function FolderTreeView({
 		);
 	}
 
-	if (treeNodes.length === 0 && (activeTypes.length > 0 || searchQuery)) {
+	if (treeNodes.length === 0 && (effectiveActiveTypes.length > 0 || searchQuery)) {
 		return (
 			<div className="flex flex-1 flex-col items-center justify-center gap-3 px-4 py-12 text-muted-foreground">
 				<Search className="h-10 w-10" />
diff --git a/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx b/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx
index 83b7d5d1f..febae35d3 100644
--- a/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx
+++ b/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx
@@ -84,7 +84,7 @@ export function RightPanelExpandButton() {
 						variant="ghost"
 						size="icon"
 						onClick={() => startTransition(() => setCollapsed(false))}
-						className="h-7 w-7 shrink-0"
+						className="h-8 w-8 shrink-0 -m-0.5"
 					>
 						<PanelRight className="h-4 w-4" />
 						<span className="sr-only">Expand panel</span>
diff --git a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx
index aa409e179..8bd7d64ea 100644
--- a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx
+++ b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx
@@ -531,7 +531,8 @@ export function DocumentsSidebar({
 	const typeCounts = useMemo(() => {
 		const counts: Partial<Record<string, number>> = {};
 		for (const d of treeDocuments) {
-			counts[d.document_type] = (counts[d.document_type] || 0) + 1;
+			const displayType = d.document_type === "LOCAL_FOLDER_FILE" ? "FILE" : d.document_type;
+			counts[displayType] = (counts[displayType] || 0) + 1;
 		}
 		return counts;
 	}, [treeDocuments]);
@@ -746,7 +747,7 @@ export function DocumentsSidebar({
 				</button>
 			</div>
 
-			<div className="flex-1 min-h-0 overflow-x-hidden pt-0 flex flex-col">
+			<div className="flex-1 min-h-0 pt-0 flex flex-col">
 				<div className="px-4 pb-2">
 					<DocumentsFilters
 						typeCounts={typeCounts}
diff --git a/surfsense_web/contracts/enums/connectorIcons.tsx b/surfsense_web/contracts/enums/connectorIcons.tsx
index ab71d58b5..1c6745db5 100644
--- a/surfsense_web/contracts/enums/connectorIcons.tsx
+++ b/surfsense_web/contracts/enums/connectorIcons.tsx
@@ -127,7 +127,7 @@ export const getConnectorIcon = (connectorType: EnumConnectorName | string, clas
 		case "DEEPEST":
 			return <Telescope {...iconProps} />;
 		case "LOCAL_FOLDER_FILE":
-			return null;
+			return <File {...iconProps} />;
 		default:
 			return <Search {...iconProps} />;
 	}

From be622c417c8b8235c1e46dea41cc8dced53f4159 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Mon, 6 Apr 2026 17:07:26 +0530
Subject: [PATCH 20/37] refactor: update loading skeleton in PlateEditor and
 clean up dark mode styles in various components

---
 surfsense_web/components/editor-panel/editor-panel.tsx   | 3 +--
 surfsense_web/components/ui/floating-toolbar.tsx         | 2 +-
 surfsense_web/components/ui/insert-toolbar-button.tsx    | 2 +-
 surfsense_web/components/ui/slash-node.tsx               | 2 +-
 surfsense_web/components/ui/turn-into-toolbar-button.tsx | 2 +-
 5 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/surfsense_web/components/editor-panel/editor-panel.tsx b/surfsense_web/components/editor-panel/editor-panel.tsx
index 05bcd2dc0..d4c64570a 100644
--- a/surfsense_web/components/editor-panel/editor-panel.tsx
+++ b/surfsense_web/components/editor-panel/editor-panel.tsx
@@ -11,13 +11,12 @@ import { MarkdownViewer } from "@/components/markdown-viewer";
 import { Alert, AlertDescription } from "@/components/ui/alert";
 import { Button } from "@/components/ui/button";
 import { Drawer, DrawerContent, DrawerHandle, DrawerTitle } from "@/components/ui/drawer";
-import { Skeleton } from "@/components/ui/skeleton";
 import { useMediaQuery } from "@/hooks/use-media-query";
 import { authenticatedFetch, getBearerToken, redirectToLogin } from "@/lib/auth-utils";
 
 const PlateEditor = dynamic(
 	() => import("@/components/editor/plate-editor").then((m) => ({ default: m.PlateEditor })),
-	{ ssr: false, loading: () => <Skeleton className="h-64 w-full" /> }
+	{ ssr: false, loading: () => <EditorPanelSkeleton /> }
 );
 
 const LARGE_DOCUMENT_THRESHOLD = 2 * 1024 * 1024; // 2MB
diff --git a/surfsense_web/components/ui/floating-toolbar.tsx b/surfsense_web/components/ui/floating-toolbar.tsx
index 843e5ecc1..634919474 100644
--- a/surfsense_web/components/ui/floating-toolbar.tsx
+++ b/surfsense_web/components/ui/floating-toolbar.tsx
@@ -65,7 +65,7 @@ export function FloatingToolbar({
 				{...rootProps}
 				ref={ref}
 				className={cn(
-					"scrollbar-hide absolute z-50 overflow-x-auto whitespace-nowrap rounded-md border bg-popover p-1 opacity-100 shadow-md print:hidden dark:bg-neutral-900 dark:border-white/5",
+					"scrollbar-hide absolute z-50 overflow-x-auto whitespace-nowrap rounded-md border bg-popover p-1 opacity-100 shadow-md print:hidden",
 					"max-w-[80vw]",
 					className
 				)}
diff --git a/surfsense_web/components/ui/insert-toolbar-button.tsx b/surfsense_web/components/ui/insert-toolbar-button.tsx
index 1a64f3759..596557aa7 100644
--- a/surfsense_web/components/ui/insert-toolbar-button.tsx
+++ b/surfsense_web/components/ui/insert-toolbar-button.tsx
@@ -189,7 +189,7 @@ export function InsertToolbarButton(props: DropdownMenuProps) {
 			</DropdownMenuTrigger>
 
 			<DropdownMenuContent
-				className="z-[100] flex max-h-[60vh] min-w-0 flex-col overflow-y-auto dark:bg-neutral-900 dark:border dark:border-white/5"
+				className="z-[100] flex max-h-[60vh] min-w-0 flex-col overflow-y-auto"
 				align="start"
 			>
 				{groups.map(({ group, items }) => (
diff --git a/surfsense_web/components/ui/slash-node.tsx b/surfsense_web/components/ui/slash-node.tsx
index d5687efcc..b3fc6f8d6 100644
--- a/surfsense_web/components/ui/slash-node.tsx
+++ b/surfsense_web/components/ui/slash-node.tsx
@@ -176,7 +176,7 @@ export function SlashInputElement({ children, ...props }: PlateElementProps) {
 			<InlineCombobox element={props.element} trigger="/">
 				<InlineComboboxInput />
 
-				<InlineComboboxContent className="dark:bg-neutral-900 dark:border dark:border-white/5">
+				<InlineComboboxContent>
 					<InlineComboboxEmpty>No results found.</InlineComboboxEmpty>
 
 					{slashCommandGroups.map(({ heading, items }) => (
diff --git a/surfsense_web/components/ui/turn-into-toolbar-button.tsx b/surfsense_web/components/ui/turn-into-toolbar-button.tsx
index 64ae5e82e..aa58eef41 100644
--- a/surfsense_web/components/ui/turn-into-toolbar-button.tsx
+++ b/surfsense_web/components/ui/turn-into-toolbar-button.tsx
@@ -150,7 +150,7 @@ export function TurnIntoToolbarButton({
 			</DropdownMenuTrigger>
 
 			<DropdownMenuContent
-				className="z-[100] ignore-click-outside/toolbar min-w-0 max-h-[60vh] overflow-y-auto dark:bg-neutral-900 dark:border dark:border-white/5"
+				className="z-[100] ignore-click-outside/toolbar min-w-0 max-h-[60vh] overflow-y-auto"
 				onCloseAutoFocus={(e) => {
 					e.preventDefault();
 					editor.tf.focus();

From b5a15b7681b05ed1d17b7a34ae2c2769caaed4c9 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Mon, 6 Apr 2026 18:36:29 +0530
Subject: [PATCH 21/37] feat: implement cursor-based delta sync for Dropbox
 integration, enhancing file indexing efficiency and preserving folder cursors
 during re-authentication

---
 .../app/connectors/dropbox/client.py          |  49 +++++
 .../app/connectors/dropbox/file_types.py      |  19 +-
 .../app/routes/dropbox_add_connector_route.py |   2 +
 .../app/services/docling_service.py           |   5 +-
 .../connector_indexers/dropbox_indexer.py     | 192 ++++++++++++++++--
 5 files changed, 249 insertions(+), 18 deletions(-)

diff --git a/surfsense_backend/app/connectors/dropbox/client.py b/surfsense_backend/app/connectors/dropbox/client.py
index dfae38f66..b177c2f8d 100644
--- a/surfsense_backend/app/connectors/dropbox/client.py
+++ b/surfsense_backend/app/connectors/dropbox/client.py
@@ -225,6 +225,55 @@ class DropboxClient:
 
         return all_items, None
 
+    async def get_latest_cursor(
+        self, path: str = "", recursive: bool = False
+    ) -> tuple[str | None, str | None]:
+        """Return ``(cursor, error)`` for the current state of ``path``.
+
+        Uses /2/files/list_folder/get_latest_cursor so get_changes can later
+        report only what changed. ``recursive=True`` asks Dropbox to track
+        subfolders as well; the default keeps the top-level-only behavior.
+        """
+        args = {"path": path, "recursive": recursive}
+        args["include_non_downloadable_files"] = True
+        resp = await self._request("/2/files/list_folder/get_latest_cursor", args)
+        if resp.status_code != 200:
+            return None, f"Failed to get cursor: {resp.status_code} - {resp.text}"
+        return resp.json().get("cursor"), None
+
+    async def get_changes(
+        self, cursor: str
+    ) -> tuple[list[dict[str, Any]], str | None, str | None]:
+        """Collect every change Dropbox reports after ``cursor``.
+
+        Requests /2/files/list_folder/continue page by page until the
+        response no longer sets ``has_more``. Returns (entries, new_cursor,
+        error); on a first-page failure entries is empty and new_cursor is
+        None, on a later failure the entries gathered so far are returned.
+        """
+        endpoint = "/2/files/list_folder/continue"
+        collected: list[dict[str, Any]] = []
+        on_first_page = True
+
+        # One request per page; every exit from the loop is a return.
+        while True:
+            resp = await self._request(endpoint, {"cursor": cursor})
+            status = resp.status_code
+            if status != 200 and not on_first_page:
+                return collected, cursor, f"Pagination failed: {status}"
+            if status == 401:
+                return [], None, "Dropbox authentication expired (401)"
+            if status != 200:
+                return [], None, f"Failed to get changes: {status} - {resp.text}"
+
+            page = resp.json()
+            collected.extend(page.get("entries", []))
+            if not page.get("has_more"):
+                return collected, page.get("cursor"), None
+            # Each page hands back the cursor to use for the next request.
+            cursor = page["cursor"]
+            on_first_page = False
+
     async def get_metadata(self, path: str) -> tuple[dict[str, Any] | None, str | None]:
         resp = await self._request("/2/files/get_metadata", {"path": path})
         if resp.status_code != 200:
diff --git a/surfsense_backend/app/connectors/dropbox/file_types.py b/surfsense_backend/app/connectors/dropbox/file_types.py
index e6d772a1c..c245e039e 100644
--- a/surfsense_backend/app/connectors/dropbox/file_types.py
+++ b/surfsense_backend/app/connectors/dropbox/file_types.py
@@ -2,7 +2,24 @@
 
 PAPER_EXTENSION = ".paper"
 
-SKIP_EXTENSIONS: frozenset[str] = frozenset()
+SKIP_EXTENSIONS: frozenset[str] = frozenset({
+    # Non-universal images (not supported by all 3 ETL pipelines)
+    ".svg", ".gif", ".webp", ".heic", ".ico",
+    ".raw", ".cr2", ".nef", ".arw", ".dng",
+    ".psd", ".ai", ".sketch", ".fig",
+    # Video
+    ".mov", ".avi", ".mkv", ".wmv", ".flv",
+    # Binaries / executables
+    ".exe", ".dll", ".so", ".dylib", ".bin", ".app", ".dmg", ".iso",
+    # Archives
+    ".zip", ".tar", ".gz", ".rar", ".7z", ".bz2",
+    # Fonts
+    ".ttf", ".otf", ".woff", ".woff2",
+    # 3D / CAD
+    ".stl", ".obj", ".fbx", ".blend",
+    # Database
+    ".db", ".sqlite", ".mdb",
+})
 
 MIME_TO_EXTENSION: dict[str, str] = {
     "application/pdf": ".pdf",
diff --git a/surfsense_backend/app/routes/dropbox_add_connector_route.py b/surfsense_backend/app/routes/dropbox_add_connector_route.py
index 941e5c00f..1dba64467 100644
--- a/surfsense_backend/app/routes/dropbox_add_connector_route.py
+++ b/surfsense_backend/app/routes/dropbox_add_connector_route.py
@@ -311,9 +311,11 @@ async def dropbox_callback(
                 )
 
             existing_cursor = db_connector.config.get("cursor")
+            existing_folder_cursors = db_connector.config.get("folder_cursors")
             db_connector.config = {
                 **connector_config,
                 "cursor": existing_cursor,
+                "folder_cursors": existing_folder_cursors,
                 "auth_expired": False,
             }
             flag_modified(db_connector, "config")
diff --git a/surfsense_backend/app/services/docling_service.py b/surfsense_backend/app/services/docling_service.py
index 82eaf7f74..360c197ed 100644
--- a/surfsense_backend/app/services/docling_service.py
+++ b/surfsense_backend/app/services/docling_service.py
@@ -111,9 +111,10 @@ class DoclingService:
                 pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
             )
 
-            # Initialize DocumentConverter
+            # Initialize DocumentConverter with PDF and IMAGE support
             self.converter = DocumentConverter(
-                format_options={InputFormat.PDF: pdf_format_option}
+                allowed_formats=[InputFormat.PDF, InputFormat.IMAGE],
+                format_options={InputFormat.PDF: pdf_format_option},
             )
 
             acceleration_type = "GPU (WSL2)" if self.use_gpu else "CPU"
diff --git a/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py b/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py
index 1b039add7..7a2f82a78 100644
--- a/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py
@@ -250,6 +250,124 @@ async def _download_and_index(
     return batch_indexed, download_failed + batch_failed
 
 
+async def _remove_document(
+    session: AsyncSession, file_id: str, search_space_id: int
+):
+    """Delete the stored document for a file that no longer exists in Dropbox.
+
+    Looks the document up by its identifier hash first, then by the
+    ``dropbox_file_id`` metadata key. Does nothing when neither matches.
+    """
+    doc_type = DocumentType.DROPBOX_FILE
+    id_hash = compute_identifier_hash(doc_type.value, file_id, search_space_id)
+    doc = await check_document_by_unique_identifier(session, id_hash)
+    if not doc:
+        metadata_id = cast(Document.document_metadata["dropbox_file_id"], String)
+        lookup = select(Document).where(
+            Document.search_space_id == search_space_id,
+            Document.document_type == doc_type,
+            metadata_id == file_id,
+        )
+        doc = (await session.execute(lookup)).scalar_one_or_none()
+    if not doc:
+        return
+    await session.delete(doc)
+
+
+async def _index_with_delta_sync(
+    dropbox_client: DropboxClient,
+    session: AsyncSession,
+    connector_id: int,
+    search_space_id: int,
+    user_id: str,
+    cursor: str,
+    task_logger: TaskLoggingService,
+    log_entry: object,
+    max_files: int,
+    on_heartbeat_callback: HeartbeatCallbackType | None = None,
+    enable_summary: bool = True,
+) -> tuple[int, int, str]:
+    """Index only the Dropbox changes reported since ``cursor``.
+
+    Deletions remove the stored document; changed files are filtered and
+    handed to ``_download_and_index``. ``max_files`` caps queued downloads;
+    when hit, the old cursor is returned so the next run replays the rest.
+
+    Returns (indexed_count, skipped_count, cursor_to_persist).
+    Raises Exception when the change list cannot be fetched.
+    """
+    await task_logger.log_task_progress(
+        log_entry,
+        f"Starting delta sync from cursor: {cursor[:20]}...",
+        {"stage": "delta_sync", "cursor_prefix": cursor[:20]},
+    )
+
+    entries, new_cursor, error = await dropbox_client.get_changes(cursor)
+    if error:
+        if "401" in error or "authentication expired" in error.lower():
+            raise Exception(
+                f"Dropbox authentication failed. Please re-authenticate. (Error: {error})"
+            )
+        raise Exception(f"Failed to fetch Dropbox changes: {error}")
+    if not entries:
+        logger.info("No changes detected since last sync")
+        return 0, 0, new_cursor or cursor
+
+    logger.info(f"Processing {len(entries)} change entries")
+    renamed_count = 0
+    skipped = 0
+    files_to_download: list[dict] = []
+
+    for entry in entries:
+        tag = entry.get(".tag")
+        if tag == "deleted":
+            # NOTE(review): the Dropbox API reference lists only name/path fields
+            # for "deleted" entries; without an "id" nothing is removed -- confirm.
+            label = entry.get("name") or entry.get("path_lower", "")
+            file_id = entry.get("id", "")
+            if file_id:
+                await _remove_document(session, file_id, search_space_id)
+            logger.debug(f"Processed deletion: {label}")
+            continue
+        if tag != "file":
+            continue
+        if skip_item(entry):
+            skipped += 1
+            continue
+
+        skip, msg = await _should_skip_file(session, entry, search_space_id)
+        if skip:
+            if msg and "renamed" in msg.lower():
+                renamed_count += 1
+            else:
+                skipped += 1
+            continue
+
+        # Replay relies on _should_skip_file skipping indexed files -- confirm.
+        if len(files_to_download) >= max_files:
+            logger.warning(f"max_files={max_files} reached; cursor not advanced")
+            new_cursor = cursor
+            break
+        files_to_download.append(entry)
+
+    batch_indexed, failed = await _download_and_index(
+        dropbox_client,
+        session,
+        files_to_download,
+        connector_id=connector_id,
+        search_space_id=search_space_id,
+        user_id=user_id,
+        enable_summary=enable_summary,
+        on_heartbeat=on_heartbeat_callback,
+    )
+
+    indexed = renamed_count + batch_indexed
+    logger.info(
+        f"Delta sync complete: {indexed} indexed, {skipped} skipped, {failed} failed"
+    )
+    return indexed, skipped, new_cursor or cursor
+
+
 async def _index_full_scan(
     dropbox_client: DropboxClient,
     session: AsyncSession,
@@ -437,6 +555,9 @@ async def index_dropbox_files(
         max_files = indexing_options.get("max_files", 500)
         incremental_sync = indexing_options.get("incremental_sync", True)
         include_subfolders = indexing_options.get("include_subfolders", True)
+        use_delta_sync = indexing_options.get("use_delta_sync", True)
+
+        folder_cursors: dict = connector.config.get("folder_cursors", {})
 
         total_indexed = 0
         total_skipped = 0
@@ -471,25 +592,66 @@ async def index_dropbox_files(
             )
             folder_name = folder.get("name", "Root")
 
-            logger.info(f"Using full scan for folder {folder_name}")
-            indexed, skipped = await _index_full_scan(
-                dropbox_client,
-                session,
-                connector_id,
-                search_space_id,
-                user_id,
-                folder_path,
-                folder_name,
-                task_logger,
-                log_entry,
-                max_files,
-                include_subfolders,
-                incremental_sync=incremental_sync,
-                enable_summary=connector_enable_summary,
+            saved_cursor = folder_cursors.get(folder_path)
+            can_use_delta = (
+                use_delta_sync
+                and saved_cursor
+                and connector.last_indexed_at
             )
+
+            if can_use_delta:
+                logger.info(f"Using delta sync for folder {folder_name}")
+                indexed, skipped, new_cursor = await _index_with_delta_sync(
+                    dropbox_client,
+                    session,
+                    connector_id,
+                    search_space_id,
+                    user_id,
+                    saved_cursor,
+                    task_logger,
+                    log_entry,
+                    max_files,
+                    enable_summary=connector_enable_summary,
+                )
+                folder_cursors[folder_path] = new_cursor
+            else:
+                logger.info(f"Using full scan for folder {folder_name}")
+                indexed, skipped = await _index_full_scan(
+                    dropbox_client,
+                    session,
+                    connector_id,
+                    search_space_id,
+                    user_id,
+                    folder_path,
+                    folder_name,
+                    task_logger,
+                    log_entry,
+                    max_files,
+                    include_subfolders,
+                    incremental_sync=incremental_sync,
+                    enable_summary=connector_enable_summary,
+                )
+
             total_indexed += indexed
             total_skipped += skipped
 
+            # Record the latest cursor for this folder (saved to config below)
+            try:
+                latest_cursor, cursor_err = await dropbox_client.get_latest_cursor(
+                    folder_path
+                )
+                if latest_cursor and not cursor_err:
+                    folder_cursors[folder_path] = latest_cursor
+            except Exception as e:
+                logger.warning(f"Failed to get latest cursor for {folder_path}: {e}")
+
+        # Persist folder cursors to connector config
+        if folders:
+            cfg = dict(connector.config)
+            cfg["folder_cursors"] = folder_cursors
+            connector.config = cfg
+            flag_modified(connector, "config")
+
         if total_indexed > 0 or folders:
             await update_connector_last_indexed(session, connector, True)
 

From caca49177499daa39f4c210fc903333233bafe53 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Mon, 6 Apr 2026 18:36:48 +0530
Subject: [PATCH 22/37] test: add unit tests for Dropbox integration, covering
 delta sync methods, file type filtering, and re-authentication behavior

---
 .../test_dropbox_parallel.py                  | 545 ++++++++++++++++++
 .../tests/unit/connectors/__init__.py         |   0
 .../unit/connectors/test_dropbox_client.py    | 115 ++++
 .../connectors/test_dropbox_file_types.py     |  73 +++
 .../unit/connectors/test_dropbox_reauth.py    |  43 ++
 .../tests/unit/services/__init__.py           |   0
 .../services/test_docling_image_support.py    |  67 +++
 7 files changed, 843 insertions(+)
 create mode 100644 surfsense_backend/tests/unit/connectors/__init__.py
 create mode 100644 surfsense_backend/tests/unit/connectors/test_dropbox_client.py
 create mode 100644 surfsense_backend/tests/unit/connectors/test_dropbox_file_types.py
 create mode 100644 surfsense_backend/tests/unit/connectors/test_dropbox_reauth.py
 create mode 100644 surfsense_backend/tests/unit/services/__init__.py
 create mode 100644 surfsense_backend/tests/unit/services/test_docling_image_support.py

diff --git a/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py b/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py
index 76f8806dc..737e2c850 100644
--- a/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py
+++ b/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py
@@ -8,6 +8,10 @@ import pytest
 from app.db import DocumentType
 from app.tasks.connector_indexers.dropbox_indexer import (
     _download_files_parallel,
+    _index_full_scan,
+    _index_selected_files,
+    _index_with_delta_sync,
+    index_dropbox_files,
 )
 
 pytestmark = pytest.mark.unit
@@ -234,3 +238,544 @@ async def test_heartbeat_fires_during_parallel_downloads(
     assert len(docs) == 3
     assert failed == 0
     assert len(heartbeat_calls) >= 1, "Heartbeat should have fired at least once"
+
+
+# ---------------------------------------------------------------------------
+# D1-D2: _index_full_scan tests
+# ---------------------------------------------------------------------------
+
+
+def _folder_dict(name: str) -> dict:
+    return {".tag": "folder", "name": name}
+
+
+@pytest.fixture
+def full_scan_mocks(mock_dropbox_client, monkeypatch):
+    """Wire up mocks for _index_full_scan in isolation."""
+    import app.tasks.connector_indexers.dropbox_indexer as _mod
+
+    mock_session = AsyncMock()
+    mock_task_logger = MagicMock()
+    mock_task_logger.log_task_progress = AsyncMock()
+    mock_log_entry = MagicMock()
+
+    skip_results: dict[str, tuple[bool, str | None]] = {}
+
+    async def _fake_skip(session, file, search_space_id):
+        from app.connectors.dropbox.file_types import should_skip_file as _skip
+        if _skip(file):
+            return True, "folder/non-downloadable"
+        return skip_results.get(file.get("id", ""), (False, None))
+
+    monkeypatch.setattr(_mod, "_should_skip_file", _fake_skip)
+
+    download_and_index_mock = AsyncMock(return_value=(0, 0))
+    monkeypatch.setattr(_mod, "_download_and_index", download_and_index_mock)
+
+    return {
+        "dropbox_client": mock_dropbox_client,
+        "session": mock_session,
+        "task_logger": mock_task_logger,
+        "log_entry": mock_log_entry,
+        "skip_results": skip_results,
+        "download_and_index_mock": download_and_index_mock,
+    }
+
+
+async def _run_full_scan(mocks, monkeypatch, page_files, *, max_files=500):
+    import app.tasks.connector_indexers.dropbox_indexer as _mod
+
+    monkeypatch.setattr(
+        _mod,
+        "get_files_in_folder",
+        AsyncMock(return_value=(page_files, None)),
+    )
+    return await _index_full_scan(
+        mocks["dropbox_client"],
+        mocks["session"],
+        _CONNECTOR_ID,
+        _SEARCH_SPACE_ID,
+        _USER_ID,
+        "",
+        "Root",
+        mocks["task_logger"],
+        mocks["log_entry"],
+        max_files,
+        enable_summary=True,
+    )
+
+
+async def test_full_scan_three_phase_counts(full_scan_mocks, monkeypatch):
+    """Skipped files excluded, renames counted as indexed, new files downloaded."""
+    page_files = [
+        _folder_dict("SubFolder"),
+        _make_file_dict("skip1", "unchanged.txt"),
+        _make_file_dict("rename1", "renamed.txt"),
+        _make_file_dict("new1", "new1.txt"),
+        _make_file_dict("new2", "new2.txt"),
+    ]
+
+    full_scan_mocks["skip_results"]["skip1"] = (True, "unchanged")
+    full_scan_mocks["skip_results"]["rename1"] = (
+        True,
+        "File renamed: 'old' -> 'renamed.txt'",
+    )
+
+    full_scan_mocks["download_and_index_mock"].return_value = (2, 0)
+
+    indexed, skipped = await _run_full_scan(
+        full_scan_mocks, monkeypatch, page_files
+    )
+
+    assert indexed == 3  # 1 renamed + 2 from batch
+    assert skipped == 2  # 1 folder + 1 unchanged
+
+    call_args = full_scan_mocks["download_and_index_mock"].call_args
+    call_files = call_args[0][2]
+    assert len(call_files) == 2
+    assert {f["id"] for f in call_files} == {"new1", "new2"}
+
+
+async def test_full_scan_respects_max_files(full_scan_mocks, monkeypatch):
+    """Only max_files non-folder items are considered."""
+    page_files = [_make_file_dict(f"f{i}", f"file{i}.txt") for i in range(10)]
+
+    full_scan_mocks["download_and_index_mock"].return_value = (3, 0)
+
+    await _run_full_scan(full_scan_mocks, monkeypatch, page_files, max_files=3)
+
+    call_files = full_scan_mocks["download_and_index_mock"].call_args[0][2]
+    assert len(call_files) == 3
+
+
+# ---------------------------------------------------------------------------
+# D3-D5: _index_selected_files tests
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def selected_files_mocks(mock_dropbox_client, monkeypatch):
+    """Wire up mocks for _index_selected_files tests."""
+    import app.tasks.connector_indexers.dropbox_indexer as _mod
+
+    mock_session = AsyncMock()
+
+    get_file_results: dict[str, tuple[dict | None, str | None]] = {}
+
+    async def _fake_get_file(client, path):
+        return get_file_results.get(path, (None, f"Not configured: {path}"))
+
+    monkeypatch.setattr(_mod, "get_file_by_path", _fake_get_file)
+
+    skip_results: dict[str, tuple[bool, str | None]] = {}
+
+    async def _fake_skip(session, file, search_space_id):
+        return skip_results.get(file["id"], (False, None))
+
+    monkeypatch.setattr(_mod, "_should_skip_file", _fake_skip)
+
+    download_and_index_mock = AsyncMock(return_value=(0, 0))
+    monkeypatch.setattr(_mod, "_download_and_index", download_and_index_mock)
+
+    return {
+        "dropbox_client": mock_dropbox_client,
+        "session": mock_session,
+        "get_file_results": get_file_results,
+        "skip_results": skip_results,
+        "download_and_index_mock": download_and_index_mock,
+    }
+
+
+async def _run_selected(mocks, file_tuples):
+    return await _index_selected_files(
+        mocks["dropbox_client"],
+        mocks["session"],
+        file_tuples,
+        connector_id=_CONNECTOR_ID,
+        search_space_id=_SEARCH_SPACE_ID,
+        user_id=_USER_ID,
+        enable_summary=True,
+    )
+
+
+async def test_selected_files_single_file_indexed(selected_files_mocks):
+    selected_files_mocks["get_file_results"]["/report.pdf"] = (
+        _make_file_dict("f1", "report.pdf"),
+        None,
+    )
+    selected_files_mocks["download_and_index_mock"].return_value = (1, 0)
+
+    indexed, skipped, errors = await _run_selected(
+        selected_files_mocks,
+        [("/report.pdf", "report.pdf")],
+    )
+
+    assert indexed == 1
+    assert skipped == 0
+    assert errors == []
+
+
+async def test_selected_files_fetch_failure_isolation(selected_files_mocks):
+    selected_files_mocks["get_file_results"]["/first.txt"] = (
+        _make_file_dict("f1", "first.txt"),
+        None,
+    )
+    selected_files_mocks["get_file_results"]["/mid.txt"] = (None, "HTTP 404")
+    selected_files_mocks["get_file_results"]["/third.txt"] = (
+        _make_file_dict("f3", "third.txt"),
+        None,
+    )
+    selected_files_mocks["download_and_index_mock"].return_value = (2, 0)
+
+    indexed, skipped, errors = await _run_selected(
+        selected_files_mocks,
+        [("/first.txt", "first.txt"), ("/mid.txt", "mid.txt"), ("/third.txt", "third.txt")],
+    )
+
+    assert indexed == 2
+    assert skipped == 0
+    assert len(errors) == 1
+    assert "mid.txt" in errors[0]
+
+
+async def test_selected_files_skip_rename_counting(selected_files_mocks):
+    for path, fid, fname in [
+        ("/unchanged.txt", "s1", "unchanged.txt"),
+        ("/renamed.txt", "r1", "renamed.txt"),
+        ("/new1.txt", "n1", "new1.txt"),
+        ("/new2.txt", "n2", "new2.txt"),
+    ]:
+        selected_files_mocks["get_file_results"][path] = (
+            _make_file_dict(fid, fname),
+            None,
+        )
+
+    selected_files_mocks["skip_results"]["s1"] = (True, "unchanged")
+    selected_files_mocks["skip_results"]["r1"] = (
+        True,
+        "File renamed: 'old' -> 'renamed.txt'",
+    )
+    selected_files_mocks["download_and_index_mock"].return_value = (2, 0)
+
+    indexed, skipped, errors = await _run_selected(
+        selected_files_mocks,
+        [
+            ("/unchanged.txt", "unchanged.txt"),
+            ("/renamed.txt", "renamed.txt"),
+            ("/new1.txt", "new1.txt"),
+            ("/new2.txt", "new2.txt"),
+        ],
+    )
+
+    assert indexed == 3  # 1 renamed + 2 batch
+    assert skipped == 1
+    assert errors == []
+
+    mock = selected_files_mocks["download_and_index_mock"]
+    call_files = mock.call_args[0][2]
+    assert len(call_files) == 2
+    assert {f["id"] for f in call_files} == {"n1", "n2"}
+
+
+# ---------------------------------------------------------------------------
+# E1-E4: _index_with_delta_sync tests
+# ---------------------------------------------------------------------------
+
+
+async def test_delta_sync_deletions_call_remove_document(monkeypatch):
+    """E1: deleted entries are processed via _remove_document."""
+    import app.tasks.connector_indexers.dropbox_indexer as _mod
+
+    entries = [
+        {".tag": "deleted", "name": "gone.txt", "path_lower": "/gone.txt", "id": "id:del1"},
+        {".tag": "deleted", "name": "also_gone.pdf", "path_lower": "/also_gone.pdf", "id": "id:del2"},
+    ]
+
+    mock_client = MagicMock()
+    mock_client.get_changes = AsyncMock(return_value=(entries, "new-cursor", None))
+
+    remove_calls: list[str] = []
+
+    async def _fake_remove(session, file_id, search_space_id):
+        remove_calls.append(file_id)
+
+    monkeypatch.setattr(_mod, "_remove_document", _fake_remove)
+    monkeypatch.setattr(_mod, "_download_and_index", AsyncMock(return_value=(0, 0)))
+
+    mock_task_logger = MagicMock()
+    mock_task_logger.log_task_progress = AsyncMock()
+
+    indexed, skipped, cursor = await _index_with_delta_sync(
+        mock_client,
+        AsyncMock(),
+        _CONNECTOR_ID,
+        _SEARCH_SPACE_ID,
+        _USER_ID,
+        "old-cursor",
+        mock_task_logger,
+        MagicMock(),
+        max_files=500,
+        enable_summary=True,
+    )
+
+    assert sorted(remove_calls) == ["id:del1", "id:del2"]
+    assert cursor == "new-cursor"
+
+
+async def test_delta_sync_upserts_filtered_and_downloaded(monkeypatch):
+    """E2: modified/new file entries go through skip filter then download+index."""
+    import app.tasks.connector_indexers.dropbox_indexer as _mod
+
+    entries = [
+        _make_file_dict("mod1", "modified1.txt"),
+        _make_file_dict("mod2", "modified2.txt"),
+    ]
+
+    mock_client = MagicMock()
+    mock_client.get_changes = AsyncMock(return_value=(entries, "cursor-v2", None))
+
+    monkeypatch.setattr(_mod, "_should_skip_file", AsyncMock(return_value=(False, None)))
+
+    download_mock = AsyncMock(return_value=(2, 0))
+    monkeypatch.setattr(_mod, "_download_and_index", download_mock)
+
+    mock_task_logger = MagicMock()
+    mock_task_logger.log_task_progress = AsyncMock()
+
+    indexed, skipped, cursor = await _index_with_delta_sync(
+        mock_client,
+        AsyncMock(),
+        _CONNECTOR_ID,
+        _SEARCH_SPACE_ID,
+        _USER_ID,
+        "cursor-v1",
+        mock_task_logger,
+        MagicMock(),
+        max_files=500,
+        enable_summary=True,
+    )
+
+    assert indexed == 2
+    assert skipped == 0
+    assert cursor == "cursor-v2"
+
+    downloaded_files = download_mock.call_args[0][2]
+    assert len(downloaded_files) == 2
+    assert {f["id"] for f in downloaded_files} == {"mod1", "mod2"}
+
+
+async def test_delta_sync_mix_deletions_and_upserts(monkeypatch):
+    """E3: deletions processed, then remaining upserts filtered and indexed."""
+    import app.tasks.connector_indexers.dropbox_indexer as _mod
+
+    entries = [
+        {".tag": "deleted", "name": "removed.txt", "path_lower": "/removed.txt", "id": "id:del1"},
+        {".tag": "deleted", "name": "trashed.pdf", "path_lower": "/trashed.pdf", "id": "id:del2"},
+        _make_file_dict("mod1", "updated.txt"),
+        _make_file_dict("new1", "brandnew.docx"),
+    ]
+
+    mock_client = MagicMock()
+    mock_client.get_changes = AsyncMock(return_value=(entries, "final-cursor", None))
+
+    remove_calls: list[str] = []
+
+    async def _fake_remove(session, file_id, search_space_id):
+        remove_calls.append(file_id)
+
+    monkeypatch.setattr(_mod, "_remove_document", _fake_remove)
+    monkeypatch.setattr(_mod, "_should_skip_file", AsyncMock(return_value=(False, None)))
+
+    download_mock = AsyncMock(return_value=(2, 0))
+    monkeypatch.setattr(_mod, "_download_and_index", download_mock)
+
+    mock_task_logger = MagicMock()
+    mock_task_logger.log_task_progress = AsyncMock()
+
+    indexed, skipped, cursor = await _index_with_delta_sync(
+        mock_client,
+        AsyncMock(),
+        _CONNECTOR_ID,
+        _SEARCH_SPACE_ID,
+        _USER_ID,
+        "old-cursor",
+        mock_task_logger,
+        MagicMock(),
+        max_files=500,
+        enable_summary=True,
+    )
+
+    assert sorted(remove_calls) == ["id:del1", "id:del2"]
+    assert indexed == 2
+    assert skipped == 0
+    assert cursor == "final-cursor"
+
+    downloaded_files = download_mock.call_args[0][2]
+    assert {f["id"] for f in downloaded_files} == {"mod1", "new1"}
+
+
+async def test_delta_sync_returns_new_cursor(monkeypatch):
+    """E4: the new cursor from the API response is returned."""
+    import app.tasks.connector_indexers.dropbox_indexer as _mod
+
+    mock_client = MagicMock()
+    mock_client.get_changes = AsyncMock(return_value=([], "brand-new-cursor-xyz", None))
+
+    monkeypatch.setattr(_mod, "_download_and_index", AsyncMock(return_value=(0, 0)))
+
+    mock_task_logger = MagicMock()
+    mock_task_logger.log_task_progress = AsyncMock()
+
+    indexed, skipped, cursor = await _index_with_delta_sync(
+        mock_client,
+        AsyncMock(),
+        _CONNECTOR_ID,
+        _SEARCH_SPACE_ID,
+        _USER_ID,
+        "old-cursor",
+        mock_task_logger,
+        MagicMock(),
+        max_files=500,
+        enable_summary=True,
+    )
+
+    assert cursor == "brand-new-cursor-xyz"
+    assert indexed == 0
+    assert skipped == 0
+
+
+# ---------------------------------------------------------------------------
+# F1-F3: index_dropbox_files orchestrator tests
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def orchestrator_mocks(monkeypatch):
+    """Wire up mocks for index_dropbox_files orchestrator tests."""
+    import app.tasks.connector_indexers.dropbox_indexer as _mod
+
+    mock_connector = MagicMock()
+    mock_connector.config = {"_token_encrypted": False}
+    mock_connector.last_indexed_at = None
+    mock_connector.enable_summary = True
+
+    monkeypatch.setattr(
+        _mod,
+        "get_connector_by_id",
+        AsyncMock(return_value=mock_connector),
+    )
+
+    mock_task_logger = MagicMock()
+    mock_task_logger.log_task_start = AsyncMock(return_value=MagicMock())
+    mock_task_logger.log_task_progress = AsyncMock()
+    mock_task_logger.log_task_success = AsyncMock()
+    mock_task_logger.log_task_failure = AsyncMock()
+    monkeypatch.setattr(
+        _mod, "TaskLoggingService", MagicMock(return_value=mock_task_logger)
+    )
+
+    monkeypatch.setattr(_mod, "update_connector_last_indexed", AsyncMock())
+
+    full_scan_mock = AsyncMock(return_value=(5, 2))
+    monkeypatch.setattr(_mod, "_index_full_scan", full_scan_mock)
+
+    delta_sync_mock = AsyncMock(return_value=(3, 1, "delta-cursor-new"))
+    monkeypatch.setattr(_mod, "_index_with_delta_sync", delta_sync_mock)
+
+    mock_client = MagicMock()
+    mock_client.get_latest_cursor = AsyncMock(return_value=("latest-cursor-abc", None))
+    monkeypatch.setattr(
+        _mod, "DropboxClient", MagicMock(return_value=mock_client)
+    )
+
+    return {
+        "connector": mock_connector,
+        "full_scan_mock": full_scan_mock,
+        "delta_sync_mock": delta_sync_mock,
+        "mock_client": mock_client,
+    }
+
+
+async def test_orchestrator_uses_delta_sync_when_cursor_and_last_indexed(
+    orchestrator_mocks,
+):
+    """F1: with cursor + last_indexed_at + use_delta_sync, calls delta sync."""
+    from datetime import UTC, datetime
+
+    connector = orchestrator_mocks["connector"]
+    connector.config = {
+        "_token_encrypted": False,
+        "folder_cursors": {"/docs": "saved-cursor-123"},
+    }
+    connector.last_indexed_at = datetime(2026, 1, 1, tzinfo=UTC)
+
+    mock_session = AsyncMock()
+    mock_session.commit = AsyncMock()
+
+    indexed, skipped, error = await index_dropbox_files(
+        mock_session,
+        _CONNECTOR_ID,
+        _SEARCH_SPACE_ID,
+        _USER_ID,
+        {
+            "folders": [{"path": "/docs", "name": "Docs"}],
+            "files": [],
+            "indexing_options": {"use_delta_sync": True},
+        },
+    )
+
+    assert error is None
+    orchestrator_mocks["delta_sync_mock"].assert_called_once()
+    orchestrator_mocks["full_scan_mock"].assert_not_called()
+
+
+async def test_orchestrator_falls_back_to_full_scan_without_cursor(
+    orchestrator_mocks,
+):
+    """F2: without cursor, falls back to full scan."""
+    connector = orchestrator_mocks["connector"]
+    connector.config = {"_token_encrypted": False}
+    connector.last_indexed_at = None
+
+    mock_session = AsyncMock()
+    mock_session.commit = AsyncMock()
+
+    indexed, skipped, error = await index_dropbox_files(
+        mock_session,
+        _CONNECTOR_ID,
+        _SEARCH_SPACE_ID,
+        _USER_ID,
+        {
+            "folders": [{"path": "/docs", "name": "Docs"}],
+            "files": [],
+            "indexing_options": {"use_delta_sync": True},
+        },
+    )
+
+    assert error is None
+    orchestrator_mocks["full_scan_mock"].assert_called_once()
+    orchestrator_mocks["delta_sync_mock"].assert_not_called()
+
+
+async def test_orchestrator_persists_cursor_after_sync(orchestrator_mocks):
+    """F3: after sync, persists new cursor to connector config."""
+    connector = orchestrator_mocks["connector"]
+    connector.config = {"_token_encrypted": False}
+    connector.last_indexed_at = None
+
+    mock_session = AsyncMock()
+    mock_session.commit = AsyncMock()
+
+    await index_dropbox_files(
+        mock_session,
+        _CONNECTOR_ID,
+        _SEARCH_SPACE_ID,
+        _USER_ID,
+        {
+            "folders": [{"path": "/docs", "name": "Docs"}],
+            "files": [],
+        },
+    )
+
+    assert "folder_cursors" in connector.config
+    assert connector.config["folder_cursors"]["/docs"] == "latest-cursor-abc"
diff --git a/surfsense_backend/tests/unit/connectors/__init__.py b/surfsense_backend/tests/unit/connectors/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/surfsense_backend/tests/unit/connectors/test_dropbox_client.py b/surfsense_backend/tests/unit/connectors/test_dropbox_client.py
new file mode 100644
index 000000000..efacbcf72
--- /dev/null
+++ b/surfsense_backend/tests/unit/connectors/test_dropbox_client.py
@@ -0,0 +1,115 @@
+"""Tests for DropboxClient delta-sync methods (get_latest_cursor, get_changes)."""
+
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+
+from app.connectors.dropbox.client import DropboxClient
+
+pytestmark = pytest.mark.unit
+
+
+def _make_client() -> DropboxClient:
+    """Create a DropboxClient with a mocked DB session so no real DB needed."""
+    client = DropboxClient.__new__(DropboxClient)
+    client._session = MagicMock()
+    client._connector_id = 1
+    return client
+
+
+# ---------- C1: get_latest_cursor ----------
+
+async def test_get_latest_cursor_returns_cursor_string(monkeypatch):
+    client = _make_client()
+
+    fake_resp = MagicMock()
+    fake_resp.status_code = 200
+    fake_resp.json.return_value = {"cursor": "AAHbKxRZ9enq…"}
+
+    monkeypatch.setattr(client, "_request", AsyncMock(return_value=fake_resp))
+
+    cursor, error = await client.get_latest_cursor("/my-folder")
+
+    assert cursor == "AAHbKxRZ9enq…"
+    assert error is None
+    client._request.assert_called_once_with(
+        "/2/files/list_folder/get_latest_cursor",
+        {"path": "/my-folder", "recursive": False, "include_non_downloadable_files": True},
+    )
+
+
+# ---------- C2: get_changes returns entries and new cursor ----------
+
+async def test_get_changes_returns_entries_and_cursor(monkeypatch):
+    client = _make_client()
+
+    fake_resp = MagicMock()
+    fake_resp.status_code = 200
+    fake_resp.json.return_value = {
+        "entries": [
+            {".tag": "file", "name": "new.txt", "id": "id:abc"},
+            {".tag": "deleted", "name": "old.txt"},
+        ],
+        "cursor": "cursor-v2",
+        "has_more": False,
+    }
+    monkeypatch.setattr(client, "_request", AsyncMock(return_value=fake_resp))
+
+    entries, new_cursor, error = await client.get_changes("cursor-v1")
+
+    assert error is None
+    assert new_cursor == "cursor-v2"
+    assert len(entries) == 2
+    assert entries[0]["name"] == "new.txt"
+    assert entries[1][".tag"] == "deleted"
+
+
+# ---------- C3: get_changes handles pagination ----------
+
+async def test_get_changes_handles_pagination(monkeypatch):
+    client = _make_client()
+
+    page1 = MagicMock()
+    page1.status_code = 200
+    page1.json.return_value = {
+        "entries": [{".tag": "file", "name": "a.txt", "id": "id:a"}],
+        "cursor": "cursor-page2",
+        "has_more": True,
+    }
+    page2 = MagicMock()
+    page2.status_code = 200
+    page2.json.return_value = {
+        "entries": [{".tag": "file", "name": "b.txt", "id": "id:b"}],
+        "cursor": "cursor-final",
+        "has_more": False,
+    }
+
+    request_mock = AsyncMock(side_effect=[page1, page2])
+    monkeypatch.setattr(client, "_request", request_mock)
+
+    entries, new_cursor, error = await client.get_changes("cursor-v1")
+
+    assert error is None
+    assert new_cursor == "cursor-final"
+    assert len(entries) == 2
+    assert {e["name"] for e in entries} == {"a.txt", "b.txt"}
+    assert request_mock.call_count == 2
+
+
+# ---------- C4: get_changes returns an error on 401 ----------
+
+async def test_get_changes_returns_error_on_401(monkeypatch):
+    client = _make_client()
+
+    fake_resp = MagicMock()
+    fake_resp.status_code = 401
+    fake_resp.text = "Unauthorized"
+
+    monkeypatch.setattr(client, "_request", AsyncMock(return_value=fake_resp))
+
+    entries, new_cursor, error = await client.get_changes("old-cursor")
+
+    assert error is not None
+    assert "401" in error
+    assert entries == []
+    assert new_cursor is None
diff --git a/surfsense_backend/tests/unit/connectors/test_dropbox_file_types.py b/surfsense_backend/tests/unit/connectors/test_dropbox_file_types.py
new file mode 100644
index 000000000..5480d8c8a
--- /dev/null
+++ b/surfsense_backend/tests/unit/connectors/test_dropbox_file_types.py
@@ -0,0 +1,73 @@
+"""Tests for Dropbox file type filtering (should_skip_file)."""
+
+import pytest
+
+from app.connectors.dropbox.file_types import should_skip_file
+
+pytestmark = pytest.mark.unit
+
+
+def test_folder_item_is_skipped():
+    item = {".tag": "folder", "name": "My Folder"}
+    assert should_skip_file(item) is True
+
+
+def test_paper_file_is_not_skipped():
+    item = {".tag": "file", "name": "notes.paper", "is_downloadable": False}
+    assert should_skip_file(item) is False
+
+
+def test_non_downloadable_item_is_skipped():
+    item = {".tag": "file", "name": "locked.gdoc", "is_downloadable": False}
+    assert should_skip_file(item) is True
+
+
+@pytest.mark.parametrize(
+    "filename",
+    [
+        "archive.zip", "backup.tar", "data.gz", "stuff.rar", "pack.7z",
+        "program.exe", "lib.dll", "module.so", "image.dmg", "disk.iso",
+        "movie.mov", "clip.avi", "video.mkv", "film.wmv", "stream.flv",
+        "icon.svg", "anim.gif", "photo.webp", "shot.heic", "favicon.ico",
+        "raw.cr2", "photo.nef", "image.arw", "pic.dng",
+        "design.psd", "vector.ai", "mockup.sketch", "proto.fig",
+        "font.ttf", "font.otf", "font.woff", "font.woff2",
+        "model.stl", "scene.fbx", "mesh.blend",
+        "local.db", "data.sqlite", "access.mdb",
+    ],
+)
+def test_non_parseable_extensions_are_skipped(filename):
+    item = {".tag": "file", "name": filename}
+    assert should_skip_file(item) is True, f"{filename} should be skipped"
+
+
+@pytest.mark.parametrize(
+    "filename",
+    [
+        "report.pdf", "document.docx", "sheet.xlsx", "slides.pptx",
+        "old.doc", "legacy.xls", "deck.ppt",
+        "readme.txt", "data.csv", "page.html", "notes.md",
+        "config.json", "feed.xml",
+    ],
+)
+def test_parseable_documents_are_not_skipped(filename):
+    item = {".tag": "file", "name": filename}
+    assert should_skip_file(item) is False, f"{filename} should NOT be skipped"
+
+
+@pytest.mark.parametrize(
+    "filename",
+    ["photo.jpg", "image.jpeg", "screenshot.png", "scan.bmp", "page.tiff", "doc.tif"],
+)
+def test_universal_images_are_not_skipped(filename):
+    item = {".tag": "file", "name": filename}
+    assert should_skip_file(item) is False, f"{filename} should NOT be skipped"
+
+
+@pytest.mark.parametrize(
+    "filename",
+    ["icon.svg", "anim.gif", "photo.webp", "live.heic"],
+)
+def test_non_universal_images_are_skipped(filename):
+    item = {".tag": "file", "name": filename}
+    assert should_skip_file(item) is True, f"{filename} should be skipped"
diff --git a/surfsense_backend/tests/unit/connectors/test_dropbox_reauth.py b/surfsense_backend/tests/unit/connectors/test_dropbox_reauth.py
new file mode 100644
index 000000000..85281354c
--- /dev/null
+++ b/surfsense_backend/tests/unit/connectors/test_dropbox_reauth.py
@@ -0,0 +1,43 @@
+"""Test that Dropbox re-auth preserves folder_cursors in connector config."""
+
+import pytest
+
+pytestmark = pytest.mark.unit
+
+
+def test_reauth_preserves_folder_cursors():
+    """G1: re-authentication preserves folder_cursors alongside cursor."""
+    old_config = {
+        "access_token": "old-token-enc",
+        "refresh_token": "old-refresh-enc",
+        "cursor": "old-cursor-abc",
+        "folder_cursors": {"/docs": "cursor-docs-123", "/photos": "cursor-photos-456"},
+        "_token_encrypted": True,
+        "auth_expired": True,
+    }
+
+    new_connector_config = {
+        "access_token": "new-token-enc",
+        "refresh_token": "new-refresh-enc",
+        "token_type": "bearer",
+        "expires_in": 14400,
+        "expires_at": "2026-04-06T16:00:00+00:00",
+        "_token_encrypted": True,
+    }
+
+    existing_cursor = old_config.get("cursor")
+    existing_folder_cursors = old_config.get("folder_cursors")
+    merged_config = {
+        **new_connector_config,
+        "cursor": existing_cursor,
+        "folder_cursors": existing_folder_cursors,
+        "auth_expired": False,
+    }
+
+    assert merged_config["access_token"] == "new-token-enc"
+    assert merged_config["cursor"] == "old-cursor-abc"
+    assert merged_config["folder_cursors"] == {
+        "/docs": "cursor-docs-123",
+        "/photos": "cursor-photos-456",
+    }
+    assert merged_config["auth_expired"] is False
diff --git a/surfsense_backend/tests/unit/services/__init__.py b/surfsense_backend/tests/unit/services/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/surfsense_backend/tests/unit/services/test_docling_image_support.py b/surfsense_backend/tests/unit/services/test_docling_image_support.py
new file mode 100644
index 000000000..071d061e2
--- /dev/null
+++ b/surfsense_backend/tests/unit/services/test_docling_image_support.py
@@ -0,0 +1,67 @@
+"""Test that DoclingService registers InputFormat.IMAGE for image processing."""
+
+from enum import Enum
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+pytestmark = pytest.mark.unit
+
+
+class _FakeInputFormat(Enum):
+    PDF = "pdf"
+    IMAGE = "image"
+
+
+def test_docling_service_registers_image_format():
+    """DoclingService should initialise DocumentConverter with InputFormat.IMAGE
+    in allowed_formats so that image files (jpg, png, bmp, tiff) are accepted."""
+
+    mock_converter_cls = MagicMock()
+    mock_backend = MagicMock()
+
+    fake_pipeline_options_cls = MagicMock()
+    fake_pipeline_options = MagicMock()
+    fake_pipeline_options_cls.return_value = fake_pipeline_options
+
+    fake_pdf_format_option_cls = MagicMock()
+
+    with patch.dict("sys.modules", {
+        "docling": MagicMock(),
+        "docling.backend": MagicMock(),
+        "docling.backend.pypdfium2_backend": MagicMock(
+            PyPdfiumDocumentBackend=mock_backend
+        ),
+        "docling.datamodel": MagicMock(),
+        "docling.datamodel.base_models": MagicMock(
+            InputFormat=_FakeInputFormat
+        ),
+        "docling.datamodel.pipeline_options": MagicMock(
+            PdfPipelineOptions=fake_pipeline_options_cls
+        ),
+        "docling.document_converter": MagicMock(
+            DocumentConverter=mock_converter_cls,
+            PdfFormatOption=fake_pdf_format_option_cls,
+        ),
+    }):
+        import app.services.docling_service as mod
+        from importlib import reload
+        reload(mod)
+
+        mod.DoclingService()
+
+    call_kwargs = mock_converter_cls.call_args
+    assert call_kwargs is not None, "DocumentConverter was never called"
+
+    _, kwargs = call_kwargs
+    allowed = kwargs.get("allowed_formats")
+    format_opts = kwargs.get("format_options", {})
+
+    image_registered = (
+        (allowed is not None and _FakeInputFormat.IMAGE in allowed)
+        or _FakeInputFormat.IMAGE in format_opts
+    )
+    assert image_registered, (
+        f"InputFormat.IMAGE not registered. "
+        f"allowed_formats={allowed}, format_options keys={list(format_opts.keys())}"
+    )

From 47f4be08d971fe96bce911a154d10568d724dde6 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Mon, 6 Apr 2026 19:31:42 +0530
Subject: [PATCH 23/37] refactor: remove allowed_formats from DocumentConverter
 initialization in DoclingService to allow acceptance of all supported formats

---
 .../app/services/docling_service.py           |  2 --
 .../services/test_docling_image_support.py    | 26 +++++++++----------
 2 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/surfsense_backend/app/services/docling_service.py b/surfsense_backend/app/services/docling_service.py
index 360c197ed..af9a7d2d5 100644
--- a/surfsense_backend/app/services/docling_service.py
+++ b/surfsense_backend/app/services/docling_service.py
@@ -111,9 +111,7 @@ class DoclingService:
                 pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
             )
 
-            # Initialize DocumentConverter with PDF and IMAGE support
             self.converter = DocumentConverter(
-                allowed_formats=[InputFormat.PDF, InputFormat.IMAGE],
                 format_options={InputFormat.PDF: pdf_format_option},
             )
 
diff --git a/surfsense_backend/tests/unit/services/test_docling_image_support.py b/surfsense_backend/tests/unit/services/test_docling_image_support.py
index 071d061e2..430adbaf2 100644
--- a/surfsense_backend/tests/unit/services/test_docling_image_support.py
+++ b/surfsense_backend/tests/unit/services/test_docling_image_support.py
@@ -1,4 +1,5 @@
-"""Test that DoclingService registers InputFormat.IMAGE for image processing."""
+"""Test that DoclingService does NOT restrict allowed_formats, letting Docling
+accept all its supported formats (PDF, DOCX, PPTX, XLSX, IMAGE, etc.)."""
 
 from enum import Enum
 from unittest.mock import MagicMock, patch
@@ -11,11 +12,14 @@ pytestmark = pytest.mark.unit
 class _FakeInputFormat(Enum):
     PDF = "pdf"
     IMAGE = "image"
+    DOCX = "docx"
+    PPTX = "pptx"
+    XLSX = "xlsx"
 
 
-def test_docling_service_registers_image_format():
-    """DoclingService should initialise DocumentConverter with InputFormat.IMAGE
-    in allowed_formats so that image files (jpg, png, bmp, tiff) are accepted."""
+def test_docling_service_does_not_restrict_allowed_formats():
+    """DoclingService should NOT pass allowed_formats to DocumentConverter,
+    so Docling defaults to accepting every InputFormat it supports."""
 
     mock_converter_cls = MagicMock()
     mock_backend = MagicMock()
@@ -54,14 +58,10 @@ def test_docling_service_registers_image_format():
     assert call_kwargs is not None, "DocumentConverter was never called"
 
     _, kwargs = call_kwargs
-    allowed = kwargs.get("allowed_formats")
-    format_opts = kwargs.get("format_options", {})
-
-    image_registered = (
-        (allowed is not None and _FakeInputFormat.IMAGE in allowed)
-        or _FakeInputFormat.IMAGE in format_opts
+    assert "allowed_formats" not in kwargs, (
+        f"allowed_formats should not be passed — let Docling accept all formats. "
+        f"Got: {kwargs.get('allowed_formats')}"
     )
-    assert image_registered, (
-        f"InputFormat.IMAGE not registered. "
-        f"allowed_formats={allowed}, format_options keys={list(format_opts.keys())}"
+    assert _FakeInputFormat.PDF in kwargs.get("format_options", {}), (
+        "format_options should still configure PDF pipeline options"
     )

From dc7047f64de916bd002fa4e0fc265b8532f92def Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Mon, 6 Apr 2026 22:03:47 +0530
Subject: [PATCH 24/37] refactor: implement file type classification for
 supported extensions across Dropbox, Google Drive, and OneDrive connectors,
 enhancing file handling and error management

---
 .../app/connectors/dropbox/file_types.py      | 24 +------
 .../google_drive/content_extractor.py         |  8 ++-
 .../app/connectors/google_drive/file_types.py |  7 ++
 .../app/connectors/onedrive/file_types.py     |  9 ++-
 .../app/etl_pipeline/etl_pipeline_service.py  |  7 +-
 .../app/etl_pipeline/exceptions.py            |  4 ++
 .../app/etl_pipeline/file_classifier.py       |  6 +-
 .../document_processors/file_processors.py    |  1 +
 .../app/utils/file_extensions.py              | 31 ++++++++
 .../test_google_drive_file_types.py           | 22 ++++++
 .../connectors/test_onedrive_file_types.py    | 44 ++++++++++++
 .../etl_pipeline/test_etl_pipeline_service.py | 72 ++++++++++++++++++-
 .../tests/unit/utils/__init__.py              |  0
 .../tests/unit/utils/test_file_extensions.py  | 42 +++++++++++
 14 files changed, 250 insertions(+), 27 deletions(-)
 create mode 100644 surfsense_backend/app/utils/file_extensions.py
 create mode 100644 surfsense_backend/tests/unit/connectors/test_google_drive_file_types.py
 create mode 100644 surfsense_backend/tests/unit/connectors/test_onedrive_file_types.py
 create mode 100644 surfsense_backend/tests/unit/utils/__init__.py
 create mode 100644 surfsense_backend/tests/unit/utils/test_file_extensions.py

diff --git a/surfsense_backend/app/connectors/dropbox/file_types.py b/surfsense_backend/app/connectors/dropbox/file_types.py
index c245e039e..7b72c1857 100644
--- a/surfsense_backend/app/connectors/dropbox/file_types.py
+++ b/surfsense_backend/app/connectors/dropbox/file_types.py
@@ -1,25 +1,8 @@
 """File type handlers for Dropbox."""
 
-PAPER_EXTENSION = ".paper"
+from app.etl_pipeline.file_classifier import FileCategory, classify_file
 
-SKIP_EXTENSIONS: frozenset[str] = frozenset({
-    # Non-universal images (not supported by all 3 ETL pipelines)
-    ".svg", ".gif", ".webp", ".heic", ".ico",
-    ".raw", ".cr2", ".nef", ".arw", ".dng",
-    ".psd", ".ai", ".sketch", ".fig",
-    # Video
-    ".mov", ".avi", ".mkv", ".wmv", ".flv",
-    # Binaries / executables
-    ".exe", ".dll", ".so", ".dylib", ".bin", ".app", ".dmg", ".iso",
-    # Archives
-    ".zip", ".tar", ".gz", ".rar", ".7z", ".bz2",
-    # Fonts
-    ".ttf", ".otf", ".woff", ".woff2",
-    # 3D / CAD
-    ".stl", ".obj", ".fbx", ".blend",
-    # Database
-    ".db", ".sqlite", ".mdb",
-})
+PAPER_EXTENSION = ".paper"
 
 MIME_TO_EXTENSION: dict[str, str] = {
     "application/pdf": ".pdf",
@@ -71,5 +54,4 @@ def should_skip_file(item: dict) -> bool:
     if not item.get("is_downloadable", True):
         return True
     name = item.get("name", "")
-    ext = get_extension_from_name(name).lower()
-    return ext in SKIP_EXTENSIONS
+    return classify_file(name) == FileCategory.UNSUPPORTED
diff --git a/surfsense_backend/app/connectors/google_drive/content_extractor.py b/surfsense_backend/app/connectors/google_drive/content_extractor.py
index 0c559fee9..10f008594 100644
--- a/surfsense_backend/app/connectors/google_drive/content_extractor.py
+++ b/surfsense_backend/app/connectors/google_drive/content_extractor.py
@@ -17,6 +17,7 @@ from .file_types import (
     get_export_mime_type,
     get_extension_from_mime,
     is_google_workspace_file,
+    should_skip_by_extension,
     should_skip_file,
 )
 
@@ -42,6 +43,9 @@ async def download_and_extract_content(
     if should_skip_file(mime_type):
         return None, {}, f"Skipping {mime_type}"
 
+    if not is_google_workspace_file(mime_type) and should_skip_by_extension(file_name):
+        return None, {}, f"Skipping unsupported extension: {file_name}"
+
     logger.info(f"Downloading file for content extraction: {file_name} ({mime_type})")
 
     drive_metadata: dict[str, Any] = {
@@ -148,10 +152,12 @@ async def download_and_process_file(
     file_name = file.get("name", "Unknown")
     mime_type = file.get("mimeType", "")
 
-    # Skip folders and shortcuts
     if should_skip_file(mime_type):
         return None, f"Skipping {mime_type}", None
 
+    if not is_google_workspace_file(mime_type) and should_skip_by_extension(file_name):
+        return None, f"Skipping unsupported extension: {file_name}", None
+
     logger.info(f"Downloading file: {file_name} ({mime_type})")
 
     temp_file_path = None
diff --git a/surfsense_backend/app/connectors/google_drive/file_types.py b/surfsense_backend/app/connectors/google_drive/file_types.py
index dd6aff4d7..e0b8f001e 100644
--- a/surfsense_backend/app/connectors/google_drive/file_types.py
+++ b/surfsense_backend/app/connectors/google_drive/file_types.py
@@ -1,5 +1,7 @@
 """File type handlers for Google Drive."""
 
+from app.etl_pipeline.file_classifier import FileCategory, classify_file
+
 GOOGLE_DOC = "application/vnd.google-apps.document"
 GOOGLE_SHEET = "application/vnd.google-apps.spreadsheet"
 GOOGLE_SLIDE = "application/vnd.google-apps.presentation"
@@ -46,6 +48,11 @@ def should_skip_file(mime_type: str) -> bool:
     return mime_type in [GOOGLE_FOLDER, GOOGLE_SHORTCUT]
 
 
+def should_skip_by_extension(filename: str) -> bool:
+    """Return True if the file extension is not parseable by any ETL pipeline."""
+    return classify_file(filename) == FileCategory.UNSUPPORTED
+
+
 def get_export_mime_type(mime_type: str) -> str | None:
     """Get export MIME type for Google Workspace files."""
     return EXPORT_FORMATS.get(mime_type)
diff --git a/surfsense_backend/app/connectors/onedrive/file_types.py b/surfsense_backend/app/connectors/onedrive/file_types.py
index 403fdc337..bcd78b711 100644
--- a/surfsense_backend/app/connectors/onedrive/file_types.py
+++ b/surfsense_backend/app/connectors/onedrive/file_types.py
@@ -1,5 +1,7 @@
 """File type handlers for Microsoft OneDrive."""
 
+from app.etl_pipeline.file_classifier import FileCategory, classify_file
+
 ONEDRIVE_FOLDER_FACET = "folder"
 ONENOTE_MIME = "application/msonenote"
 
@@ -39,7 +41,7 @@ def is_folder(item: dict) -> bool:
 
 
 def should_skip_file(item: dict) -> bool:
-    """Skip folders, OneNote files, remote items (shared links), and packages."""
+    """Skip folders, OneNote files, remote items (shared links), packages, and unsupported extensions."""
     if is_folder(item):
         return True
     if "remoteItem" in item:
@@ -47,4 +49,7 @@ def should_skip_file(item: dict) -> bool:
     if "package" in item:
         return True
     mime = item.get("file", {}).get("mimeType", "")
-    return mime in SKIP_MIME_TYPES
+    if mime in SKIP_MIME_TYPES:
+        return True
+    name = item.get("name", "")
+    return classify_file(name) == FileCategory.UNSUPPORTED
diff --git a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py
index f382451df..7c67d2345 100644
--- a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py
+++ b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py
@@ -1,6 +1,6 @@
 from app.config import config as app_config
 from app.etl_pipeline.etl_document import EtlRequest, EtlResult
-from app.etl_pipeline.exceptions import EtlServiceUnavailableError
+from app.etl_pipeline.exceptions import EtlServiceUnavailableError, EtlUnsupportedFileError
 from app.etl_pipeline.file_classifier import FileCategory, classify_file
 from app.etl_pipeline.parsers.audio import transcribe_audio
 from app.etl_pipeline.parsers.direct_convert import convert_file_directly
@@ -13,6 +13,11 @@ class EtlPipelineService:
     async def extract(self, request: EtlRequest) -> EtlResult:
         category = classify_file(request.filename)
 
+        if category == FileCategory.UNSUPPORTED:
+            raise EtlUnsupportedFileError(
+                f"File type not supported for parsing: {request.filename}"
+            )
+
         if category == FileCategory.PLAINTEXT:
             content = read_plaintext(request.file_path)
             return EtlResult(
diff --git a/surfsense_backend/app/etl_pipeline/exceptions.py b/surfsense_backend/app/etl_pipeline/exceptions.py
index ac8fc0172..26eecbef4 100644
--- a/surfsense_backend/app/etl_pipeline/exceptions.py
+++ b/surfsense_backend/app/etl_pipeline/exceptions.py
@@ -4,3 +4,7 @@ class EtlParseError(Exception):
 
 class EtlServiceUnavailableError(Exception):
     """Raised when the configured ETL_SERVICE is not recognised."""
+
+
+class EtlUnsupportedFileError(Exception):
+    """Raised when a file type cannot be parsed by any ETL pipeline."""
diff --git a/surfsense_backend/app/etl_pipeline/file_classifier.py b/surfsense_backend/app/etl_pipeline/file_classifier.py
index 40c2d5aff..eea9cce22 100644
--- a/surfsense_backend/app/etl_pipeline/file_classifier.py
+++ b/surfsense_backend/app/etl_pipeline/file_classifier.py
@@ -1,6 +1,7 @@
 from enum import Enum
 from pathlib import PurePosixPath
 
+from app.utils.file_extensions import DOCUMENT_EXTENSIONS
 
 PLAINTEXT_EXTENSIONS = frozenset(
     {
@@ -35,6 +36,7 @@ class FileCategory(Enum):
     PLAINTEXT = "plaintext"
     AUDIO = "audio"
     DIRECT_CONVERT = "direct_convert"
+    UNSUPPORTED = "unsupported"
     DOCUMENT = "document"
 
 
@@ -46,4 +48,6 @@ def classify_file(filename: str) -> FileCategory:
         return FileCategory.AUDIO
     if suffix in DIRECT_CONVERT_EXTENSIONS:
         return FileCategory.DIRECT_CONVERT
-    return FileCategory.DOCUMENT
+    if suffix in DOCUMENT_EXTENSIONS:
+        return FileCategory.DOCUMENT
+    return FileCategory.UNSUPPORTED
diff --git a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py
index f54a963ad..a9a6b62be 100644
--- a/surfsense_backend/app/tasks/document_processors/file_processors.py
+++ b/surfsense_backend/app/tasks/document_processors/file_processors.py
@@ -356,6 +356,7 @@ async def _extract_file_content(
             FileCategory.PLAINTEXT: "Reading file",
             FileCategory.DIRECT_CONVERT: "Converting file",
             FileCategory.AUDIO: "Transcribing audio",
+            FileCategory.UNSUPPORTED: "Unsupported file type",
             FileCategory.DOCUMENT: "Extracting content",
         }
         await NotificationService.document_processing.notify_processing_progress(
diff --git a/surfsense_backend/app/utils/file_extensions.py b/surfsense_backend/app/utils/file_extensions.py
new file mode 100644
index 000000000..5dac10842
--- /dev/null
+++ b/surfsense_backend/app/utils/file_extensions.py
@@ -0,0 +1,31 @@
+"""Allowlist of document extensions the ETL parsers can handle.
+
+file_classifier imports this set and connector-level skip checks go through
+classify_file, so there is a single source of truth.  Extensions covered by
+PLAINTEXT_EXTENSIONS, AUDIO_EXTENSIONS, or DIRECT_CONVERT_EXTENSIONS in
+file_classifier are NOT repeated here -- this set is exclusively for the
+"document" ETL path (Docling / LlamaParse / Unstructured).
+"""
+
+from pathlib import PurePosixPath
+
+DOCUMENT_EXTENSIONS: frozenset[str] = frozenset({
+    # PDF
+    ".pdf",
+    # Microsoft Office
+    ".docx", ".doc", ".xlsx", ".xls", ".pptx", ".ppt",
+    # Images (raster -- OCR / vision parsing)
+    ".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".tif",
+    # Rich text / e-book
+    ".rtf", ".epub",
+    # OpenDocument
+    ".odt", ".ods", ".odp",
+    # Other (LlamaParse / Unstructured specific)
+    ".hwpx",
+})
+
+
+def is_supported_document_extension(filename: str) -> bool:
+    """Return True if the file's extension is in the supported document set."""
+    suffix = PurePosixPath(filename).suffix.lower()
+    return suffix in DOCUMENT_EXTENSIONS
diff --git a/surfsense_backend/tests/unit/connectors/test_google_drive_file_types.py b/surfsense_backend/tests/unit/connectors/test_google_drive_file_types.py
new file mode 100644
index 000000000..adbad74c2
--- /dev/null
+++ b/surfsense_backend/tests/unit/connectors/test_google_drive_file_types.py
@@ -0,0 +1,22 @@
+"""Tests for Google Drive file type filtering."""
+
+import pytest
+
+from app.connectors.google_drive.file_types import should_skip_by_extension
+
+pytestmark = pytest.mark.unit
+
+
+@pytest.mark.parametrize("filename", [
+    "malware.exe", "archive.zip", "video.mov", "font.woff2", "model.blend",
+])
+def test_unsupported_extensions_are_skipped(filename):
+    assert should_skip_by_extension(filename) is True
+
+
+@pytest.mark.parametrize("filename", [
+    "report.pdf", "doc.docx", "sheet.xlsx", "slides.pptx",
+    "readme.txt", "data.csv", "photo.png", "notes.md",
+])
+def test_parseable_extensions_are_not_skipped(filename):
+    assert should_skip_by_extension(filename) is False
diff --git a/surfsense_backend/tests/unit/connectors/test_onedrive_file_types.py b/surfsense_backend/tests/unit/connectors/test_onedrive_file_types.py
new file mode 100644
index 000000000..a2491257d
--- /dev/null
+++ b/surfsense_backend/tests/unit/connectors/test_onedrive_file_types.py
@@ -0,0 +1,44 @@
+"""Tests for OneDrive file type filtering."""
+
+import pytest
+
+from app.connectors.onedrive.file_types import should_skip_file
+
+pytestmark = pytest.mark.unit
+
+
+def test_folder_is_skipped():
+    item = {"folder": {}, "name": "My Folder"}
+    assert should_skip_file(item) is True
+
+
+def test_remote_item_is_skipped():
+    item = {"remoteItem": {}, "name": "shared.docx"}
+    assert should_skip_file(item) is True
+
+
+def test_package_is_skipped():
+    item = {"package": {}, "name": "notebook"}
+    assert should_skip_file(item) is True
+
+
+def test_onenote_is_skipped():
+    item = {"name": "notes", "file": {"mimeType": "application/msonenote"}}
+    assert should_skip_file(item) is True
+
+
+@pytest.mark.parametrize("filename", [
+    "malware.exe", "archive.zip", "video.mov", "font.woff2", "model.blend",
+])
+def test_unsupported_extensions_are_skipped(filename):
+    item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
+    assert should_skip_file(item) is True, f"{filename} should be skipped"
+
+
+@pytest.mark.parametrize("filename", [
+    "report.pdf", "doc.docx", "sheet.xlsx", "slides.pptx",
+    "readme.txt", "data.csv", "photo.png", "notes.md",
+])
+def test_parseable_files_are_not_skipped(filename):
+    item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
+    assert should_skip_file(item) is False, f"{filename} should NOT be skipped"
diff --git a/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py b/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py
index 0d31507ca..facf15eab 100644
--- a/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py
+++ b/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py
@@ -257,7 +257,7 @@ async def test_extract_pdf_with_llamacloud(tmp_path, mocker):
 
 
 async def test_unknown_extension_uses_document_etl(tmp_path, mocker):
-    """An unknown extension (e.g. .docx) falls through to the document ETL path."""
+    """An allowlisted document extension (.docx) routes to the document ETL path."""
     docx_file = tmp_path / "doc.docx"
     docx_file.write_bytes(b"PK fake docx")
 
@@ -307,3 +307,73 @@ async def test_unknown_etl_service_raises(tmp_path, mocker):
         await EtlPipelineService().extract(
             EtlRequest(file_path=str(pdf_file), filename="report.pdf")
         )
+
+
+# ---------------------------------------------------------------------------
+# Slice 13 – unsupported file types are rejected before reaching any parser
+# ---------------------------------------------------------------------------
+
+
+def test_unknown_extension_classified_as_unsupported():
+    """An unknown extension defaults to UNSUPPORTED (allowlist behaviour)."""
+    from app.etl_pipeline.file_classifier import FileCategory, classify_file
+
+    assert classify_file("random.xyz") == FileCategory.UNSUPPORTED
+
+
+@pytest.mark.parametrize("filename", [
+    "malware.exe", "archive.zip", "video.mov", "font.woff2",
+    "model.blend", "data.parquet", "package.deb", "firmware.bin",
+])
+def test_unsupported_extensions_classified_correctly(filename):
+    """Extensions not in any allowlist are classified as UNSUPPORTED."""
+    from app.etl_pipeline.file_classifier import FileCategory, classify_file
+
+    assert classify_file(filename) == FileCategory.UNSUPPORTED
+
+
+@pytest.mark.parametrize("filename,expected", [
+    ("report.pdf", "document"),
+    ("doc.docx", "document"),
+    ("slides.pptx", "document"),
+    ("sheet.xlsx", "document"),
+    ("photo.png", "document"),
+    ("photo.jpg", "document"),
+    ("book.epub", "document"),
+    ("letter.odt", "document"),
+    ("readme.md", "plaintext"),
+    ("data.csv", "direct_convert"),
+])
+def test_parseable_extensions_classified_correctly(filename, expected):
+    """Parseable files are classified into their correct category."""
+    from app.etl_pipeline.file_classifier import FileCategory, classify_file
+
+    result = classify_file(filename)
+    assert result != FileCategory.UNSUPPORTED
+    assert result.value == expected
+
+
+async def test_extract_unsupported_file_raises_error(tmp_path):
+    """EtlPipelineService.extract() raises EtlUnsupportedFileError for .exe files."""
+    from app.etl_pipeline.exceptions import EtlUnsupportedFileError
+
+    exe_file = tmp_path / "program.exe"
+    exe_file.write_bytes(b"\x00" * 10)
+
+    with pytest.raises(EtlUnsupportedFileError, match="not supported"):
+        await EtlPipelineService().extract(
+            EtlRequest(file_path=str(exe_file), filename="program.exe")
+        )
+
+
+async def test_extract_zip_raises_unsupported_error(tmp_path):
+    """EtlPipelineService.extract() raises EtlUnsupportedFileError for .zip archives."""
+    from app.etl_pipeline.exceptions import EtlUnsupportedFileError
+
+    zip_file = tmp_path / "archive.zip"
+    zip_file.write_bytes(b"PK\x03\x04")
+
+    with pytest.raises(EtlUnsupportedFileError, match="not supported"):
+        await EtlPipelineService().extract(
+            EtlRequest(file_path=str(zip_file), filename="archive.zip")
+        )
diff --git a/surfsense_backend/tests/unit/utils/__init__.py b/surfsense_backend/tests/unit/utils/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/surfsense_backend/tests/unit/utils/test_file_extensions.py b/surfsense_backend/tests/unit/utils/test_file_extensions.py
new file mode 100644
index 000000000..a376f44bd
--- /dev/null
+++ b/surfsense_backend/tests/unit/utils/test_file_extensions.py
@@ -0,0 +1,42 @@
+"""Tests for the DOCUMENT_EXTENSIONS allowlist module."""
+
+import pytest
+
+pytestmark = pytest.mark.unit
+
+
+def test_pdf_is_supported_document():
+    from app.utils.file_extensions import is_supported_document_extension
+
+    assert is_supported_document_extension("report.pdf") is True
+
+
+def test_exe_is_not_supported_document():
+    from app.utils.file_extensions import is_supported_document_extension
+
+    assert is_supported_document_extension("malware.exe") is False
+
+
+@pytest.mark.parametrize("filename", [
+    "report.pdf", "doc.docx", "old.doc",
+    "sheet.xlsx", "legacy.xls",
+    "slides.pptx", "deck.ppt",
+    "photo.png", "photo.jpg", "photo.jpeg", "scan.bmp", "scan.tiff", "scan.tif",
+    "manual.rtf", "book.epub",
+    "letter.odt", "data.ods", "presentation.odp",
+    "korean.hwpx",
+])
+def test_document_extensions_are_supported(filename):
+    from app.utils.file_extensions import is_supported_document_extension
+
+    assert is_supported_document_extension(filename) is True, f"{filename} should be supported"
+
+
+@pytest.mark.parametrize("filename", [
+    "malware.exe", "archive.zip", "video.mov", "font.woff2",
+    "model.blend", "random.xyz", "data.parquet", "package.deb",
+])
+def test_non_document_extensions_are_not_supported(filename):
+    from app.utils.file_extensions import is_supported_document_extension
+
+    assert is_supported_document_extension(filename) is False, f"{filename} should NOT be supported"

From 0fb92b7c566cb48f95501dcf8f9a1ec6ea31b3d0 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Mon, 6 Apr 2026 22:17:50 +0530
Subject: [PATCH 25/37] refactor: streamline file skipping logic in Dropbox
 indexer by removing redundant checks, improving code clarity

---
 .../app/tasks/connector_indexers/dropbox_indexer.py           | 4 ----
 surfsense_backend/app/utils/file_extensions.py                | 2 +-
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py b/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py
index ae46485cb..d116cc264 100644
--- a/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py
@@ -337,10 +337,6 @@ async def _index_with_delta_sync(
         if tag != "file":
             continue
 
-        if skip_item(entry):
-            skipped += 1
-            continue
-
         skip, msg = await _should_skip_file(session, entry, search_space_id)
         if skip:
             if msg and "renamed" in msg.lower():
diff --git a/surfsense_backend/app/utils/file_extensions.py b/surfsense_backend/app/utils/file_extensions.py
index 5dac10842..b0a4c808c 100644
--- a/surfsense_backend/app/utils/file_extensions.py
+++ b/surfsense_backend/app/utils/file_extensions.py
@@ -14,7 +14,7 @@ DOCUMENT_EXTENSIONS: frozenset[str] = frozenset({
     ".pdf",
     # Microsoft Office
     ".docx", ".doc", ".xlsx", ".xls", ".pptx", ".ppt",
-    # Images (raster -- OCR / vision parsing)
+    # Images (raster: OCR / vision parsing)
     ".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".tif",
     # Rich text / e-book
     ".rtf", ".epub",

From f03bf05aaa6269cbcf937f55d58c779c249b5614 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Mon, 6 Apr 2026 22:34:49 +0530
Subject: [PATCH 26/37] refactor: enhance Google Drive indexer to support file
 extension filtering, improving file handling and error reporting

---
 .../google_drive_indexer.py                   |  7 +++-
 .../integration/document_upload/conftest.py   | 20 ++++-------
 .../test_dropbox_parallel.py                  | 34 +++++++++++++++++++
 3 files changed, 46 insertions(+), 15 deletions(-)

diff --git a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
index 5e9e0f62f..9c53092f5 100644
--- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
@@ -25,7 +25,10 @@ from app.connectors.google_drive import (
     get_files_in_folder,
     get_start_page_token,
 )
-from app.connectors.google_drive.file_types import should_skip_file as skip_mime
+from app.connectors.google_drive.file_types import (
+    should_skip_by_extension,
+    should_skip_file as skip_mime,
+)
 from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
 from app.indexing_pipeline.connector_document import ConnectorDocument
 from app.indexing_pipeline.document_hashing import compute_identifier_hash
@@ -78,6 +81,8 @@ async def _should_skip_file(
 
     if skip_mime(mime_type):
         return True, "folder/shortcut"
+    if "google-apps" not in (mime_type or "") and should_skip_by_extension(file_name):
+        return True, "unsupported extension"
     if not file_id:
         return True, "missing file_id"
 
diff --git a/surfsense_backend/tests/integration/document_upload/conftest.py b/surfsense_backend/tests/integration/document_upload/conftest.py
index 41c379e58..62f4f6b47 100644
--- a/surfsense_backend/tests/integration/document_upload/conftest.py
+++ b/surfsense_backend/tests/integration/document_upload/conftest.py
@@ -319,31 +319,23 @@ def _mock_etl_parsing(monkeypatch):
 
     # -- LlamaParse mock (external API) --------------------------------
 
-    class _FakeMarkdownDoc:
-        def __init__(self, text: str):
-            self.text = text
-
-    class _FakeLlamaParseResult:
-        async def aget_markdown_documents(self, *, split_by_page=False):
-            return [_FakeMarkdownDoc(_MOCK_ETL_MARKDOWN)]
-
-    async def _fake_llamacloud_parse(**kwargs):
-        _reject_empty(kwargs["file_path"])
-        return _FakeLlamaParseResult()
+    async def _fake_llamacloud_parse(file_path: str, estimated_pages: int) -> str:
+        _reject_empty(file_path)
+        return _MOCK_ETL_MARKDOWN
 
     monkeypatch.setattr(
-        "app.tasks.document_processors.file_processors.parse_with_llamacloud_retry",
+        "app.etl_pipeline.parsers.llamacloud.parse_with_llamacloud",
         _fake_llamacloud_parse,
     )
 
     # -- Docling mock (heavy library boundary) -------------------------
 
-    async def _fake_docling_parse(file_path: str, filename: str):
+    async def _fake_docling_parse(file_path: str, filename: str) -> str:
         _reject_empty(file_path)
         return _MOCK_ETL_MARKDOWN
 
     monkeypatch.setattr(
-        "app.tasks.document_processors.file_processors.parse_with_docling",
+        "app.etl_pipeline.parsers.docling.parse_with_docling",
         _fake_docling_parse,
     )
 
diff --git a/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py b/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py
index 737e2c850..7a828b9c4 100644
--- a/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py
+++ b/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py
@@ -272,6 +272,23 @@ def full_scan_mocks(mock_dropbox_client, monkeypatch):
     download_and_index_mock = AsyncMock(return_value=(0, 0))
     monkeypatch.setattr(_mod, "_download_and_index", download_and_index_mock)
 
+    from app.services.page_limit_service import PageLimitService as _RealPLS
+
+    mock_page_limit_instance = MagicMock()
+    mock_page_limit_instance.get_page_usage = AsyncMock(return_value=(0, 999_999))
+    mock_page_limit_instance.update_page_usage = AsyncMock()
+
+    class _MockPageLimitService:
+        estimate_pages_from_metadata = staticmethod(
+            _RealPLS.estimate_pages_from_metadata
+        )
+
+        def __init__(self, session):
+            self.get_page_usage = mock_page_limit_instance.get_page_usage
+            self.update_page_usage = mock_page_limit_instance.update_page_usage
+
+    monkeypatch.setattr(_mod, "PageLimitService", _MockPageLimitService)
+
     return {
         "dropbox_client": mock_dropbox_client,
         "session": mock_session,
@@ -377,6 +394,23 @@ def selected_files_mocks(mock_dropbox_client, monkeypatch):
     download_and_index_mock = AsyncMock(return_value=(0, 0))
     monkeypatch.setattr(_mod, "_download_and_index", download_and_index_mock)
 
+    from app.services.page_limit_service import PageLimitService as _RealPLS
+
+    mock_page_limit_instance = MagicMock()
+    mock_page_limit_instance.get_page_usage = AsyncMock(return_value=(0, 999_999))
+    mock_page_limit_instance.update_page_usage = AsyncMock()
+
+    class _MockPageLimitService:
+        estimate_pages_from_metadata = staticmethod(
+            _RealPLS.estimate_pages_from_metadata
+        )
+
+        def __init__(self, session):
+            self.get_page_usage = mock_page_limit_instance.get_page_usage
+            self.update_page_usage = mock_page_limit_instance.update_page_usage
+
+    monkeypatch.setattr(_mod, "PageLimitService", _MockPageLimitService)
+
     return {
         "dropbox_client": mock_dropbox_client,
         "session": mock_session,

From e7beeb2a3600b657d9d585d6ce3be0d7bc53b224 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Tue, 7 Apr 2026 02:19:31 +0530
Subject: [PATCH 27/37] refactor: unify file skipping logic across Dropbox,
 Google Drive, and OneDrive connectors by replacing classification checks with
 a centralized service-based approach, enhancing maintainability and
 consistency in file handling

---
 .../app/connectors/dropbox/file_types.py      |  6 +-
 .../app/connectors/google_drive/file_types.py |  8 +-
 .../app/connectors/onedrive/file_types.py     |  6 +-
 .../app/etl_pipeline/etl_pipeline_service.py  | 11 +++
 .../app/etl_pipeline/file_classifier.py       | 19 ++++-
 .../app/utils/file_extensions.py              | 76 ++++++++++++++-----
 .../test_dropbox_parallel.py                  |  2 +
 .../connectors/test_dropbox_file_types.py     | 73 ++++++++++++++----
 .../test_google_drive_file_types.py           | 33 +++++++-
 .../connectors/test_onedrive_file_types.py    | 37 ++++++++-
 .../etl_pipeline/test_etl_pipeline_service.py | 69 +++++++++++++++++
 .../tests/unit/utils/test_file_extensions.py  | 76 ++++++++++++++++++-
 .../components/sources/DocumentUploadTab.tsx  | 39 +++++-----
 13 files changed, 388 insertions(+), 67 deletions(-)

diff --git a/surfsense_backend/app/connectors/dropbox/file_types.py b/surfsense_backend/app/connectors/dropbox/file_types.py
index 7b72c1857..13209ffd2 100644
--- a/surfsense_backend/app/connectors/dropbox/file_types.py
+++ b/surfsense_backend/app/connectors/dropbox/file_types.py
@@ -1,6 +1,6 @@
 """File type handlers for Dropbox."""
 
-from app.etl_pipeline.file_classifier import FileCategory, classify_file
+from app.etl_pipeline.file_classifier import should_skip_for_service
 
 PAPER_EXTENSION = ".paper"
 
@@ -53,5 +53,7 @@ def should_skip_file(item: dict) -> bool:
         return False
     if not item.get("is_downloadable", True):
         return True
+    from app.config import config as app_config
+
     name = item.get("name", "")
-    return classify_file(name) == FileCategory.UNSUPPORTED
+    return should_skip_for_service(name, app_config.ETL_SERVICE)
diff --git a/surfsense_backend/app/connectors/google_drive/file_types.py b/surfsense_backend/app/connectors/google_drive/file_types.py
index e0b8f001e..73f016ceb 100644
--- a/surfsense_backend/app/connectors/google_drive/file_types.py
+++ b/surfsense_backend/app/connectors/google_drive/file_types.py
@@ -1,6 +1,6 @@
 """File type handlers for Google Drive."""
 
-from app.etl_pipeline.file_classifier import FileCategory, classify_file
+from app.etl_pipeline.file_classifier import should_skip_for_service
 
 GOOGLE_DOC = "application/vnd.google-apps.document"
 GOOGLE_SHEET = "application/vnd.google-apps.spreadsheet"
@@ -49,8 +49,10 @@ def should_skip_file(mime_type: str) -> bool:
 
 
 def should_skip_by_extension(filename: str) -> bool:
-    """Return True if the file extension is not parseable by any ETL pipeline."""
-    return classify_file(filename) == FileCategory.UNSUPPORTED
+    """Return True if the file extension is not parseable by the configured ETL service."""
+    from app.config import config as app_config
+
+    return should_skip_for_service(filename, app_config.ETL_SERVICE)
 
 
 def get_export_mime_type(mime_type: str) -> str | None:
diff --git a/surfsense_backend/app/connectors/onedrive/file_types.py b/surfsense_backend/app/connectors/onedrive/file_types.py
index bcd78b711..f9c147da8 100644
--- a/surfsense_backend/app/connectors/onedrive/file_types.py
+++ b/surfsense_backend/app/connectors/onedrive/file_types.py
@@ -1,6 +1,6 @@
 """File type handlers for Microsoft OneDrive."""
 
-from app.etl_pipeline.file_classifier import FileCategory, classify_file
+from app.etl_pipeline.file_classifier import should_skip_for_service
 
 ONEDRIVE_FOLDER_FACET = "folder"
 ONENOTE_MIME = "application/msonenote"
@@ -51,5 +51,7 @@ def should_skip_file(item: dict) -> bool:
     mime = item.get("file", {}).get("mimeType", "")
     if mime in SKIP_MIME_TYPES:
         return True
+    from app.config import config as app_config
+
     name = item.get("name", "")
-    return classify_file(name) == FileCategory.UNSUPPORTED
+    return should_skip_for_service(name, app_config.ETL_SERVICE)
diff --git a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py
index 7c67d2345..a0041c843 100644
--- a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py
+++ b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py
@@ -45,6 +45,10 @@ class EtlPipelineService:
         return await self._extract_document(request)
 
     async def _extract_document(self, request: EtlRequest) -> EtlResult:
+        from pathlib import PurePosixPath
+
+        from app.utils.file_extensions import get_document_extensions_for_service
+
         etl_service = app_config.ETL_SERVICE
         if not etl_service:
             raise EtlServiceUnavailableError(
@@ -52,6 +56,13 @@ class EtlPipelineService:
                 "Set ETL_SERVICE to UNSTRUCTURED, LLAMACLOUD, or DOCLING in your .env"
             )
 
+        ext = PurePosixPath(request.filename).suffix.lower()
+        supported = get_document_extensions_for_service(etl_service)
+        if ext not in supported:
+            raise EtlUnsupportedFileError(
+                f"File type {ext} is not supported by {etl_service}"
+            )
+
         if etl_service == "DOCLING":
             from app.etl_pipeline.parsers.docling import parse_with_docling
 
diff --git a/surfsense_backend/app/etl_pipeline/file_classifier.py b/surfsense_backend/app/etl_pipeline/file_classifier.py
index eea9cce22..bc7b4537c 100644
--- a/surfsense_backend/app/etl_pipeline/file_classifier.py
+++ b/surfsense_backend/app/etl_pipeline/file_classifier.py
@@ -1,7 +1,7 @@
 from enum import Enum
 from pathlib import PurePosixPath
 
-from app.utils.file_extensions import DOCUMENT_EXTENSIONS
+from app.utils.file_extensions import DOCUMENT_EXTENSIONS, get_document_extensions_for_service
 
 PLAINTEXT_EXTENSIONS = frozenset(
     {
@@ -29,7 +29,7 @@ AUDIO_EXTENSIONS = frozenset(
     {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm"}
 )
 
-DIRECT_CONVERT_EXTENSIONS = frozenset({".csv", ".tsv", ".html", ".htm"})
+DIRECT_CONVERT_EXTENSIONS = frozenset({".csv", ".tsv", ".html", ".htm", ".xhtml"})
 
 
 class FileCategory(Enum):
@@ -51,3 +51,18 @@ def classify_file(filename: str) -> FileCategory:
     if suffix in DOCUMENT_EXTENSIONS:
         return FileCategory.DOCUMENT
     return FileCategory.UNSUPPORTED
+
+
+def should_skip_for_service(filename: str, etl_service: str | None) -> bool:
+    """Tell whether *filename* has to be skipped when *etl_service* is in use.
+
+    Unsupported files are always skipped.  Only documents depend on the parser;
+    plaintext, audio, and direct-convert files are kept for every service.
+    """
+    file_kind = classify_file(filename)
+    if file_kind == FileCategory.UNSUPPORTED:
+        return True
+    if file_kind != FileCategory.DOCUMENT:
+        return False
+    allowed = get_document_extensions_for_service(etl_service)
+    return PurePosixPath(filename).suffix.lower() not in allowed
diff --git a/surfsense_backend/app/utils/file_extensions.py b/surfsense_backend/app/utils/file_extensions.py
index b0a4c808c..5eed36872 100644
--- a/surfsense_backend/app/utils/file_extensions.py
+++ b/surfsense_backend/app/utils/file_extensions.py
@@ -1,29 +1,69 @@
-"""Allowlist of document extensions the ETL parsers can handle.
+"""Per-parser document extension sets for the ETL pipeline.
 
-Every consumer (file_classifier, connector-level skip checks) imports from
-here so there is a single source of truth.  Extensions already covered by
-PLAINTEXT_EXTENSIONS, AUDIO_EXTENSIONS, or DIRECT_CONVERT_EXTENSIONS in
-file_classifier are NOT repeated here -- this set is exclusively for the
-"document" ETL path (Docling / LlamaParse / Unstructured).
+Every consumer (file_classifier, connector-level skip checks, ETL pipeline
+validation) imports from here so there is a single source of truth.
+
+Extensions already covered by PLAINTEXT_EXTENSIONS, AUDIO_EXTENSIONS, or
+DIRECT_CONVERT_EXTENSIONS in file_classifier are NOT repeated here -- these
+sets are exclusively for the "document" ETL path (Docling / LlamaParse /
+Unstructured).
 """
 
 from pathlib import PurePosixPath
 
-DOCUMENT_EXTENSIONS: frozenset[str] = frozenset({
-    # PDF
+# ---------------------------------------------------------------------------
+# Per-parser document extension sets (from official documentation)
+# ---------------------------------------------------------------------------
+
+DOCLING_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset({
     ".pdf",
-    # Microsoft Office
-    ".docx", ".doc", ".xlsx", ".xls", ".pptx", ".ppt",
-    # Images (raster: OCR / vision parsing)
-    ".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".tif",
-    # Rich text / e-book
-    ".rtf", ".epub",
-    # OpenDocument
-    ".odt", ".ods", ".odp",
-    # Other (LlamaParse / Unstructured specific)
-    ".hwpx",
+    ".docx", ".xlsx", ".pptx",
+    ".png", ".jpg", ".jpeg", ".tiff", ".tif", ".bmp", ".webp",
 })
 
+LLAMAPARSE_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset({
+    ".pdf",
+    ".docx", ".doc", ".xlsx", ".xls", ".pptx", ".ppt",  # Microsoft Office
+    ".docm", ".dot", ".dotm", ".pptm", ".pot", ".potx",  # more Word / PowerPoint variants
+    ".xlsm", ".xlsb", ".xlw",  # more Excel variants
+    ".rtf", ".epub",  # rich text / e-book
+    ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".tif", ".webp", ".svg",  # images
+    ".odt", ".ods", ".odp",  # OpenDocument
+    ".hwp", ".hwpx",  # HWP / HWPX word-processor files
+})
+
+UNSTRUCTURED_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset({
+    ".pdf",
+    ".docx", ".doc", ".xlsx", ".xls", ".pptx", ".ppt",  # Microsoft Office
+    ".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".tif", ".heic",  # images
+    ".rtf", ".epub", ".odt",  # rich text / e-book / OpenDocument text
+    ".eml", ".msg", ".p7s",  # e-mail messages and signatures
+})
+
+# ---------------------------------------------------------------------------
+# Union (used by classify_file for routing) + service lookup
+# ---------------------------------------------------------------------------
+
+DOCUMENT_EXTENSIONS: frozenset[str] = (
+    DOCLING_DOCUMENT_EXTENSIONS
+    | LLAMAPARSE_DOCUMENT_EXTENSIONS
+    | UNSTRUCTURED_DOCUMENT_EXTENSIONS
+)
+
+_SERVICE_MAP: dict[str, frozenset[str]] = {  # keyed by the ETL_SERVICE setting
+    "DOCLING": DOCLING_DOCUMENT_EXTENSIONS,
+    "LLAMACLOUD": LLAMAPARSE_DOCUMENT_EXTENSIONS,  # key is LLAMACLOUD, set is LLAMAPARSE
+    "UNSTRUCTURED": UNSTRUCTURED_DOCUMENT_EXTENSIONS,
+}
+
+
+def get_document_extensions_for_service(etl_service: str | None) -> frozenset[str]:
+    """Return the document extensions supported by *etl_service*.
+
+    Names are matched exactly; ``None``, empty, or unknown yields the full union.
+    """
+    return _SERVICE_MAP.get(etl_service or "", DOCUMENT_EXTENSIONS)
+
 
 def is_supported_document_extension(filename: str) -> bool:
     """Return True if the file's extension is in the supported document set."""
diff --git a/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py b/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py
index 7a828b9c4..8572fa8ea 100644
--- a/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py
+++ b/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py
@@ -261,6 +261,8 @@ def full_scan_mocks(mock_dropbox_client, monkeypatch):
 
     skip_results: dict[str, tuple[bool, str | None]] = {}
 
+    monkeypatch.setattr("app.config.config.ETL_SERVICE", "LLAMACLOUD")
+
     async def _fake_skip(session, file, search_space_id):
         from app.connectors.dropbox.file_types import should_skip_file as _skip
         if _skip(file):
diff --git a/surfsense_backend/tests/unit/connectors/test_dropbox_file_types.py b/surfsense_backend/tests/unit/connectors/test_dropbox_file_types.py
index 5480d8c8a..e092872c5 100644
--- a/surfsense_backend/tests/unit/connectors/test_dropbox_file_types.py
+++ b/surfsense_backend/tests/unit/connectors/test_dropbox_file_types.py
@@ -7,6 +7,11 @@ from app.connectors.dropbox.file_types import should_skip_file
 pytestmark = pytest.mark.unit
 
 
+# ---------------------------------------------------------------------------
+# Structural skips (independent of ETL service)
+# ---------------------------------------------------------------------------
+
+
 def test_folder_item_is_skipped():
     item = {".tag": "folder", "name": "My Folder"}
     assert should_skip_file(item) is True
@@ -22,13 +27,18 @@ def test_non_downloadable_item_is_skipped():
     assert should_skip_file(item) is True
 
 
+# ---------------------------------------------------------------------------
+# Extension-based skips (require ETL service context)
+# ---------------------------------------------------------------------------
+
+
 @pytest.mark.parametrize(
     "filename",
     [
         "archive.zip", "backup.tar", "data.gz", "stuff.rar", "pack.7z",
         "program.exe", "lib.dll", "module.so", "image.dmg", "disk.iso",
         "movie.mov", "clip.avi", "video.mkv", "film.wmv", "stream.flv",
-        "icon.svg", "anim.gif", "photo.webp", "shot.heic", "favicon.ico",
+        "favicon.ico",
         "raw.cr2", "photo.nef", "image.arw", "pic.dng",
         "design.psd", "vector.ai", "mockup.sketch", "proto.fig",
         "font.ttf", "font.otf", "font.woff", "font.woff2",
@@ -36,7 +46,8 @@ def test_non_downloadable_item_is_skipped():
         "local.db", "data.sqlite", "access.mdb",
     ],
 )
-def test_non_parseable_extensions_are_skipped(filename):
+def test_non_parseable_extensions_are_skipped(filename, mocker):
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
     item = {".tag": "file", "name": filename}
     assert should_skip_file(item) is True, f"{filename} should be skipped"
 
@@ -45,29 +56,61 @@ def test_non_parseable_extensions_are_skipped(filename):
     "filename",
     [
         "report.pdf", "document.docx", "sheet.xlsx", "slides.pptx",
-        "old.doc", "legacy.xls", "deck.ppt",
         "readme.txt", "data.csv", "page.html", "notes.md",
         "config.json", "feed.xml",
     ],
 )
-def test_parseable_documents_are_not_skipped(filename):
-    item = {".tag": "file", "name": filename}
-    assert should_skip_file(item) is False, f"{filename} should NOT be skipped"
+def test_parseable_documents_are_not_skipped(filename, mocker):
+    """Files in plaintext/direct_convert/universal document sets are never skipped."""
+    for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
+        mocker.patch("app.config.config.ETL_SERVICE", service)
+        item = {".tag": "file", "name": filename}
+        assert should_skip_file(item) is False, (
+            f"{filename} should NOT be skipped with {service}"
+        )
 
 
 @pytest.mark.parametrize(
     "filename",
     ["photo.jpg", "image.jpeg", "screenshot.png", "scan.bmp", "page.tiff", "doc.tif"],
 )
-def test_universal_images_are_not_skipped(filename):
-    item = {".tag": "file", "name": filename}
-    assert should_skip_file(item) is False, f"{filename} should NOT be skipped"
+def test_universal_images_are_not_skipped(filename, mocker):
+    """Images supported by all parsers are never skipped."""
+    for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
+        mocker.patch("app.config.config.ETL_SERVICE", service)
+        item = {".tag": "file", "name": filename}
+        assert should_skip_file(item) is False, (
+            f"{filename} should NOT be skipped with {service}"
+        )
 
 
-@pytest.mark.parametrize(
-    "filename",
-    ["icon.svg", "anim.gif", "photo.webp", "live.heic"],
-)
-def test_non_universal_images_are_skipped(filename):
+@pytest.mark.parametrize("filename,service,expected_skip", [
+    ("old.doc", "DOCLING", True),
+    ("old.doc", "LLAMACLOUD", False),
+    ("old.doc", "UNSTRUCTURED", False),
+    ("legacy.xls", "DOCLING", True),
+    ("legacy.xls", "LLAMACLOUD", False),
+    ("legacy.xls", "UNSTRUCTURED", False),
+    ("deck.ppt", "DOCLING", True),
+    ("deck.ppt", "LLAMACLOUD", False),
+    ("deck.ppt", "UNSTRUCTURED", False),
+    ("icon.svg", "DOCLING", True),
+    ("icon.svg", "LLAMACLOUD", False),
+    ("anim.gif", "DOCLING", True),
+    ("anim.gif", "LLAMACLOUD", False),
+    ("photo.webp", "DOCLING", False),
+    ("photo.webp", "LLAMACLOUD", False),
+    ("photo.webp", "UNSTRUCTURED", True),
+    ("live.heic", "DOCLING", True),
+    ("live.heic", "UNSTRUCTURED", False),
+    ("macro.docm", "DOCLING", True),
+    ("macro.docm", "LLAMACLOUD", False),
+    ("mail.eml", "DOCLING", True),
+    ("mail.eml", "UNSTRUCTURED", False),
+])
+def test_parser_specific_extensions(filename, service, expected_skip, mocker):
+    mocker.patch("app.config.config.ETL_SERVICE", service)
     item = {".tag": "file", "name": filename}
-    assert should_skip_file(item) is True, f"{filename} should be skipped"
+    assert should_skip_file(item) is expected_skip, (
+        f"{filename} with {service}: expected skip={expected_skip}"
+    )
diff --git a/surfsense_backend/tests/unit/connectors/test_google_drive_file_types.py b/surfsense_backend/tests/unit/connectors/test_google_drive_file_types.py
index adbad74c2..4ed7eb4db 100644
--- a/surfsense_backend/tests/unit/connectors/test_google_drive_file_types.py
+++ b/surfsense_backend/tests/unit/connectors/test_google_drive_file_types.py
@@ -10,13 +10,38 @@ pytestmark = pytest.mark.unit
 @pytest.mark.parametrize("filename", [
     "malware.exe", "archive.zip", "video.mov", "font.woff2", "model.blend",
 ])
-def test_unsupported_extensions_are_skipped(filename):
-    assert should_skip_by_extension(filename) is True
+def test_unsupported_extensions_are_skipped_regardless_of_service(filename, mocker):
+    """Truly unsupported files are skipped no matter which ETL service is configured."""
+    for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
+        mocker.patch("app.config.config.ETL_SERVICE", service)
+        assert should_skip_by_extension(filename) is True
 
 
 @pytest.mark.parametrize("filename", [
     "report.pdf", "doc.docx", "sheet.xlsx", "slides.pptx",
     "readme.txt", "data.csv", "photo.png", "notes.md",
 ])
-def test_parseable_extensions_are_not_skipped(filename):
-    assert should_skip_by_extension(filename) is False
+def test_universal_extensions_are_not_skipped(filename, mocker):
+    """Files supported by all parsers (or handled by plaintext/direct_convert) are never skipped."""
+    for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
+        mocker.patch("app.config.config.ETL_SERVICE", service)
+        assert should_skip_by_extension(filename) is False, (
+            f"{filename} should NOT be skipped with {service}"
+        )
+
+
+@pytest.mark.parametrize("filename,service,expected_skip", [
+    ("macro.docm", "DOCLING", True),
+    ("macro.docm", "LLAMACLOUD", False),
+    ("mail.eml", "DOCLING", True),
+    ("mail.eml", "UNSTRUCTURED", False),
+    ("photo.gif", "DOCLING", True),
+    ("photo.gif", "LLAMACLOUD", False),
+    ("photo.heic", "UNSTRUCTURED", False),
+    ("photo.heic", "DOCLING", True),
+])
+def test_parser_specific_extensions(filename, service, expected_skip, mocker):
+    """Skip decision for one (filename, configured service) row of the table."""
+    mocker.patch("app.config.config.ETL_SERVICE", service)
+    hint = f"{filename} with {service}: expected skip={expected_skip}"
+    assert should_skip_by_extension(filename) is expected_skip, hint
diff --git a/surfsense_backend/tests/unit/connectors/test_onedrive_file_types.py b/surfsense_backend/tests/unit/connectors/test_onedrive_file_types.py
index a2491257d..e73f799e2 100644
--- a/surfsense_backend/tests/unit/connectors/test_onedrive_file_types.py
+++ b/surfsense_backend/tests/unit/connectors/test_onedrive_file_types.py
@@ -7,6 +7,11 @@ from app.connectors.onedrive.file_types import should_skip_file
 pytestmark = pytest.mark.unit
 
 
+# ---------------------------------------------------------------------------
+# Structural skips (independent of ETL service)
+# ---------------------------------------------------------------------------
+
+
 def test_folder_is_skipped():
     item = {"folder": {}, "name": "My Folder"}
     assert should_skip_file(item) is True
@@ -27,10 +32,16 @@ def test_onenote_is_skipped():
     assert should_skip_file(item) is True
 
 
+# ---------------------------------------------------------------------------
+# Extension-based skips (require ETL service context)
+# ---------------------------------------------------------------------------
+
+
 @pytest.mark.parametrize("filename", [
     "malware.exe", "archive.zip", "video.mov", "font.woff2", "model.blend",
 ])
-def test_unsupported_extensions_are_skipped(filename):
+def test_unsupported_extensions_are_skipped(filename, mocker):
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
     item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
     assert should_skip_file(item) is True, f"{filename} should be skipped"
 
@@ -39,6 +50,26 @@ def test_unsupported_extensions_are_skipped(filename):
     "report.pdf", "doc.docx", "sheet.xlsx", "slides.pptx",
     "readme.txt", "data.csv", "photo.png", "notes.md",
 ])
-def test_parseable_files_are_not_skipped(filename):
+def test_universal_files_are_not_skipped(filename, mocker):
+    for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
+        mocker.patch("app.config.config.ETL_SERVICE", service)
+        item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
+        assert should_skip_file(item) is False, (
+            f"{filename} should NOT be skipped with {service}"
+        )
+
+
+@pytest.mark.parametrize("filename,service,expected_skip", [
+    ("macro.docm", "DOCLING", True),
+    ("macro.docm", "LLAMACLOUD", False),
+    ("mail.eml", "DOCLING", True),
+    ("mail.eml", "UNSTRUCTURED", False),
+    ("photo.heic", "UNSTRUCTURED", False),
+    ("photo.heic", "DOCLING", True),
+])
+def test_parser_specific_extensions(filename, service, expected_skip, mocker):
+    mocker.patch("app.config.config.ETL_SERVICE", service)
     item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
-    assert should_skip_file(item) is False, f"{filename} should NOT be skipped"
+    assert should_skip_file(item) is expected_skip, (
+        f"{filename} with {service}: expected skip={expected_skip}"
+    )
diff --git a/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py b/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py
index facf15eab..e90847e3a 100644
--- a/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py
+++ b/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py
@@ -377,3 +377,72 @@ async def test_extract_zip_raises_unsupported_error(tmp_path):
         await EtlPipelineService().extract(
             EtlRequest(file_path=str(zip_file), filename="archive.zip")
         )
+
+
+# ---------------------------------------------------------------------------
+# Slice 14 – should_skip_for_service (per-parser document filtering)
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize("filename,etl_service,expected_skip", [
+    ("file.eml", "DOCLING", True),
+    ("file.eml", "UNSTRUCTURED", False),
+    ("file.docm", "LLAMACLOUD", False),
+    ("file.docm", "DOCLING", True),
+    ("file.txt", "DOCLING", False),
+    ("file.csv", "LLAMACLOUD", False),
+    ("file.mp3", "UNSTRUCTURED", False),
+    ("file.exe", "LLAMACLOUD", True),
+    ("file.pdf", "DOCLING", False),
+    ("file.webp", "DOCLING", False),
+    ("file.webp", "UNSTRUCTURED", True),
+    ("file.gif", "LLAMACLOUD", False),
+    ("file.gif", "DOCLING", True),
+    ("file.heic", "UNSTRUCTURED", False),
+    ("file.heic", "DOCLING", True),
+    ("file.svg", "LLAMACLOUD", False),
+    ("file.svg", "DOCLING", True),
+    ("file.p7s", "UNSTRUCTURED", False),
+    ("file.p7s", "LLAMACLOUD", True),
+])
+def test_should_skip_for_service(filename, etl_service, expected_skip):
+    """Each (file, parser) row is skipped exactly when the table says so."""
+    from app.etl_pipeline.file_classifier import should_skip_for_service
+
+    hint = f"{filename} with {etl_service}: expected skip={expected_skip}"
+    assert should_skip_for_service(filename, etl_service) is expected_skip, hint
+
+
+# ---------------------------------------------------------------------------
+# Slice 14b – ETL pipeline rejects per-parser incompatible documents
+# ---------------------------------------------------------------------------
+
+
+async def test_extract_docm_with_docling_raises_unsupported(tmp_path, mocker):
+    """Under DOCLING, extracting a .docm file raises EtlUnsupportedFileError."""
+    from app.etl_pipeline.exceptions import EtlUnsupportedFileError
+
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    # Placeholder bytes only; the test expects rejection, not a real parse.
+    macro_path = tmp_path / "macro.docm"
+    macro_path.write_bytes(b"\x00" * 10)
+
+    with pytest.raises(EtlUnsupportedFileError, match="not supported by DOCLING"):
+        request = EtlRequest(file_path=str(macro_path), filename="macro.docm")
+        await EtlPipelineService().extract(request)
+
+
+async def test_extract_eml_with_docling_raises_unsupported(tmp_path, mocker):
+    """Under DOCLING, extracting an .eml file raises EtlUnsupportedFileError."""
+    from app.etl_pipeline.exceptions import EtlUnsupportedFileError
+
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    # A single header line is enough; the test expects rejection, not a parse.
+    mail_path = tmp_path / "mail.eml"
+    mail_path.write_bytes(b"From: test@example.com")
+
+    with pytest.raises(EtlUnsupportedFileError, match="not supported by DOCLING"):
+        request = EtlRequest(file_path=str(mail_path), filename="mail.eml")
+        await EtlPipelineService().extract(request)
diff --git a/surfsense_backend/tests/unit/utils/test_file_extensions.py b/surfsense_backend/tests/unit/utils/test_file_extensions.py
index a376f44bd..acd8945ce 100644
--- a/surfsense_backend/tests/unit/utils/test_file_extensions.py
+++ b/surfsense_backend/tests/unit/utils/test_file_extensions.py
@@ -21,10 +21,17 @@ def test_exe_is_not_supported_document():
     "report.pdf", "doc.docx", "old.doc",
     "sheet.xlsx", "legacy.xls",
     "slides.pptx", "deck.ppt",
+    "macro.docm", "macro.xlsm", "macro.pptm",
     "photo.png", "photo.jpg", "photo.jpeg", "scan.bmp", "scan.tiff", "scan.tif",
+    "photo.webp", "anim.gif", "iphone.heic",
     "manual.rtf", "book.epub",
     "letter.odt", "data.ods", "presentation.odp",
-    "korean.hwpx",
+    "inbox.eml", "outlook.msg",
+    "korean.hwpx", "korean.hwp",
+    "template.dot", "template.dotm",
+    "template.pot", "template.potx",
+    "binary.xlsb", "workspace.xlw",
+    "vector.svg", "signature.p7s",
 ])
 def test_document_extensions_are_supported(filename):
     from app.utils.file_extensions import is_supported_document_extension
@@ -40,3 +47,70 @@ def test_non_document_extensions_are_not_supported(filename):
     from app.utils.file_extensions import is_supported_document_extension
 
     assert is_supported_document_extension(filename) is False, f"{filename} should NOT be supported"
+
+
+# ---------------------------------------------------------------------------
+# Per-parser extension sets
+# ---------------------------------------------------------------------------
+
+
+def test_union_equals_all_three_sets():
+    """DOCUMENT_EXTENSIONS must equal the union of the three per-parser sets."""
+    from app.utils import file_extensions as fe
+
+    per_parser_sets = (
+        fe.DOCLING_DOCUMENT_EXTENSIONS,
+        fe.LLAMAPARSE_DOCUMENT_EXTENSIONS,
+        fe.UNSTRUCTURED_DOCUMENT_EXTENSIONS,
+    )
+
+    # Rebuild the union independently of the module's own ``|`` expression.
+    rebuilt = frozenset().union(*per_parser_sets)
+
+    assert fe.DOCUMENT_EXTENSIONS == rebuilt
+
+
+def test_get_extensions_for_docling():
+    """The DOCLING lookup contains PDF/WebP/DOCX and omits the formats below."""
+    from app.utils.file_extensions import get_document_extensions_for_service
+
+    docling_exts = get_document_extensions_for_service("DOCLING")
+
+    for ext in (".pdf", ".webp", ".docx"):
+        assert ext in docling_exts
+    # These appear only in the LlamaParse or Unstructured sets.
+    for ext in (".eml", ".docm", ".gif", ".heic"):
+        assert ext not in docling_exts
+
+
+def test_get_extensions_for_llamacloud():
+    """The LLAMACLOUD lookup has .docm/.gif/.svg/.hwp but not .eml or .heic."""
+    from app.utils.file_extensions import get_document_extensions_for_service
+
+    llama_exts = get_document_extensions_for_service("LLAMACLOUD")
+
+    for ext in (".docm", ".gif", ".svg", ".hwp"):
+        assert ext in llama_exts
+    for ext in (".eml", ".heic"):
+        assert ext not in llama_exts
+
+
+def test_get_extensions_for_unstructured():
+    """The UNSTRUCTURED lookup has .eml/.heic/.p7s but not .docm/.gif/.svg."""
+    from app.utils.file_extensions import get_document_extensions_for_service
+
+    unstructured_exts = get_document_extensions_for_service("UNSTRUCTURED")
+
+    for ext in (".eml", ".heic", ".p7s"):
+        assert ext in unstructured_exts
+    for ext in (".docm", ".gif", ".svg"):
+        assert ext not in unstructured_exts
+
+
+def test_get_extensions_for_none_returns_union():
+    """With no service given, the lookup yields the combined extension set."""
+    from app.utils import file_extensions as fe
+
+    fallback = fe.get_document_extensions_for_service(None)
+
+    assert fallback == fe.DOCUMENT_EXTENSIONS
diff --git a/surfsense_web/components/sources/DocumentUploadTab.tsx b/surfsense_web/components/sources/DocumentUploadTab.tsx
index 6b59f8ef6..c8ce195aa 100644
--- a/surfsense_web/components/sources/DocumentUploadTab.tsx
+++ b/surfsense_web/components/sources/DocumentUploadTab.tsx
@@ -85,7 +85,6 @@ const FILE_TYPE_CONFIG: Record<string, Record<string, string[]>> = {
 		"application/rtf": [".rtf"],
 		"application/xml": [".xml"],
 		"application/epub+zip": [".epub"],
-		"text/html": [".html", ".htm", ".web"],
 		"image/gif": [".gif"],
 		"image/svg+xml": [".svg"],
 		...audioFileTypes,
@@ -472,12 +471,13 @@ export function DocumentUploadTab({
 						</button>
 					))
 				) : (
-					<div
-						className="flex flex-col items-center gap-4 py-12 px-4 cursor-pointer"
-						onClick={() => {
-							if (!isElectron) fileInputRef.current?.click();
-						}}
-					>
+				<button
+					type="button"
+					className="flex flex-col items-center gap-4 py-12 px-4 cursor-pointer w-full bg-transparent border-none"
+					onClick={() => {
+						if (!isElectron) fileInputRef.current?.click();
+					}}
+				>
 						<Upload className="h-10 w-10 text-muted-foreground" />
 						<div className="text-center space-y-1.5">
 							<p className="text-base font-medium">
@@ -485,10 +485,11 @@ export function DocumentUploadTab({
 							</p>
 							<p className="text-sm text-muted-foreground">{t("file_size_limit")}</p>
 						</div>
-						<div className="w-full mt-1" onClick={(e) => e.stopPropagation()}>
-							{renderBrowseButton({ fullWidth: true })}
-						</div>
+					{/* biome-ignore lint/a11y/useSemanticElements: wrapper to stop click propagation to parent button */}
+					<div className="w-full mt-1" onClick={(e) => e.stopPropagation()} onKeyDown={(e) => e.stopPropagation()} role="group">
+						{renderBrowseButton({ fullWidth: true })}
 					</div>
+					</button>
 				)}
 			</div>
 
@@ -683,13 +684,17 @@ export function DocumentUploadTab({
 						</span>
 					</AccordionTrigger>
 					<AccordionContent className="px-3 pb-3">
-						<div className="flex flex-wrap gap-1">
-							{supportedExtensions.map((ext) => (
-								<Badge key={ext} variant="outline" className="text-[10px] px-1.5 py-0">
-									{ext}
-								</Badge>
-							))}
-						</div>
+					<div className="flex flex-wrap gap-1.5">
+						{supportedExtensions.map((ext) => (
+							<Badge
+								key={ext}
+								variant="secondary"
+								className="rounded border-0 bg-neutral-200/80 dark:bg-neutral-700/60 text-muted-foreground text-[10px] px-2 py-0.5 font-normal"
+							>
+								{ext}
+							</Badge>
+						))}
+					</div>
 					</AccordionContent>
 				</AccordionItem>
 			</Accordion>

From 3a1d7008174a43db3c13813a6237427587786ca8 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Tue, 7 Apr 2026 03:16:34 +0530
Subject: [PATCH 28/37] refactor: enhance file skipping logic across Dropbox,
 Google Drive, and OneDrive connectors to return unsupported extensions,
 improving error reporting and maintainability

---
 .../app/connectors/dropbox/file_types.py      | 17 +++-
 .../app/connectors/google_drive/file_types.py | 14 ++-
 .../app/connectors/onedrive/file_types.py     | 24 +++--
 .../routes/search_source_connectors_routes.py | 11 ++-
 .../app/services/notification_service.py      | 47 +++++-----
 .../connector_indexers/dropbox_indexer.py     | 93 +++++++++++++------
 .../google_drive_indexer.py                   | 90 ++++++++++++------
 .../connector_indexers/onedrive_indexer.py    | 79 ++++++++++------
 .../test_dropbox_parallel.py                  | 13 ++-
 .../test_google_drive_parallel.py             |  2 +-
 .../connector_indexers/test_page_limits.py    |  4 +-
 .../connectors/test_dropbox_file_types.py     | 44 ++++++---
 .../test_google_drive_file_types.py           | 24 ++++-
 .../connectors/test_onedrive_file_types.py    | 42 +++++++--
 14 files changed, 344 insertions(+), 160 deletions(-)

diff --git a/surfsense_backend/app/connectors/dropbox/file_types.py b/surfsense_backend/app/connectors/dropbox/file_types.py
index 13209ffd2..d26306665 100644
--- a/surfsense_backend/app/connectors/dropbox/file_types.py
+++ b/surfsense_backend/app/connectors/dropbox/file_types.py
@@ -42,18 +42,25 @@ def is_paper_file(item: dict) -> bool:
     return ext == PAPER_EXTENSION
 
 
-def should_skip_file(item: dict) -> bool:
+def should_skip_file(item: dict) -> tuple[bool, str | None]:
     """Skip folders and truly non-indexable files.
 
     Paper docs are non-downloadable but exportable, so they are NOT skipped.
+    Returns (should_skip, unsupported_extension_or_None).
     """
     if is_folder(item):
-        return True
+        return True, None
     if is_paper_file(item):
-        return False
+        return False, None
     if not item.get("is_downloadable", True):
-        return True
+        return True, None
+
+    from pathlib import PurePosixPath
+
     from app.config import config as app_config
 
     name = item.get("name", "")
-    return should_skip_for_service(name, app_config.ETL_SERVICE)
+    if should_skip_for_service(name, app_config.ETL_SERVICE):
+        ext = PurePosixPath(name).suffix.lower()
+        return True, ext
+    return False, None
diff --git a/surfsense_backend/app/connectors/google_drive/file_types.py b/surfsense_backend/app/connectors/google_drive/file_types.py
index 73f016ceb..75dc1d4b3 100644
--- a/surfsense_backend/app/connectors/google_drive/file_types.py
+++ b/surfsense_backend/app/connectors/google_drive/file_types.py
@@ -48,11 +48,19 @@ def should_skip_file(mime_type: str) -> bool:
     return mime_type in [GOOGLE_FOLDER, GOOGLE_SHORTCUT]
 
 
-def should_skip_by_extension(filename: str) -> bool:
-    """Return True if the file extension is not parseable by the configured ETL service."""
+def should_skip_by_extension(filename: str) -> tuple[bool, str | None]:
+    """Check if the file extension is not parseable by the configured ETL service.
+
+    Returns (should_skip, unsupported_extension_or_None).
+    """
+    from pathlib import PurePosixPath
+
     from app.config import config as app_config
 
-    return should_skip_for_service(filename, app_config.ETL_SERVICE)
+    if should_skip_for_service(filename, app_config.ETL_SERVICE):
+        ext = PurePosixPath(filename).suffix.lower()
+        return True, ext
+    return False, None
 
 
 def get_export_mime_type(mime_type: str) -> str | None:
diff --git a/surfsense_backend/app/connectors/onedrive/file_types.py b/surfsense_backend/app/connectors/onedrive/file_types.py
index f9c147da8..942b0be73 100644
--- a/surfsense_backend/app/connectors/onedrive/file_types.py
+++ b/surfsense_backend/app/connectors/onedrive/file_types.py
@@ -40,18 +40,28 @@ def is_folder(item: dict) -> bool:
     return ONEDRIVE_FOLDER_FACET in item
 
 
-def should_skip_file(item: dict) -> bool:
-    """Skip folders, OneNote files, remote items (shared links), packages, and unsupported extensions."""
+def should_skip_file(item: dict) -> tuple[bool, str | None]:
+    """Skip folders, OneNote files, remote items, packages, and unsupported extensions.
+
+    Returns (should_skip, unsupported_extension_or_None).
+    The second element is only set when the skip is due to an unsupported extension.
+    """
     if is_folder(item):
-        return True
+        return True, None
     if "remoteItem" in item:
-        return True
+        return True, None
     if "package" in item:
-        return True
+        return True, None
     mime = item.get("file", {}).get("mimeType", "")
     if mime in SKIP_MIME_TYPES:
-        return True
+        return True, None
+
+    from pathlib import PurePosixPath
+
     from app.config import config as app_config
 
     name = item.get("name", "")
-    return should_skip_for_service(name, app_config.ETL_SERVICE)
+    if should_skip_for_service(name, app_config.ETL_SERVICE):
+        ext = PurePosixPath(name).suffix.lower()
+        return True, ext
+    return False, None
diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py
index d208ff910..a30eb7297 100644
--- a/surfsense_backend/app/routes/search_source_connectors_routes.py
+++ b/surfsense_backend/app/routes/search_source_connectors_routes.py
@@ -2477,6 +2477,8 @@ async def run_google_drive_indexing(
                 stage="fetching",
             )
 
+        total_unsupported = 0
+
         # Index each folder with indexing options
         for folder in items.folders:
             try:
@@ -2484,6 +2486,7 @@ async def run_google_drive_indexing(
                     indexed_count,
                     skipped_count,
                     error_message,
+                    unsupported_count,
                 ) = await index_google_drive_files(
                     session,
                     connector_id,
@@ -2497,6 +2500,7 @@ async def run_google_drive_indexing(
                     include_subfolders=indexing_options.include_subfolders,
                 )
                 total_skipped += skipped_count
+                total_unsupported += unsupported_count
                 if error_message:
                     errors.append(f"Folder '{folder.name}': {error_message}")
                 else:
@@ -2572,6 +2576,7 @@ async def run_google_drive_indexing(
                 indexed_count=total_indexed,
                 error_message=error_message,
                 skipped_count=total_skipped,
+                unsupported_count=total_unsupported,
             )
 
     except Exception as e:
@@ -2642,7 +2647,7 @@ async def run_onedrive_indexing(
                 stage="fetching",
             )
 
-        total_indexed, total_skipped, error_message = await index_onedrive_files(
+        total_indexed, total_skipped, error_message, total_unsupported = await index_onedrive_files(
             session,
             connector_id,
             search_space_id,
@@ -2683,6 +2688,7 @@ async def run_onedrive_indexing(
                 indexed_count=total_indexed,
                 error_message=error_message,
                 skipped_count=total_skipped,
+                unsupported_count=total_unsupported,
             )
 
     except Exception as e:
@@ -2750,7 +2756,7 @@ async def run_dropbox_indexing(
                 stage="fetching",
             )
 
-        total_indexed, total_skipped, error_message = await index_dropbox_files(
+        total_indexed, total_skipped, error_message, total_unsupported = await index_dropbox_files(
             session,
             connector_id,
             search_space_id,
@@ -2791,6 +2797,7 @@ async def run_dropbox_indexing(
                 indexed_count=total_indexed,
                 error_message=error_message,
                 skipped_count=total_skipped,
+                unsupported_count=total_unsupported,
             )
 
     except Exception as e:
diff --git a/surfsense_backend/app/services/notification_service.py b/surfsense_backend/app/services/notification_service.py
index 5e40a3b42..5ffee12d7 100644
--- a/surfsense_backend/app/services/notification_service.py
+++ b/surfsense_backend/app/services/notification_service.py
@@ -421,6 +421,7 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler):
         error_message: str | None = None,
         is_warning: bool = False,
         skipped_count: int | None = None,
+        unsupported_count: int | None = None,
     ) -> Notification:
         """
         Update notification when connector indexing completes.
@@ -428,10 +429,11 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler):
         Args:
             session: Database session
             notification: Notification to update
-            indexed_count: Total number of items indexed
+            indexed_count: Total number of files indexed
             error_message: Error message if indexing failed, or warning message (optional)
             is_warning: If True, treat error_message as a warning (success case) rather than an error
-            skipped_count: Number of items skipped (e.g., duplicates) - optional
+            skipped_count: Number of files skipped (e.g., unchanged) - optional
+            unsupported_count: Number of files skipped because the ETL parser doesn't support them
 
         Returns:
             Updated notification
@@ -440,52 +442,45 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler):
             "connector_name", "Connector"
         )
 
-        # Build the skipped text if there are skipped items
-        skipped_text = ""
-        if skipped_count and skipped_count > 0:
-            skipped_item_text = "item" if skipped_count == 1 else "items"
-            skipped_text = (
-                f" ({skipped_count} {skipped_item_text} skipped - already indexed)"
-            )
+        unsupported_text = ""
+        if unsupported_count and unsupported_count > 0:
+            file_word = "file was" if unsupported_count == 1 else "files were"
+            unsupported_text = f" {unsupported_count} {file_word} not supported."
 
-        # If there's an error message but items were indexed, treat it as a warning (partial success)
-        # If is_warning is True, treat it as success even with 0 items (e.g., duplicates found)
-        # Otherwise, treat it as a failure
         if error_message:
             if indexed_count > 0:
-                # Partial success with warnings (e.g., duplicate content from other connectors)
                 title = f"Ready: {connector_name}"
-                item_text = "item" if indexed_count == 1 else "items"
-                message = f"Now searchable! {indexed_count} {item_text} synced{skipped_text}. Note: {error_message}"
+                file_text = "file" if indexed_count == 1 else "files"
+                message = f"Now searchable! {indexed_count} {file_text} synced.{unsupported_text} Note: {error_message}"
                 status = "completed"
             elif is_warning:
-                # Warning case (e.g., duplicates found) - treat as success
                 title = f"Ready: {connector_name}"
-                message = f"Sync completed{skipped_text}. {error_message}"
+                message = f"Sync complete.{unsupported_text} {error_message}"
                 status = "completed"
             else:
-                # Complete failure
                 title = f"Failed: {connector_name}"
                 message = f"Sync failed: {error_message}"
+                if unsupported_text:
+                    message += unsupported_text
                 status = "failed"
         else:
             title = f"Ready: {connector_name}"
             if indexed_count == 0:
-                if skipped_count and skipped_count > 0:
-                    skipped_item_text = "item" if skipped_count == 1 else "items"
-                    message = f"Already up to date! {skipped_count} {skipped_item_text} skipped (already indexed)."
+                if unsupported_count and unsupported_count > 0:
+                    message = f"Sync complete.{unsupported_text}"
                 else:
-                    message = "Already up to date! No new items to sync."
+                    message = "Already up to date!"
             else:
-                item_text = "item" if indexed_count == 1 else "items"
-                message = (
-                    f"Now searchable! {indexed_count} {item_text} synced{skipped_text}."
-                )
+                file_text = "file" if indexed_count == 1 else "files"
+                message = f"Now searchable! {indexed_count} {file_text} synced."
+                if unsupported_text:
+                    message += unsupported_text
             status = "completed"
 
         metadata_updates = {
             "indexed_count": indexed_count,
             "skipped_count": skipped_count or 0,
+            "unsupported_count": unsupported_count or 0,
             "sync_stage": "completed"
             if (not error_message or is_warning or indexed_count > 0)
             else "failed",
diff --git a/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py b/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py
index d116cc264..9e7fe1cfb 100644
--- a/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py
@@ -51,7 +51,10 @@ async def _should_skip_file(
     file_id = file.get("id", "")
     file_name = file.get("name", "Unknown")
 
-    if skip_item(file):
+    skip, unsup_ext = skip_item(file)
+    if skip:
+        if unsup_ext:
+            return True, f"unsupported:{unsup_ext}"
         return True, "folder/non-downloadable"
     if not file_id:
         return True, "missing file_id"
@@ -287,7 +290,7 @@ async def _index_with_delta_sync(
     max_files: int,
     on_heartbeat_callback: HeartbeatCallbackType | None = None,
     enable_summary: bool = True,
-) -> tuple[int, int, str]:
+) -> tuple[int, int, int, str]:
     """Delta sync using Dropbox cursor-based change tracking.
 
-    Returns (indexed_count, skipped_count, new_cursor).
+    Returns (indexed_count, skipped_count, unsupported_count, new_cursor).
@@ -309,12 +312,13 @@ async def _index_with_delta_sync(
 
     if not entries:
         logger.info("No changes detected since last sync")
-        return 0, 0, new_cursor or cursor
+        return 0, 0, 0, new_cursor or cursor
 
     logger.info(f"Processing {len(entries)} change entries")
 
     renamed_count = 0
     skipped = 0
+    unsupported_count = 0
     files_to_download: list[dict] = []
     files_processed = 0
 
@@ -339,7 +343,9 @@ async def _index_with_delta_sync(
 
         skip, msg = await _should_skip_file(session, entry, search_space_id)
         if skip:
-            if msg and "renamed" in msg.lower():
+            if msg and msg.startswith("unsupported:"):
+                unsupported_count += 1
+            elif msg and "renamed" in msg.lower():
                 renamed_count += 1
             else:
                 skipped += 1
@@ -360,9 +366,10 @@ async def _index_with_delta_sync(
 
     indexed = renamed_count + batch_indexed
     logger.info(
-        f"Delta sync complete: {indexed} indexed, {skipped} skipped, {failed} failed"
+        f"Delta sync complete: {indexed} indexed, {skipped} skipped, "
+        f"{unsupported_count} unsupported, {failed} failed"
     )
-    return indexed, skipped, new_cursor or cursor
+    return indexed, skipped, unsupported_count, new_cursor or cursor
 
 
 async def _index_full_scan(
@@ -380,8 +387,11 @@ async def _index_full_scan(
     incremental_sync: bool = True,
     on_heartbeat_callback: HeartbeatCallbackType | None = None,
     enable_summary: bool = True,
-) -> tuple[int, int]:
-    """Full scan indexing of a folder."""
+) -> tuple[int, int, int]:
+    """Full scan indexing of a folder.
+
+    Returns (indexed, skipped, unsupported_count).
+    """
     await task_logger.log_task_progress(
         log_entry,
         f"Starting full scan of folder: {folder_name}",
@@ -401,6 +411,7 @@ async def _index_full_scan(
 
     renamed_count = 0
     skipped = 0
+    unsupported_count = 0
     files_to_download: list[dict] = []
 
     all_files, error = await get_files_in_folder(
@@ -420,14 +431,21 @@ async def _index_full_scan(
         if incremental_sync:
             skip, msg = await _should_skip_file(session, file, search_space_id)
             if skip:
-                if msg and "renamed" in msg.lower():
+                if msg and msg.startswith("unsupported:"):
+                    unsupported_count += 1
+                elif msg and "renamed" in msg.lower():
                     renamed_count += 1
                 else:
                     skipped += 1
                 continue
-        elif skip_item(file):
-            skipped += 1
-            continue
+        else:
+            item_skip, item_unsup = skip_item(file)
+            if item_skip:
+                if item_unsup:
+                    unsupported_count += 1
+                else:
+                    skipped += 1
+                continue
 
         file_pages = PageLimitService.estimate_pages_from_metadata(
             file.get("name", ""), file.get("size")
@@ -466,9 +484,10 @@ async def _index_full_scan(
 
     indexed = renamed_count + batch_indexed
     logger.info(
-        f"Full scan complete: {indexed} indexed, {skipped} skipped, {failed} failed"
+        f"Full scan complete: {indexed} indexed, {skipped} skipped, "
+        f"{unsupported_count} unsupported, {failed} failed"
     )
-    return indexed, skipped
+    return indexed, skipped, unsupported_count
 
 
 async def _index_selected_files(
@@ -493,6 +512,7 @@ async def _index_selected_files(
     errors: list[str] = []
     renamed_count = 0
     skipped = 0
+    unsupported_count = 0
 
     for file_path, file_name in file_paths:
         file, error = await get_file_by_path(dropbox_client, file_path)
@@ -504,14 +524,21 @@ async def _index_selected_files(
         if incremental_sync:
             skip, msg = await _should_skip_file(session, file, search_space_id)
             if skip:
-                if msg and "renamed" in msg.lower():
+                if msg and msg.startswith("unsupported:"):
+                    unsupported_count += 1
+                elif msg and "renamed" in msg.lower():
                     renamed_count += 1
                 else:
                     skipped += 1
                 continue
-        elif skip_item(file):
-            skipped += 1
-            continue
+        else:
+            item_skip, item_unsup = skip_item(file)
+            if item_skip:
+                if item_unsup:
+                    unsupported_count += 1
+                else:
+                    skipped += 1
+                continue
 
         file_pages = PageLimitService.estimate_pages_from_metadata(
             file.get("name", ""), file.get("size")
@@ -543,7 +570,7 @@ async def _index_selected_files(
             user_id, pages_to_deduct, allow_exceed=True
         )
 
-    return renamed_count + batch_indexed, skipped, errors
+    return renamed_count + batch_indexed, skipped, unsupported_count, errors
 
 
 async def index_dropbox_files(
@@ -552,7 +579,7 @@ async def index_dropbox_files(
     search_space_id: int,
     user_id: str,
     items_dict: dict,
-) -> tuple[int, int, str | None]:
+) -> tuple[int, int, str | None, int]:
     """Index Dropbox files for a specific connector.
 
     items_dict format:
@@ -583,7 +610,7 @@ async def index_dropbox_files(
             await task_logger.log_task_failure(
                 log_entry, error_msg, None, {"error_type": "ConnectorNotFound"}
             )
-            return 0, 0, error_msg
+            return 0, 0, error_msg, 0
 
         token_encrypted = connector.config.get("_token_encrypted", False)
         if token_encrypted and not config.SECRET_KEY:
@@ -594,7 +621,7 @@ async def index_dropbox_files(
                 "Missing SECRET_KEY",
                 {"error_type": "MissingSecretKey"},
             )
-            return 0, 0, error_msg
+            return 0, 0, error_msg, 0
 
         connector_enable_summary = getattr(connector, "enable_summary", True)
         dropbox_client = DropboxClient(session, connector_id)
@@ -609,6 +636,7 @@ async def index_dropbox_files(
 
         total_indexed = 0
         total_skipped = 0
+        total_unsupported = 0
 
         selected_files = items_dict.get("files", [])
         if selected_files:
@@ -616,7 +644,7 @@ async def index_dropbox_files(
                 (f.get("path", f.get("path_lower", f.get("id", ""))), f.get("name"))
                 for f in selected_files
             ]
-            indexed, skipped, file_errors = await _index_selected_files(
+            indexed, skipped, unsupported, file_errors = await _index_selected_files(
                 dropbox_client,
                 session,
                 file_tuples,
@@ -628,6 +656,7 @@ async def index_dropbox_files(
             )
             total_indexed += indexed
             total_skipped += skipped
+            total_unsupported += unsupported
             if file_errors:
                 logger.warning(
                     f"File indexing errors for connector {connector_id}: {file_errors}"
@@ -649,7 +678,7 @@ async def index_dropbox_files(
 
             if can_use_delta:
                 logger.info(f"Using delta sync for folder {folder_name}")
-                indexed, skipped, new_cursor = await _index_with_delta_sync(
+                indexed, skipped, unsup, new_cursor = await _index_with_delta_sync(
                     dropbox_client,
                     session,
                     connector_id,
@@ -662,9 +691,10 @@ async def index_dropbox_files(
                     enable_summary=connector_enable_summary,
                 )
                 folder_cursors[folder_path] = new_cursor
+                total_unsupported += unsup
             else:
                 logger.info(f"Using full scan for folder {folder_name}")
-                indexed, skipped = await _index_full_scan(
+                indexed, skipped, unsup = await _index_full_scan(
                     dropbox_client,
                     session,
                     connector_id,
@@ -679,6 +709,7 @@ async def index_dropbox_files(
                     incremental_sync=incremental_sync,
                     enable_summary=connector_enable_summary,
                 )
+                total_unsupported += unsup
 
             total_indexed += indexed
             total_skipped += skipped
@@ -708,12 +739,14 @@ async def index_dropbox_files(
         await task_logger.log_task_success(
             log_entry,
             f"Successfully completed Dropbox indexing for connector {connector_id}",
-            {"files_processed": total_indexed, "files_skipped": total_skipped},
+            {"files_processed": total_indexed, "files_skipped": total_skipped, "files_unsupported": total_unsupported},
         )
         logger.info(
-            f"Dropbox indexing completed: {total_indexed} indexed, {total_skipped} skipped"
+            f"Dropbox indexing completed: {total_indexed} indexed, "
+            f"{total_skipped} skipped, {total_unsupported} unsupported"
         )
-        return total_indexed, total_skipped, None
+
+        return total_indexed, total_skipped, None, total_unsupported
 
     except SQLAlchemyError as db_error:
         await session.rollback()
@@ -724,7 +757,7 @@ async def index_dropbox_files(
             {"error_type": "SQLAlchemyError"},
         )
         logger.error(f"Database error: {db_error!s}", exc_info=True)
-        return 0, 0, f"Database error: {db_error!s}"
+        return 0, 0, f"Database error: {db_error!s}", 0
     except Exception as e:
         await session.rollback()
         await task_logger.log_task_failure(
@@ -734,4 +767,4 @@ async def index_dropbox_files(
             {"error_type": type(e).__name__},
         )
         logger.error(f"Failed to index Dropbox files: {e!s}", exc_info=True)
-        return 0, 0, f"Failed to index Dropbox files: {e!s}"
+        return 0, 0, f"Failed to index Dropbox files: {e!s}", 0
diff --git a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
index 9c53092f5..b2afbb9c9 100644
--- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
@@ -81,8 +81,9 @@ async def _should_skip_file(
 
     if skip_mime(mime_type):
         return True, "folder/shortcut"
-    if should_skip_by_extension(file_name):
-        return True, "unsupported extension"
+    ext_skip, unsup_ext = should_skip_by_extension(file_name)
+    if ext_skip:
+        return True, f"unsupported:{unsup_ext}"
     if not file_id:
         return True, "missing file_id"
 
@@ -490,6 +491,7 @@ async def _index_selected_files(
     errors: list[str] = []
     renamed_count = 0
     skipped = 0
+    unsupported_count = 0
 
     for file_id, file_name in file_ids:
         file, error = await get_file_by_id(drive_client, file_id)
@@ -500,7 +502,9 @@ async def _index_selected_files(
 
         skip, msg = await _should_skip_file(session, file, search_space_id)
         if skip:
-            if msg and "renamed" in msg.lower():
+            if msg and msg.startswith("unsupported:"):
+                unsupported_count += 1
+            elif msg and "renamed" in msg.lower():
                 renamed_count += 1
             else:
                 skipped += 1
@@ -544,7 +548,7 @@ async def _index_selected_files(
             user_id, pages_to_deduct, allow_exceed=True
         )
 
-    return renamed_count + batch_indexed, skipped, errors
+    return renamed_count + batch_indexed, skipped, unsupported_count, errors
 
 
 # ---------------------------------------------------------------------------
@@ -567,8 +571,11 @@ async def _index_full_scan(
     include_subfolders: bool = False,
     on_heartbeat_callback: HeartbeatCallbackType | None = None,
     enable_summary: bool = True,
-) -> tuple[int, int]:
-    """Full scan indexing of a folder."""
+) -> tuple[int, int, int]:
+    """Full scan indexing of a folder.
+
+    Returns (indexed, skipped, unsupported_count).
+    """
     await task_logger.log_task_progress(
         log_entry,
         f"Starting full scan of folder: {folder_name} (include_subfolders={include_subfolders})",
@@ -590,6 +597,7 @@ async def _index_full_scan(
 
     renamed_count = 0
     skipped = 0
+    unsupported_count = 0
     files_processed = 0
     files_to_download: list[dict] = []
     folders_to_process = [(folder_id, folder_name)]
@@ -630,7 +638,9 @@ async def _index_full_scan(
 
                 skip, msg = await _should_skip_file(session, file, search_space_id)
                 if skip:
-                    if msg and "renamed" in msg.lower():
+                    if msg and msg.startswith("unsupported:"):
+                        unsupported_count += 1
+                    elif msg and "renamed" in msg.lower():
                         renamed_count += 1
                     else:
                         skipped += 1
@@ -703,9 +713,10 @@ async def _index_full_scan(
 
     indexed = renamed_count + batch_indexed
     logger.info(
-        f"Full scan complete: {indexed} indexed, {skipped} skipped, {failed} failed"
+        f"Full scan complete: {indexed} indexed, {skipped} skipped, "
+        f"{unsupported_count} unsupported, {failed} failed"
     )
-    return indexed, skipped
+    return indexed, skipped, unsupported_count
 
 
 async def _index_with_delta_sync(
@@ -723,8 +734,11 @@ async def _index_with_delta_sync(
     include_subfolders: bool = False,
     on_heartbeat_callback: HeartbeatCallbackType | None = None,
     enable_summary: bool = True,
-) -> tuple[int, int]:
-    """Delta sync using change tracking."""
+) -> tuple[int, int, int]:
+    """Delta sync using change tracking.
+
+    Returns (indexed, skipped, unsupported_count).
+    """
     await task_logger.log_task_progress(
         log_entry,
         f"Starting delta sync from token: {start_page_token[:20]}...",
@@ -759,6 +773,7 @@ async def _index_with_delta_sync(
 
     renamed_count = 0
     skipped = 0
+    unsupported_count = 0
     files_to_download: list[dict] = []
     files_processed = 0
 
@@ -780,7 +795,9 @@ async def _index_with_delta_sync(
 
         skip, msg = await _should_skip_file(session, file, search_space_id)
         if skip:
-            if msg and "renamed" in msg.lower():
+            if msg and msg.startswith("unsupported:"):
+                unsupported_count += 1
+            elif msg and "renamed" in msg.lower():
                 renamed_count += 1
             else:
                 skipped += 1
@@ -837,9 +854,10 @@ async def _index_with_delta_sync(
 
     indexed = renamed_count + batch_indexed
     logger.info(
-        f"Delta sync complete: {indexed} indexed, {skipped} skipped, {failed} failed"
+        f"Delta sync complete: {indexed} indexed, {skipped} skipped, "
+        f"{unsupported_count} unsupported, {failed} failed"
     )
-    return indexed, skipped
+    return indexed, skipped, unsupported_count
 
 
 # ---------------------------------------------------------------------------
@@ -859,8 +877,11 @@ async def index_google_drive_files(
     max_files: int = 500,
     include_subfolders: bool = False,
     on_heartbeat_callback: HeartbeatCallbackType | None = None,
-) -> tuple[int, int, str | None]:
-    """Index Google Drive files for a specific connector."""
+) -> tuple[int, int, str | None, int]:
+    """Index Google Drive files for a specific connector.
+
+    Returns (indexed, skipped, error_or_none, unsupported_count).
+    """
     task_logger = TaskLoggingService(session, search_space_id)
     log_entry = await task_logger.log_task_start(
         task_name="google_drive_files_indexing",
@@ -886,7 +907,7 @@ async def index_google_drive_files(
             await task_logger.log_task_failure(
                 log_entry, error_msg, None, {"error_type": "ConnectorNotFound"}
             )
-            return 0, 0, error_msg
+            return 0, 0, error_msg, 0
 
         await task_logger.log_task_progress(
             log_entry,
@@ -905,7 +926,7 @@ async def index_google_drive_files(
                     "Missing Composio account",
                     {"error_type": "MissingComposioAccount"},
                 )
-                return 0, 0, error_msg
+                return 0, 0, error_msg, 0
             pre_built_credentials = build_composio_credentials(connected_account_id)
         else:
             token_encrypted = connector.config.get("_token_encrypted", False)
@@ -920,6 +941,7 @@ async def index_google_drive_files(
                     0,
                     0,
                     "SECRET_KEY not configured but credentials are marked as encrypted",
+                    0,
                 )
 
         connector_enable_summary = getattr(connector, "enable_summary", True)
@@ -932,7 +954,7 @@ async def index_google_drive_files(
             await task_logger.log_task_failure(
                 log_entry, error_msg, {"error_type": "MissingParameter"}
             )
-            return 0, 0, error_msg
+            return 0, 0, error_msg, 0
 
         target_folder_id = folder_id
         target_folder_name = folder_name or "Selected Folder"
@@ -943,9 +965,11 @@ async def index_google_drive_files(
             use_delta_sync and start_page_token and connector.last_indexed_at
         )
 
+        documents_unsupported = 0
+
         if can_use_delta:
             logger.info(f"Using delta sync for connector {connector_id}")
-            documents_indexed, documents_skipped = await _index_with_delta_sync(
+            documents_indexed, documents_skipped, du = await _index_with_delta_sync(
                 drive_client,
                 session,
                 connector,
@@ -961,8 +985,9 @@ async def index_google_drive_files(
                 on_heartbeat_callback,
                 connector_enable_summary,
             )
+            documents_unsupported += du
             logger.info("Running reconciliation scan after delta sync")
-            ri, rs = await _index_full_scan(
+            ri, rs, ru = await _index_full_scan(
                 drive_client,
                 session,
                 connector,
@@ -980,9 +1005,10 @@ async def index_google_drive_files(
             )
             documents_indexed += ri
             documents_skipped += rs
+            documents_unsupported += ru
         else:
             logger.info(f"Using full scan for connector {connector_id}")
-            documents_indexed, documents_skipped = await _index_full_scan(
+            documents_indexed, documents_skipped, documents_unsupported = await _index_full_scan(
                 drive_client,
                 session,
                 connector,
@@ -1017,14 +1043,17 @@ async def index_google_drive_files(
             {
                 "files_processed": documents_indexed,
                 "files_skipped": documents_skipped,
+                "files_unsupported": documents_unsupported,
                 "sync_type": "delta" if can_use_delta else "full",
                 "folder": target_folder_name,
             },
         )
         logger.info(
-            f"Google Drive indexing completed: {documents_indexed} indexed, {documents_skipped} skipped"
+            f"Google Drive indexing completed: {documents_indexed} indexed, "
+            f"{documents_skipped} skipped, {documents_unsupported} unsupported"
         )
-        return documents_indexed, documents_skipped, None
+
+        return documents_indexed, documents_skipped, None, documents_unsupported
 
     except SQLAlchemyError as db_error:
         await session.rollback()
@@ -1035,7 +1064,7 @@ async def index_google_drive_files(
             {"error_type": "SQLAlchemyError"},
         )
         logger.error(f"Database error: {db_error!s}", exc_info=True)
-        return 0, 0, f"Database error: {db_error!s}"
+        return 0, 0, f"Database error: {db_error!s}", 0
     except Exception as e:
         await session.rollback()
         await task_logger.log_task_failure(
@@ -1045,7 +1074,7 @@ async def index_google_drive_files(
             {"error_type": type(e).__name__},
         )
         logger.error(f"Failed to index Google Drive files: {e!s}", exc_info=True)
-        return 0, 0, f"Failed to index Google Drive files: {e!s}"
+        return 0, 0, f"Failed to index Google Drive files: {e!s}", 0
 
 
 async def index_google_drive_single_file(
@@ -1247,7 +1276,7 @@ async def index_google_drive_selected_files(
             session, connector_id, credentials=pre_built_credentials
         )
 
-        indexed, skipped, errors = await _index_selected_files(
+        indexed, skipped, unsupported, errors = await _index_selected_files(
             drive_client,
             session,
             files,
@@ -1258,6 +1287,11 @@ async def index_google_drive_selected_files(
             on_heartbeat=on_heartbeat_callback,
         )
 
+        if unsupported > 0:
+            file_text = "file was" if unsupported == 1 else "files were"
+            unsup_msg = f"{unsupported} {file_text} not supported"
+            errors.append(unsup_msg)
+
         await session.commit()
 
         if errors:
@@ -1265,7 +1299,7 @@ async def index_google_drive_selected_files(
                 log_entry,
                 f"Batch file indexing completed with {len(errors)} error(s)",
                 "; ".join(errors),
-                {"indexed": indexed, "skipped": skipped, "error_count": len(errors)},
+                {"indexed": indexed, "skipped": skipped, "unsupported": unsupported, "error_count": len(errors)},
             )
         else:
             await task_logger.log_task_success(
diff --git a/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py
index 2301b6260..db42773fe 100644
--- a/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py
@@ -56,7 +56,10 @@ async def _should_skip_file(
     file_id = file.get("id")
     file_name = file.get("name", "Unknown")
 
-    if skip_item(file):
+    skip, unsup_ext = skip_item(file)
+    if skip:
+        if unsup_ext:
+            return True, f"unsupported:{unsup_ext}"
         return True, "folder/onenote/remote"
     if not file_id:
         return True, "missing file_id"
@@ -301,6 +304,7 @@ async def _index_selected_files(
     errors: list[str] = []
     renamed_count = 0
     skipped = 0
+    unsupported_count = 0
 
     for file_id, file_name in file_ids:
         file, error = await get_file_by_id(onedrive_client, file_id)
@@ -311,7 +315,9 @@ async def _index_selected_files(
 
         skip, msg = await _should_skip_file(session, file, search_space_id)
         if skip:
-            if msg and "renamed" in msg.lower():
+            if msg and msg.startswith("unsupported:"):
+                unsupported_count += 1
+            elif msg and "renamed" in msg.lower():
                 renamed_count += 1
             else:
                 skipped += 1
@@ -347,7 +353,7 @@ async def _index_selected_files(
             user_id, pages_to_deduct, allow_exceed=True
         )
 
-    return renamed_count + batch_indexed, skipped, errors
+    return renamed_count + batch_indexed, skipped, unsupported_count, errors
 
 
 # ---------------------------------------------------------------------------
@@ -369,8 +375,11 @@ async def _index_full_scan(
     include_subfolders: bool = True,
     on_heartbeat_callback: HeartbeatCallbackType | None = None,
     enable_summary: bool = True,
-) -> tuple[int, int]:
-    """Full scan indexing of a folder."""
+) -> tuple[int, int, int]:
+    """Full scan indexing of a folder.
+
+    Returns (indexed, skipped, unsupported_count).
+    """
     await task_logger.log_task_progress(
         log_entry,
         f"Starting full scan of folder: {folder_name}",
@@ -389,6 +398,7 @@ async def _index_full_scan(
 
     renamed_count = 0
     skipped = 0
+    unsupported_count = 0
     files_to_download: list[dict] = []
 
     all_files, error = await get_files_in_folder(
@@ -407,7 +417,9 @@ async def _index_full_scan(
     for file in all_files[:max_files]:
         skip, msg = await _should_skip_file(session, file, search_space_id)
         if skip:
-            if msg and "renamed" in msg.lower():
+            if msg and msg.startswith("unsupported:"):
+                unsupported_count += 1
+            elif msg and "renamed" in msg.lower():
                 renamed_count += 1
             else:
                 skipped += 1
@@ -450,9 +462,10 @@ async def _index_full_scan(
 
     indexed = renamed_count + batch_indexed
     logger.info(
-        f"Full scan complete: {indexed} indexed, {skipped} skipped, {failed} failed"
+        f"Full scan complete: {indexed} indexed, {skipped} skipped, "
+        f"{unsupported_count} unsupported, {failed} failed"
     )
-    return indexed, skipped
+    return indexed, skipped, unsupported_count
 
 
 async def _index_with_delta_sync(
@@ -468,8 +481,11 @@ async def _index_with_delta_sync(
     max_files: int,
     on_heartbeat_callback: HeartbeatCallbackType | None = None,
     enable_summary: bool = True,
-) -> tuple[int, int, str | None]:
-    """Delta sync using OneDrive change tracking. Returns (indexed, skipped, new_delta_link)."""
+) -> tuple[int, int, int, str | None]:
+    """Delta sync using OneDrive change tracking.
+
+    Returns (indexed, skipped, unsupported_count, new_delta_link).
+    """
     await task_logger.log_task_progress(
         log_entry,
         "Starting delta sync",
@@ -489,7 +505,7 @@ async def _index_with_delta_sync(
 
     if not changes:
         logger.info("No changes detected since last sync")
-        return 0, 0, new_delta_link
+        return 0, 0, 0, new_delta_link
 
     logger.info(f"Processing {len(changes)} delta changes")
 
@@ -501,6 +517,7 @@ async def _index_with_delta_sync(
 
     renamed_count = 0
     skipped = 0
+    unsupported_count = 0
     files_to_download: list[dict] = []
     files_processed = 0
 
@@ -523,7 +540,9 @@ async def _index_with_delta_sync(
 
         skip, msg = await _should_skip_file(session, change, search_space_id)
         if skip:
-            if msg and "renamed" in msg.lower():
+            if msg and msg.startswith("unsupported:"):
+                unsupported_count += 1
+            elif msg and "renamed" in msg.lower():
                 renamed_count += 1
             else:
                 skipped += 1
@@ -566,9 +585,10 @@ async def _index_with_delta_sync(
 
     indexed = renamed_count + batch_indexed
     logger.info(
-        f"Delta sync complete: {indexed} indexed, {skipped} skipped, {failed} failed"
+        f"Delta sync complete: {indexed} indexed, {skipped} skipped, "
+        f"{unsupported_count} unsupported, {failed} failed"
     )
-    return indexed, skipped, new_delta_link
+    return indexed, skipped, unsupported_count, new_delta_link
 
 
 # ---------------------------------------------------------------------------
@@ -582,7 +602,7 @@ async def index_onedrive_files(
     search_space_id: int,
     user_id: str,
     items_dict: dict,
-) -> tuple[int, int, str | None]:
+) -> tuple[int, int, str | None, int]:
     """Index OneDrive files for a specific connector.
 
     items_dict format:
@@ -609,7 +629,7 @@ async def index_onedrive_files(
             await task_logger.log_task_failure(
                 log_entry, error_msg, None, {"error_type": "ConnectorNotFound"}
             )
-            return 0, 0, error_msg
+            return 0, 0, error_msg, 0
 
         token_encrypted = connector.config.get("_token_encrypted", False)
         if token_encrypted and not config.SECRET_KEY:
@@ -620,7 +640,7 @@ async def index_onedrive_files(
                 "Missing SECRET_KEY",
                 {"error_type": "MissingSecretKey"},
             )
-            return 0, 0, error_msg
+            return 0, 0, error_msg, 0
 
         connector_enable_summary = getattr(connector, "enable_summary", True)
         onedrive_client = OneDriveClient(session, connector_id)
@@ -632,12 +652,13 @@ async def index_onedrive_files(
 
         total_indexed = 0
         total_skipped = 0
+        total_unsupported = 0
 
         # Index selected individual files
         selected_files = items_dict.get("files", [])
         if selected_files:
             file_tuples = [(f["id"], f.get("name")) for f in selected_files]
-            indexed, skipped, _errors = await _index_selected_files(
+            indexed, skipped, unsupported, _errors = await _index_selected_files(
                 onedrive_client,
                 session,
                 file_tuples,
@@ -648,6 +669,7 @@ async def index_onedrive_files(
             )
             total_indexed += indexed
             total_skipped += skipped
+            total_unsupported += unsupported
 
         # Index selected folders
         folders = items_dict.get("folders", [])
@@ -661,7 +683,7 @@ async def index_onedrive_files(
 
             if can_use_delta:
                 logger.info(f"Using delta sync for folder {folder_name}")
-                indexed, skipped, new_delta_link = await _index_with_delta_sync(
+                indexed, skipped, unsup, new_delta_link = await _index_with_delta_sync(
                     onedrive_client,
                     session,
                     connector_id,
@@ -676,6 +698,7 @@ async def index_onedrive_files(
                 )
                 total_indexed += indexed
                 total_skipped += skipped
+                total_unsupported += unsup
 
                 if new_delta_link:
                     await session.refresh(connector)
@@ -685,7 +708,7 @@ async def index_onedrive_files(
                     flag_modified(connector, "config")
 
                 # Reconciliation full scan
-                ri, rs = await _index_full_scan(
+                ri, rs, ru = await _index_full_scan(
                     onedrive_client,
                     session,
                     connector_id,
@@ -701,9 +724,10 @@ async def index_onedrive_files(
                 )
                 total_indexed += ri
                 total_skipped += rs
+                total_unsupported += ru
             else:
                 logger.info(f"Using full scan for folder {folder_name}")
-                indexed, skipped = await _index_full_scan(
+                indexed, skipped, unsup = await _index_full_scan(
                     onedrive_client,
                     session,
                     connector_id,
@@ -719,6 +743,7 @@ async def index_onedrive_files(
                 )
                 total_indexed += indexed
                 total_skipped += skipped
+                total_unsupported += unsup
 
             # Store new delta link for this folder
             _, new_delta_link, _ = await onedrive_client.get_delta(folder_id=folder_id)
@@ -737,12 +762,14 @@ async def index_onedrive_files(
         await task_logger.log_task_success(
             log_entry,
             f"Successfully completed OneDrive indexing for connector {connector_id}",
-            {"files_processed": total_indexed, "files_skipped": total_skipped},
+            {"files_processed": total_indexed, "files_skipped": total_skipped, "files_unsupported": total_unsupported},
         )
         logger.info(
-            f"OneDrive indexing completed: {total_indexed} indexed, {total_skipped} skipped"
+            f"OneDrive indexing completed: {total_indexed} indexed, "
+            f"{total_skipped} skipped, {total_unsupported} unsupported"
         )
-        return total_indexed, total_skipped, None
+
+        return total_indexed, total_skipped, None, total_unsupported
 
     except SQLAlchemyError as db_error:
         await session.rollback()
@@ -753,7 +780,7 @@ async def index_onedrive_files(
             {"error_type": "SQLAlchemyError"},
         )
         logger.error(f"Database error: {db_error!s}", exc_info=True)
-        return 0, 0, f"Database error: {db_error!s}"
+        return 0, 0, f"Database error: {db_error!s}", 0
     except Exception as e:
         await session.rollback()
         await task_logger.log_task_failure(
@@ -763,4 +790,4 @@ async def index_onedrive_files(
             {"error_type": type(e).__name__},
         )
         logger.error(f"Failed to index OneDrive files: {e!s}", exc_info=True)
-        return 0, 0, f"Failed to index OneDrive files: {e!s}"
+        return 0, 0, f"Failed to index OneDrive files: {e!s}", 0
diff --git a/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py b/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py
index 8572fa8ea..14c16fce4 100644
--- a/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py
+++ b/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py
@@ -265,7 +265,10 @@ def full_scan_mocks(mock_dropbox_client, monkeypatch):
 
     async def _fake_skip(session, file, search_space_id):
         from app.connectors.dropbox.file_types import should_skip_file as _skip
-        if _skip(file):
+        item_skip, unsup_ext = _skip(file)
+        if item_skip:
+            if unsup_ext:
+                return True, f"unsupported:{unsup_ext}"
             return True, "folder/non-downloadable"
         return skip_results.get(file.get("id", ""), (False, None))
 
@@ -541,7 +544,7 @@ async def test_delta_sync_deletions_call_remove_document(monkeypatch):
     mock_task_logger = MagicMock()
     mock_task_logger.log_task_progress = AsyncMock()
 
-    indexed, skipped, cursor = await _index_with_delta_sync(
+    indexed, skipped, unsupported, cursor = await _index_with_delta_sync(
         mock_client,
         AsyncMock(),
         _CONNECTOR_ID,
@@ -578,7 +581,7 @@ async def test_delta_sync_upserts_filtered_and_downloaded(monkeypatch):
     mock_task_logger = MagicMock()
     mock_task_logger.log_task_progress = AsyncMock()
 
-    indexed, skipped, cursor = await _index_with_delta_sync(
+    indexed, skipped, unsupported, cursor = await _index_with_delta_sync(
         mock_client,
         AsyncMock(),
         _CONNECTOR_ID,
@@ -628,7 +631,7 @@ async def test_delta_sync_mix_deletions_and_upserts(monkeypatch):
     mock_task_logger = MagicMock()
     mock_task_logger.log_task_progress = AsyncMock()
 
-    indexed, skipped, cursor = await _index_with_delta_sync(
+    indexed, skipped, unsupported, cursor = await _index_with_delta_sync(
         mock_client,
         AsyncMock(),
         _CONNECTOR_ID,
@@ -662,7 +665,7 @@ async def test_delta_sync_returns_new_cursor(monkeypatch):
     mock_task_logger = MagicMock()
     mock_task_logger.log_task_progress = AsyncMock()
 
-    indexed, skipped, cursor = await _index_with_delta_sync(
+    indexed, skipped, unsupported, cursor = await _index_with_delta_sync(
         mock_client,
         AsyncMock(),
         _CONNECTOR_ID,
diff --git a/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py b/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py
index 20bd3f3d6..b830e9773 100644
--- a/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py
+++ b/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py
@@ -497,7 +497,7 @@ async def test_delta_sync_removals_serial_rest_parallel(monkeypatch):
     mock_task_logger = MagicMock()
     mock_task_logger.log_task_progress = AsyncMock()
 
-    indexed, skipped = await _index_with_delta_sync(
+    indexed, skipped, unsupported = await _index_with_delta_sync(
         MagicMock(),
         mock_session,
         MagicMock(),
diff --git a/surfsense_backend/tests/unit/connector_indexers/test_page_limits.py b/surfsense_backend/tests/unit/connector_indexers/test_page_limits.py
index b31a9557f..e2996ce9d 100644
--- a/surfsense_backend/tests/unit/connector_indexers/test_page_limits.py
+++ b/surfsense_backend/tests/unit/connector_indexers/test_page_limits.py
@@ -384,7 +384,7 @@ async def test_gdrive_full_scan_skips_over_quota(gdrive_full_scan_mocks, monkeyp
     m["download_mock"].return_value = ([], 0)
     m["batch_mock"].return_value = ([], 2, 0)
 
-    _indexed, skipped = await _run_gdrive_full_scan(m)
+    _indexed, skipped, _unsup = await _run_gdrive_full_scan(m)
 
     call_files = m["download_mock"].call_args[0][1]
     assert len(call_files) == 2
@@ -459,7 +459,7 @@ async def test_gdrive_delta_sync_skips_over_quota(monkeypatch):
     mock_task_logger = MagicMock()
     mock_task_logger.log_task_progress = AsyncMock()
 
-    _indexed, skipped = await _mod._index_with_delta_sync(
+    _indexed, skipped, _unsupported = await _mod._index_with_delta_sync(
         MagicMock(),
         session,
         MagicMock(),
diff --git a/surfsense_backend/tests/unit/connectors/test_dropbox_file_types.py b/surfsense_backend/tests/unit/connectors/test_dropbox_file_types.py
index e092872c5..74277d47c 100644
--- a/surfsense_backend/tests/unit/connectors/test_dropbox_file_types.py
+++ b/surfsense_backend/tests/unit/connectors/test_dropbox_file_types.py
@@ -14,17 +14,23 @@ pytestmark = pytest.mark.unit
 
 def test_folder_item_is_skipped():
     item = {".tag": "folder", "name": "My Folder"}
-    assert should_skip_file(item) is True
+    skip, ext = should_skip_file(item)
+    assert skip is True
+    assert ext is None
 
 
 def test_paper_file_is_not_skipped():
     item = {".tag": "file", "name": "notes.paper", "is_downloadable": False}
-    assert should_skip_file(item) is False
+    skip, ext = should_skip_file(item)
+    assert skip is False
+    assert ext is None
 
 
 def test_non_downloadable_item_is_skipped():
     item = {".tag": "file", "name": "locked.gdoc", "is_downloadable": False}
-    assert should_skip_file(item) is True
+    skip, ext = should_skip_file(item)
+    assert skip is True
+    assert ext is None
 
 
 # ---------------------------------------------------------------------------
@@ -49,7 +55,9 @@ def test_non_downloadable_item_is_skipped():
 def test_non_parseable_extensions_are_skipped(filename, mocker):
     mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
     item = {".tag": "file", "name": filename}
-    assert should_skip_file(item) is True, f"{filename} should be skipped"
+    skip, ext = should_skip_file(item)
+    assert skip is True, f"{filename} should be skipped"
+    assert ext is not None
 
 
 @pytest.mark.parametrize(
@@ -65,9 +73,9 @@ def test_parseable_documents_are_not_skipped(filename, mocker):
     for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
         mocker.patch("app.config.config.ETL_SERVICE", service)
         item = {".tag": "file", "name": filename}
-        assert should_skip_file(item) is False, (
-            f"{filename} should NOT be skipped with {service}"
-        )
+        skip, ext = should_skip_file(item)
+        assert skip is False, f"{filename} should NOT be skipped with {service}"
+        assert ext is None
 
 
 @pytest.mark.parametrize(
@@ -79,9 +87,9 @@ def test_universal_images_are_not_skipped(filename, mocker):
     for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
         mocker.patch("app.config.config.ETL_SERVICE", service)
         item = {".tag": "file", "name": filename}
-        assert should_skip_file(item) is False, (
-            f"{filename} should NOT be skipped with {service}"
-        )
+        skip, ext = should_skip_file(item)
+        assert skip is False, f"{filename} should NOT be skipped with {service}"
+        assert ext is None
 
 
 @pytest.mark.parametrize("filename,service,expected_skip", [
@@ -111,6 +119,20 @@ def test_universal_images_are_not_skipped(filename, mocker):
 def test_parser_specific_extensions(filename, service, expected_skip, mocker):
     mocker.patch("app.config.config.ETL_SERVICE", service)
     item = {".tag": "file", "name": filename}
-    assert should_skip_file(item) is expected_skip, (
+    skip, ext = should_skip_file(item)
+    assert skip is expected_skip, (
         f"{filename} with {service}: expected skip={expected_skip}"
     )
+    if expected_skip:
+        assert ext is not None
+    else:
+        assert ext is None
+
+
+def test_returns_unsupported_extension(mocker):
+    """Skipping "old.doc" under DOCLING returns ".doc" as the unsupported extension."""
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+    item = {".tag": "file", "name": "old.doc"}
+    skip, ext = should_skip_file(item)
+    assert skip is True
+    assert ext == ".doc"
diff --git a/surfsense_backend/tests/unit/connectors/test_google_drive_file_types.py b/surfsense_backend/tests/unit/connectors/test_google_drive_file_types.py
index 4ed7eb4db..5cd43736b 100644
--- a/surfsense_backend/tests/unit/connectors/test_google_drive_file_types.py
+++ b/surfsense_backend/tests/unit/connectors/test_google_drive_file_types.py
@@ -14,7 +14,8 @@ def test_unsupported_extensions_are_skipped_regardless_of_service(filename, mock
     """Truly unsupported files are skipped no matter which ETL service is configured."""
     for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
         mocker.patch("app.config.config.ETL_SERVICE", service)
-        assert should_skip_by_extension(filename) is True
+        skip, ext = should_skip_by_extension(filename)
+        assert skip is True
 
 
 @pytest.mark.parametrize("filename", [
@@ -25,9 +26,9 @@ def test_universal_extensions_are_not_skipped(filename, mocker):
     """Files supported by all parsers (or handled by plaintext/direct_convert) are never skipped."""
     for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
         mocker.patch("app.config.config.ETL_SERVICE", service)
-        assert should_skip_by_extension(filename) is False, (
-            f"{filename} should NOT be skipped with {service}"
-        )
+        skip, ext = should_skip_by_extension(filename)
+        assert skip is False, f"{filename} should NOT be skipped with {service}"
+        assert ext is None
 
 
 @pytest.mark.parametrize("filename,service,expected_skip", [
@@ -42,6 +43,19 @@ def test_universal_extensions_are_not_skipped(filename, mocker):
 ])
 def test_parser_specific_extensions(filename, service, expected_skip, mocker):
     mocker.patch("app.config.config.ETL_SERVICE", service)
-    assert should_skip_by_extension(filename) is expected_skip, (
+    skip, ext = should_skip_by_extension(filename)
+    assert skip is expected_skip, (
         f"{filename} with {service}: expected skip={expected_skip}"
     )
+    if expected_skip:
+        assert ext is not None, "unsupported extension should be returned"
+    else:
+        assert ext is None
+
+
+def test_returns_unsupported_extension(mocker):
+    """Skipping "macro.docm" under DOCLING returns ".docm" as the unsupported extension."""
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+    skip, ext = should_skip_by_extension("macro.docm")
+    assert skip is True
+    assert ext == ".docm"
diff --git a/surfsense_backend/tests/unit/connectors/test_onedrive_file_types.py b/surfsense_backend/tests/unit/connectors/test_onedrive_file_types.py
index e73f799e2..61212b340 100644
--- a/surfsense_backend/tests/unit/connectors/test_onedrive_file_types.py
+++ b/surfsense_backend/tests/unit/connectors/test_onedrive_file_types.py
@@ -14,22 +14,30 @@ pytestmark = pytest.mark.unit
 
 def test_folder_is_skipped():
     item = {"folder": {}, "name": "My Folder"}
-    assert should_skip_file(item) is True
+    skip, ext = should_skip_file(item)
+    assert skip is True
+    assert ext is None
 
 
 def test_remote_item_is_skipped():
     item = {"remoteItem": {}, "name": "shared.docx"}
-    assert should_skip_file(item) is True
+    skip, ext = should_skip_file(item)
+    assert skip is True
+    assert ext is None
 
 
 def test_package_is_skipped():
     item = {"package": {}, "name": "notebook"}
-    assert should_skip_file(item) is True
+    skip, ext = should_skip_file(item)
+    assert skip is True
+    assert ext is None
 
 
 def test_onenote_is_skipped():
     item = {"name": "notes", "file": {"mimeType": "application/msonenote"}}
-    assert should_skip_file(item) is True
+    skip, ext = should_skip_file(item)
+    assert skip is True
+    assert ext is None
 
 
 # ---------------------------------------------------------------------------
@@ -43,7 +51,9 @@ def test_onenote_is_skipped():
 def test_unsupported_extensions_are_skipped(filename, mocker):
     mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
     item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
-    assert should_skip_file(item) is True, f"{filename} should be skipped"
+    skip, ext = should_skip_file(item)
+    assert skip is True, f"{filename} should be skipped"
+    assert ext is not None
 
 
 @pytest.mark.parametrize("filename", [
@@ -54,9 +64,9 @@ def test_universal_files_are_not_skipped(filename, mocker):
     for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
         mocker.patch("app.config.config.ETL_SERVICE", service)
         item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
-        assert should_skip_file(item) is False, (
-            f"{filename} should NOT be skipped with {service}"
-        )
+        skip, ext = should_skip_file(item)
+        assert skip is False, f"{filename} should NOT be skipped with {service}"
+        assert ext is None
 
 
 @pytest.mark.parametrize("filename,service,expected_skip", [
@@ -70,6 +80,20 @@ def test_universal_files_are_not_skipped(filename, mocker):
 def test_parser_specific_extensions(filename, service, expected_skip, mocker):
     mocker.patch("app.config.config.ETL_SERVICE", service)
     item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
-    assert should_skip_file(item) is expected_skip, (
+    skip, ext = should_skip_file(item)
+    assert skip is expected_skip, (
         f"{filename} with {service}: expected skip={expected_skip}"
     )
+    if expected_skip:
+        assert ext is not None
+    else:
+        assert ext is None
+
+
+def test_returns_unsupported_extension(mocker):
+    """Skipping "mail.eml" under DOCLING returns ".eml" as the unsupported extension."""
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+    item = {"name": "mail.eml", "file": {"mimeType": "application/octet-stream"}}
+    skip, ext = should_skip_file(item)
+    assert skip is True
+    assert ext == ".eml"

From 122be76133fbd5bfdc504464dff5f2cab69e6e5e Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Tue, 7 Apr 2026 03:16:46 +0530
Subject: [PATCH 29/37] refactor: update _index_selected_files method
 signatures in Dropbox, Google Drive, and OneDrive indexers to include
 unsupported file count, enhancing error reporting and consistency across
 connectors

---
 .../app/tasks/connector_indexers/dropbox_indexer.py  |  2 +-
 .../tasks/connector_indexers/google_drive_indexer.py |  4 ++--
 .../app/tasks/connector_indexers/onedrive_indexer.py |  2 +-
 .../connector_indexers/test_google_drive_parallel.py |  8 ++++----
 .../unit/connector_indexers/test_page_limits.py      | 12 ++++++------
 5 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py b/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py
index 9e7fe1cfb..8d2a45e03 100644
--- a/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py
@@ -501,7 +501,7 @@ async def _index_selected_files(
     enable_summary: bool,
     incremental_sync: bool = True,
     on_heartbeat: HeartbeatCallbackType | None = None,
-) -> tuple[int, int, list[str]]:
+) -> tuple[int, int, int, list[str]]:
     """Index user-selected files using the parallel pipeline."""
     page_limit_service = PageLimitService(session)
     pages_used, pages_limit = await page_limit_service.get_page_usage(user_id)
diff --git a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
index b2afbb9c9..0ded89102 100644
--- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
@@ -474,13 +474,13 @@ async def _index_selected_files(
     user_id: str,
     enable_summary: bool,
     on_heartbeat: HeartbeatCallbackType | None = None,
-) -> tuple[int, int, list[str]]:
+) -> tuple[int, int, int, list[str]]:
     """Index user-selected files using the parallel pipeline.
 
     Phase 1 (serial): fetch metadata + skip checks.
     Phase 2+3 (parallel): download, ETL, index via _download_and_index.
 
-    Returns (indexed_count, skipped_count, errors).
+    Returns (indexed_count, skipped_count, unsupported_count, errors).
     """
     page_limit_service = PageLimitService(session)
     pages_used, pages_limit = await page_limit_service.get_page_usage(user_id)
diff --git a/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py
index db42773fe..b26442490 100644
--- a/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py
@@ -293,7 +293,7 @@ async def _index_selected_files(
     user_id: str,
     enable_summary: bool,
     on_heartbeat: HeartbeatCallbackType | None = None,
-) -> tuple[int, int, list[str]]:
+) -> tuple[int, int, int, list[str]]:
     """Index user-selected files using the parallel pipeline."""
     page_limit_service = PageLimitService(session)
     pages_used, pages_limit = await page_limit_service.get_page_usage(user_id)
diff --git a/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py b/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py
index b830e9773..7fa92ce12 100644
--- a/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py
+++ b/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py
@@ -366,7 +366,7 @@ async def test_full_scan_three_phase_counts(full_scan_mocks, monkeypatch):
     full_scan_mocks["download_mock"].return_value = (mock_docs, 0)
     full_scan_mocks["batch_mock"].return_value = ([], 2, 0)
 
-    indexed, skipped = await _run_full_scan(full_scan_mocks)
+    indexed, skipped, unsupported = await _run_full_scan(full_scan_mocks)
 
     assert indexed == 3  # 1 renamed + 2 from batch
     assert skipped == 1  # 1 unchanged
@@ -589,7 +589,7 @@ async def test_selected_files_single_file_indexed(selected_files_mocks):
     )
     selected_files_mocks["download_and_index_mock"].return_value = (1, 0)
 
-    indexed, skipped, errors = await _run_selected(
+    indexed, skipped, unsup, errors = await _run_selected(
         selected_files_mocks,
         [("f1", "report.pdf")],
     )
@@ -613,7 +613,7 @@ async def test_selected_files_fetch_failure_isolation(selected_files_mocks):
     )
     selected_files_mocks["download_and_index_mock"].return_value = (2, 0)
 
-    indexed, skipped, errors = await _run_selected(
+    indexed, skipped, unsup, errors = await _run_selected(
         selected_files_mocks,
         [("f1", "first.txt"), ("f2", "mid.txt"), ("f3", "third.txt")],
     )
@@ -647,7 +647,7 @@ async def test_selected_files_skip_rename_counting(selected_files_mocks):
 
     selected_files_mocks["download_and_index_mock"].return_value = (2, 0)
 
-    indexed, skipped, errors = await _run_selected(
+    indexed, skipped, unsup, errors = await _run_selected(
         selected_files_mocks,
         [
             ("s1", "unchanged.txt"),
diff --git a/surfsense_backend/tests/unit/connector_indexers/test_page_limits.py b/surfsense_backend/tests/unit/connector_indexers/test_page_limits.py
index e2996ce9d..58737b20b 100644
--- a/surfsense_backend/tests/unit/connector_indexers/test_page_limits.py
+++ b/surfsense_backend/tests/unit/connector_indexers/test_page_limits.py
@@ -198,7 +198,7 @@ async def test_gdrive_files_within_quota_are_downloaded(gdrive_selected_mocks):
         )
     m["download_and_index_mock"].return_value = (3, 0)
 
-    indexed, _skipped, errors = await _run_gdrive_selected(
+    indexed, _skipped, _unsup, errors = await _run_gdrive_selected(
         m, [("f1", "f1.xyz"), ("f2", "f2.xyz"), ("f3", "f3.xyz")]
     )
 
@@ -219,7 +219,7 @@ async def test_gdrive_files_exceeding_quota_rejected(gdrive_selected_mocks):
         None,
     )
 
-    indexed, _skipped, errors = await _run_gdrive_selected(m, [("big", "huge.pdf")])
+    indexed, _skipped, _unsup, errors = await _run_gdrive_selected(m, [("big", "huge.pdf")])
 
     assert indexed == 0
     assert len(errors) == 1
@@ -239,7 +239,7 @@ async def test_gdrive_quota_mix_partial_indexing(gdrive_selected_mocks):
         )
     m["download_and_index_mock"].return_value = (2, 0)
 
-    indexed, _skipped, errors = await _run_gdrive_selected(
+    indexed, _skipped, _unsup, errors = await _run_gdrive_selected(
         m, [("f1", "f1.xyz"), ("f2", "f2.xyz"), ("f3", "f3.xyz")]
     )
 
@@ -299,7 +299,7 @@ async def test_gdrive_zero_quota_rejects_all(gdrive_selected_mocks):
             None,
         )
 
-    indexed, _skipped, errors = await _run_gdrive_selected(
+    indexed, _skipped, _unsup, errors = await _run_gdrive_selected(
         m, [("f1", "f1.xyz"), ("f2", "f2.xyz")]
     )
 
@@ -552,7 +552,7 @@ async def test_onedrive_over_quota_rejected(onedrive_selected_mocks):
         None,
     )
 
-    indexed, _skipped, errors = await _run_onedrive_selected(m, [("big", "huge.pdf")])
+    indexed, _skipped, _unsup, errors = await _run_onedrive_selected(m, [("big", "huge.pdf")])
 
     assert indexed == 0
     assert len(errors) == 1
@@ -652,7 +652,7 @@ async def test_dropbox_over_quota_rejected(dropbox_selected_mocks):
         None,
     )
 
-    indexed, _skipped, errors = await _run_dropbox_selected(
+    indexed, _skipped, _unsup, errors = await _run_dropbox_selected(
         m, [("/huge.pdf", "huge.pdf")]
     )
 

From a624c86b04ff83336ae5d37cac3cf1095d63dcea Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Tue, 7 Apr 2026 05:11:15 +0530
Subject: [PATCH 30/37] refactor: update file skipping logic in Dropbox, Google
 Drive, and OneDrive connectors to return unsupported extension information

---
 .../app/connectors/dropbox/content_extractor.py             | 3 ++-
 .../app/connectors/google_drive/content_extractor.py        | 6 ++++--
 .../app/connectors/onedrive/content_extractor.py            | 3 ++-
 .../google_unification/test_drive_indexer_credentials.py    | 2 +-
 4 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/surfsense_backend/app/connectors/dropbox/content_extractor.py b/surfsense_backend/app/connectors/dropbox/content_extractor.py
index 8e947eee7..8cbc3e417 100644
--- a/surfsense_backend/app/connectors/dropbox/content_extractor.py
+++ b/surfsense_backend/app/connectors/dropbox/content_extractor.py
@@ -53,7 +53,8 @@ async def download_and_extract_content(
     file_name = file.get("name", "Unknown")
     file_id = file.get("id", "")
 
-    if should_skip_file(file):
+    skip, _unsup_ext = should_skip_file(file)
+    if skip:
         return None, {}, "Skipping non-indexable item"
 
     logger.info(f"Downloading file for content extraction: {file_name}")
diff --git a/surfsense_backend/app/connectors/google_drive/content_extractor.py b/surfsense_backend/app/connectors/google_drive/content_extractor.py
index 10f008594..9f49d491d 100644
--- a/surfsense_backend/app/connectors/google_drive/content_extractor.py
+++ b/surfsense_backend/app/connectors/google_drive/content_extractor.py
@@ -43,7 +43,8 @@ async def download_and_extract_content(
     if should_skip_file(mime_type):
         return None, {}, f"Skipping {mime_type}"
 
-    if should_skip_by_extension(file_name):
+    ext_skip, _unsup_ext = should_skip_by_extension(file_name)
+    if ext_skip:
         return None, {}, f"Skipping unsupported extension: {file_name}"
 
     logger.info(f"Downloading file for content extraction: {file_name} ({mime_type})")
@@ -155,7 +156,8 @@ async def download_and_process_file(
     if should_skip_file(mime_type):
         return None, f"Skipping {mime_type}", None
 
-    if should_skip_by_extension(file_name):
+    ext_skip, _unsup_ext = should_skip_by_extension(file_name)
+    if ext_skip:
         return None, f"Skipping unsupported extension: {file_name}", None
 
     logger.info(f"Downloading file: {file_name} ({mime_type})")
diff --git a/surfsense_backend/app/connectors/onedrive/content_extractor.py b/surfsense_backend/app/connectors/onedrive/content_extractor.py
index 2355993eb..2238b8603 100644
--- a/surfsense_backend/app/connectors/onedrive/content_extractor.py
+++ b/surfsense_backend/app/connectors/onedrive/content_extractor.py
@@ -24,7 +24,8 @@ async def download_and_extract_content(
     item_id = file.get("id")
     file_name = file.get("name", "Unknown")
 
-    if should_skip_file(file):
+    skip, _unsup_ext = should_skip_file(file)
+    if skip:
         return None, {}, "Skipping non-indexable item"
 
     file_info = file.get("file", {})
diff --git a/surfsense_backend/tests/integration/google_unification/test_drive_indexer_credentials.py b/surfsense_backend/tests/integration/google_unification/test_drive_indexer_credentials.py
index 5bb0b6137..e669fa143 100644
--- a/surfsense_backend/tests/integration/google_unification/test_drive_indexer_credentials.py
+++ b/surfsense_backend/tests/integration/google_unification/test_drive_indexer_credentials.py
@@ -124,7 +124,7 @@ async def test_composio_connector_without_account_id_returns_error(
 
     maker = make_session_factory(async_engine)
     async with maker() as session:
-        count, _skipped, error = await index_google_drive_files(
+        count, _skipped, error, _unsupported = await index_google_drive_files(
             session=session,
             connector_id=data["connector_id"],
             search_space_id=data["search_space_id"],

From aba5f6a124bebb75039db1482c4727aabb24360a Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Tue, 7 Apr 2026 05:19:23 +0530
Subject: [PATCH 31/37] refactor: improve file handling logic in Dropbox and
 OneDrive connectors to include unsupported file extension information

---
 .../app/connectors/dropbox/folder_manager.py     |  6 ++++--
 .../app/connectors/onedrive/folder_manager.py    |  6 ++++--
 .../connector_indexers/test_dropbox_parallel.py  | 16 ++++++++--------
 3 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/surfsense_backend/app/connectors/dropbox/folder_manager.py b/surfsense_backend/app/connectors/dropbox/folder_manager.py
index 5453c8785..f9aa78873 100644
--- a/surfsense_backend/app/connectors/dropbox/folder_manager.py
+++ b/surfsense_backend/app/connectors/dropbox/folder_manager.py
@@ -64,8 +64,10 @@ async def get_files_in_folder(
                         )
                         continue
                     files.extend(sub_files)
-            elif not should_skip_file(item):
-                files.append(item)
+            else:
+                skip, _unsup_ext = should_skip_file(item)
+                if not skip:
+                    files.append(item)
 
         return files, None
 
diff --git a/surfsense_backend/app/connectors/onedrive/folder_manager.py b/surfsense_backend/app/connectors/onedrive/folder_manager.py
index 6fa725ca1..a5d7fa713 100644
--- a/surfsense_backend/app/connectors/onedrive/folder_manager.py
+++ b/surfsense_backend/app/connectors/onedrive/folder_manager.py
@@ -71,8 +71,10 @@ async def get_files_in_folder(
                         )
                         continue
                     files.extend(sub_files)
-            elif not should_skip_file(item):
-                files.append(item)
+            else:
+                skip, _unsup_ext = should_skip_file(item)
+                if not skip:
+                    files.append(item)
 
         return files, None
 
diff --git a/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py b/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py
index 14c16fce4..adac90085 100644
--- a/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py
+++ b/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py
@@ -345,7 +345,7 @@ async def test_full_scan_three_phase_counts(full_scan_mocks, monkeypatch):
 
     full_scan_mocks["download_and_index_mock"].return_value = (2, 0)
 
-    indexed, skipped = await _run_full_scan(
+    indexed, skipped, _unsupported = await _run_full_scan(
         full_scan_mocks, monkeypatch, page_files
     )
 
@@ -444,7 +444,7 @@ async def test_selected_files_single_file_indexed(selected_files_mocks):
     )
     selected_files_mocks["download_and_index_mock"].return_value = (1, 0)
 
-    indexed, skipped, errors = await _run_selected(
+    indexed, skipped, _unsupported, errors = await _run_selected(
         selected_files_mocks,
         [("/report.pdf", "report.pdf")],
     )
@@ -466,7 +466,7 @@ async def test_selected_files_fetch_failure_isolation(selected_files_mocks):
     )
     selected_files_mocks["download_and_index_mock"].return_value = (2, 0)
 
-    indexed, skipped, errors = await _run_selected(
+    indexed, skipped, _unsupported, errors = await _run_selected(
         selected_files_mocks,
         [("/first.txt", "first.txt"), ("/mid.txt", "mid.txt"), ("/third.txt", "third.txt")],
     )
@@ -496,7 +496,7 @@ async def test_selected_files_skip_rename_counting(selected_files_mocks):
     )
     selected_files_mocks["download_and_index_mock"].return_value = (2, 0)
 
-    indexed, skipped, errors = await _run_selected(
+    indexed, skipped, _unsupported, errors = await _run_selected(
         selected_files_mocks,
         [
             ("/unchanged.txt", "unchanged.txt"),
@@ -715,10 +715,10 @@ def orchestrator_mocks(monkeypatch):
 
     monkeypatch.setattr(_mod, "update_connector_last_indexed", AsyncMock())
 
-    full_scan_mock = AsyncMock(return_value=(5, 2))
+    full_scan_mock = AsyncMock(return_value=(5, 2, 0))
     monkeypatch.setattr(_mod, "_index_full_scan", full_scan_mock)
 
-    delta_sync_mock = AsyncMock(return_value=(3, 1, "delta-cursor-new"))
+    delta_sync_mock = AsyncMock(return_value=(3, 1, 0, "delta-cursor-new"))
     monkeypatch.setattr(_mod, "_index_with_delta_sync", delta_sync_mock)
 
     mock_client = MagicMock()
@@ -751,7 +751,7 @@ async def test_orchestrator_uses_delta_sync_when_cursor_and_last_indexed(
     mock_session = AsyncMock()
     mock_session.commit = AsyncMock()
 
-    indexed, skipped, error = await index_dropbox_files(
+    indexed, skipped, error, _unsupported = await index_dropbox_files(
         mock_session,
         _CONNECTOR_ID,
         _SEARCH_SPACE_ID,
@@ -779,7 +779,7 @@ async def test_orchestrator_falls_back_to_full_scan_without_cursor(
     mock_session = AsyncMock()
     mock_session.commit = AsyncMock()
 
-    indexed, skipped, error = await index_dropbox_files(
+    indexed, skipped, error, _unsupported = await index_dropbox_files(
         mock_session,
         _CONNECTOR_ID,
         _SEARCH_SPACE_ID,

From e4462292e456d38abca0c533b925b4d8a9bee24e Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Tue, 7 Apr 2026 05:30:10 +0530
Subject: [PATCH 32/37] refactor: update Google Drive indexer to return an
 additional unsupported file count, enhancing error reporting consistency

---
 .../tasks/connector_indexers/google_drive_indexer.py   |  2 +-
 .../connector-configs/views/connector-edit-view.tsx    | 10 ++--------
 2 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
index 0ded89102..a33859af5 100644
--- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
@@ -758,7 +758,7 @@ async def _index_with_delta_sync(
 
     if not changes:
         logger.info("No changes detected since last sync")
-        return 0, 0
+        return 0, 0, 0
 
     logger.info(f"Processing {len(changes)} changes")
 
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx
index 1e71b0a25..06fc2b497 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx
@@ -417,14 +417,8 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
 						disabled={isSaving || isDisconnecting}
 						className="text-xs sm:text-sm flex-1 sm:flex-initial h-12 sm:h-auto py-3 sm:py-2"
 					>
-						{isSaving ? (
-							<>
-								<Spinner size="sm" className="mr-2" />
-								Saving
-							</>
-						) : (
-							"Save Changes"
-						)}
+						{isSaving && <Spinner size="sm" className="mr-2" />}
+						Save Changes
 					</Button>
 				)}
 			</div>

From 1b87719a927f5ecac96b254bdcf68d8cc2af38d1 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Tue, 7 Apr 2026 05:36:29 +0530
Subject: [PATCH 33/37] refactor: enhance file skipping logic in Google Drive
 connector to check for Google Workspace files before unsupported extensions

---
 .../connectors/google_drive/content_extractor.py   | 14 ++++++++------
 .../connector_indexers/google_drive_indexer.py     |  8 +++++---
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/surfsense_backend/app/connectors/google_drive/content_extractor.py b/surfsense_backend/app/connectors/google_drive/content_extractor.py
index 9f49d491d..3e0bb39e5 100644
--- a/surfsense_backend/app/connectors/google_drive/content_extractor.py
+++ b/surfsense_backend/app/connectors/google_drive/content_extractor.py
@@ -43,9 +43,10 @@ async def download_and_extract_content(
     if should_skip_file(mime_type):
         return None, {}, f"Skipping {mime_type}"
 
-    ext_skip, _unsup_ext = should_skip_by_extension(file_name)
-    if ext_skip:
-        return None, {}, f"Skipping unsupported extension: {file_name}"
+    if not is_google_workspace_file(mime_type):
+        ext_skip, _unsup_ext = should_skip_by_extension(file_name)
+        if ext_skip:
+            return None, {}, f"Skipping unsupported extension: {file_name}"
 
     logger.info(f"Downloading file for content extraction: {file_name} ({mime_type})")
 
@@ -156,9 +157,10 @@ async def download_and_process_file(
     if should_skip_file(mime_type):
         return None, f"Skipping {mime_type}", None
 
-    ext_skip, _unsup_ext = should_skip_by_extension(file_name)
-    if ext_skip:
-        return None, f"Skipping unsupported extension: {file_name}", None
+    if not is_google_workspace_file(mime_type):
+        ext_skip, _unsup_ext = should_skip_by_extension(file_name)
+        if ext_skip:
+            return None, f"Skipping unsupported extension: {file_name}", None
 
     logger.info(f"Downloading file: {file_name} ({mime_type})")
 
diff --git a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
index a33859af5..9916e70a0 100644
--- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
@@ -26,6 +26,7 @@ from app.connectors.google_drive import (
     get_start_page_token,
 )
 from app.connectors.google_drive.file_types import (
+    is_google_workspace_file,
     should_skip_by_extension,
     should_skip_file as skip_mime,
 )
@@ -81,9 +82,10 @@ async def _should_skip_file(
 
     if skip_mime(mime_type):
         return True, "folder/shortcut"
-    ext_skip, unsup_ext = should_skip_by_extension(file_name)
-    if ext_skip:
-        return True, f"unsupported:{unsup_ext}"
+    if not is_google_workspace_file(mime_type):
+        ext_skip, unsup_ext = should_skip_by_extension(file_name)
+        if ext_skip:
+            return True, f"unsupported:{unsup_ext}"
     if not file_id:
         return True, "missing file_id"
 

From 7f32dd068f0b4d9c2f3bad651d71e7420132495d Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Tue, 7 Apr 2026 05:40:40 +0530
Subject: [PATCH 34/37] refactor: update button rendering logic in connector
 views to improve loading state handling

---
 .../views/connector-connect-view.tsx           | 18 +++++++-----------
 .../views/connector-edit-view.tsx              | 18 ++++++------------
 .../views/connector-accounts-list-view.tsx     |  2 +-
 .../views/youtube-crawler-view.tsx             | 14 +++++---------
 4 files changed, 19 insertions(+), 33 deletions(-)

diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-connect-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-connect-view.tsx
index 596b98e93..8a0ef5ae1 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-connect-view.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-connect-view.tsx
@@ -144,18 +144,14 @@ export const ConnectorConnectView: FC<ConnectorConnectViewProps> = ({
 					type="button"
 					onClick={handleFormSubmit}
 					disabled={isSubmitting}
-					className="text-xs sm:text-sm min-w-[140px] disabled:opacity-50 disabled:cursor-not-allowed disabled:pointer-events-none"
+					className="relative text-xs sm:text-sm min-w-[140px] disabled:opacity-50 disabled:cursor-not-allowed disabled:pointer-events-none"
 				>
-					{isSubmitting ? (
-						<>
-							<Spinner size="sm" className="mr-2" />
-							Connecting
-						</>
-					) : connectorType === "MCP_CONNECTOR" ? (
-						"Connect"
-					) : (
-						`Connect ${getConnectorTypeDisplay(connectorType)}`
-					)}
+					<span className={isSubmitting ? "opacity-0" : ""}>
+						{connectorType === "MCP_CONNECTOR"
+							? "Connect"
+							: `Connect ${getConnectorTypeDisplay(connectorType)}`}
+					</span>
+					{isSubmitting && <Spinner size="sm" className="absolute" />}
 				</Button>
 			</div>
 		</div>
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx
index 06fc2b497..7308e1e26 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx
@@ -369,16 +369,10 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
 								size="sm"
 								onClick={handleDisconnectConfirm}
 								disabled={isDisconnecting}
-								className="text-xs sm:text-sm flex-1 sm:flex-initial h-10 sm:h-auto py-2 sm:py-2"
+								className="relative text-xs sm:text-sm flex-1 sm:flex-initial h-10 sm:h-auto py-2 sm:py-2"
 							>
-								{isDisconnecting ? (
-									<>
-										<Spinner size="sm" className="mr-2" />
-										Disconnecting
-									</>
-								) : (
-									"Confirm Disconnect"
-								)}
+								<span className={isDisconnecting ? "opacity-0" : ""}>Confirm Disconnect</span>
+								{isDisconnecting && <Spinner size="sm" className="absolute" />}
 							</Button>
 							<Button
 								variant="ghost"
@@ -415,10 +409,10 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
 					<Button
 						onClick={onSave}
 						disabled={isSaving || isDisconnecting}
-						className="text-xs sm:text-sm flex-1 sm:flex-initial h-12 sm:h-auto py-3 sm:py-2"
+						className="relative text-xs sm:text-sm flex-1 sm:flex-initial h-12 sm:h-auto py-3 sm:py-2"
 					>
-						{isSaving && <Spinner size="sm" className="mr-2" />}
-						Save Changes
+						<span className={isSaving ? "opacity-0" : ""}>Save Changes</span>
+						{isSaving && <Spinner size="sm" className="absolute" />}
 					</Button>
 				)}
 			</div>
diff --git a/surfsense_web/components/assistant-ui/connector-popup/views/connector-accounts-list-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/views/connector-accounts-list-view.tsx
index 5dfc252c2..5e2b8452b 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/views/connector-accounts-list-view.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/views/connector-accounts-list-view.tsx
@@ -174,7 +174,7 @@ export const ConnectorAccountsListView: FC<ConnectorAccountsListViewProps> = ({
 							)}
 						</div>
 						<span className="text-xs sm:text-sm font-medium">
-							{isConnecting ? "Connecting" : buttonText}
+							{buttonText}
 						</span>
 					</button>
 				</div>
diff --git a/surfsense_web/components/assistant-ui/connector-popup/views/youtube-crawler-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/views/youtube-crawler-view.tsx
index 7ec85f4d3..c16072bca 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/views/youtube-crawler-view.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/views/youtube-crawler-view.tsx
@@ -335,16 +335,12 @@ export const YouTubeCrawlerView: FC<YouTubeCrawlerViewProps> = ({ searchSpaceId,
 				<Button
 					onClick={handleSubmit}
 					disabled={isSubmitting || isFetchingPlaylist || videoTags.length === 0}
-					className="text-xs sm:text-sm min-w-[140px] disabled:opacity-50 disabled:cursor-not-allowed disabled:pointer-events-none"
+					className="relative text-xs sm:text-sm min-w-[140px] disabled:opacity-50 disabled:cursor-not-allowed disabled:pointer-events-none"
 				>
-					{isSubmitting ? (
-						<>
-							<Spinner size="sm" className="mr-2" />
-							{t("processing")}
-						</>
-					) : (
-						t("submit")
-					)}
+					<span className={isSubmitting ? "opacity-0" : ""}>
+						{t("submit")}
+					</span>
+					{isSubmitting && <Spinner size="sm" className="absolute" />}
 				</Button>
 			</div>
 		</div>

From 5803fe79dac41715be5b6a961cd978d0998a9069 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Tue, 7 Apr 2026 05:43:34 +0530
Subject: [PATCH 35/37] refactor: update filename handling in Google Drive
 connector to include Google Workspace file extensions, improving content
 extraction accuracy

---
 .../app/connectors/google_drive/content_extractor.py        | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/surfsense_backend/app/connectors/google_drive/content_extractor.py b/surfsense_backend/app/connectors/google_drive/content_extractor.py
index 3e0bb39e5..025c3831a 100644
--- a/surfsense_backend/app/connectors/google_drive/content_extractor.py
+++ b/surfsense_backend/app/connectors/google_drive/content_extractor.py
@@ -100,7 +100,8 @@ async def download_and_extract_content(
             if error:
                 return None, drive_metadata, error
 
-        markdown = await _parse_file_to_markdown(temp_file_path, file_name)
+        etl_filename = file_name + extension if is_google_workspace_file(mime_type) else file_name
+        markdown = await _parse_file_to_markdown(temp_file_path, etl_filename)
         return markdown, drive_metadata, None
 
     except Exception as e:
@@ -232,10 +233,11 @@ async def download_and_process_file(
                 "."
             )[-1]
 
+        etl_filename = file_name + extension if is_google_workspace_file(mime_type) else file_name
         logger.info(f"Processing {file_name} with Surfsense's file processor")
         await process_file_in_background(
             file_path=temp_file_path,
-            filename=file_name,
+            filename=etl_filename,
             search_space_id=search_space_id,
             user_id=user_id,
             session=session,

From 0a26a6c5bb900d5e660b09a88665242c50c31f1f Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Tue, 7 Apr 2026 05:55:39 +0530
Subject: [PATCH 36/37] chore: ran linting

---
 .../app/connectors/dropbox/client.py          |  14 +-
 .../google_drive/content_extractor.py         |   8 +-
 .../app/etl_pipeline/etl_pipeline_service.py  |   9 +-
 .../app/etl_pipeline/file_classifier.py       | 105 ++++++++++---
 .../app/etl_pipeline/parsers/llamacloud.py    |  12 +-
 .../app/routes/autocomplete_routes.py         |   9 +-
 .../routes/search_source_connectors_routes.py |  14 +-
 .../services/vision_autocomplete_service.py   |  65 +++++---
 .../connector_indexers/dropbox_indexer.py     |  17 +--
 .../google_drive_indexer.py                   |  13 +-
 .../local_folder_indexer.py                   |   3 -
 .../connector_indexers/onedrive_indexer.py    |   6 +-
 .../document_processors/file_processors.py    |  12 +-
 .../app/utils/file_extensions.py              |  97 +++++++++---
 .../test_content_extraction.py                |  20 +--
 .../test_dropbox_parallel.py                  |  59 +++++--
 .../test_google_drive_parallel.py             |  10 +-
 .../connector_indexers/test_page_limits.py    |   8 +-
 .../unit/connectors/test_dropbox_client.py    |  10 +-
 .../connectors/test_dropbox_file_types.py     | 105 ++++++++-----
 .../test_google_drive_file_types.py           |  55 ++++---
 .../connectors/test_onedrive_file_types.py    |  49 ++++--
 .../tests/unit/etl_pipeline/conftest.py       |   4 +-
 .../etl_pipeline/test_etl_pipeline_service.py | 113 ++++++++------
 .../services/test_docling_image_support.py    |  41 ++---
 .../tests/unit/utils/test_file_extensions.py  |  84 +++++++---
 .../components/DesktopContent.tsx             |   8 +-
 .../app/desktop/permissions/page.tsx          |  15 +-
 .../app/desktop/suggestion/layout.tsx         |   6 +-
 surfsense_web/app/desktop/suggestion/page.tsx |  50 +++---
 .../app/desktop/suggestion/suggestion.css     | 144 +++++++++---------
 .../views/connector-accounts-list-view.tsx    |   4 +-
 .../views/youtube-crawler-view.tsx            |   4 +-
 .../assistant-ui/document-upload-popup.tsx    |   4 +-
 .../components/assistant-ui/image.tsx         | 134 ++++++++--------
 .../components/assistant-ui/thread-list.tsx   |   4 +-
 .../components/assistant-ui/tool-fallback.tsx |   3 +-
 .../comment-composer/comment-composer.tsx     |  24 ++-
 .../components/documents/DocumentNode.tsx     |  20 +--
 .../components/documents/FolderTreeView.tsx   |  15 +-
 .../components/editor/plate-editor.tsx        |  19 +--
 .../components/homepage/use-cases-grid.tsx    |  16 +-
 .../layout/providers/LayoutDataProvider.tsx   |   3 +-
 .../ui/sidebar/AllPrivateChatsSidebar.tsx     |  36 ++---
 .../ui/sidebar/AllSharedChatsSidebar.tsx      |  36 ++---
 surfsense_web/components/markdown-viewer.tsx  |  51 ++++---
 .../settings/user-settings-dialog.tsx         |   2 +-
 .../components/sources/DocumentUploadTab.tsx  |  49 +++---
 .../tool-ui/citation/citation-list.tsx        |  31 ++--
 .../components/tool-ui/citation/citation.tsx  |  26 ++--
 surfsense_web/components/ui/animated-tabs.tsx |   5 +-
 surfsense_web/components/ui/toggle-group.tsx  |  10 +-
 surfsense_web/contexts/LocaleContext.tsx      |  13 +-
 surfsense_web/types/window.d.ts               |  13 +-
 54 files changed, 1015 insertions(+), 672 deletions(-)

diff --git a/surfsense_backend/app/connectors/dropbox/client.py b/surfsense_backend/app/connectors/dropbox/client.py
index b177c2f8d..e89800191 100644
--- a/surfsense_backend/app/connectors/dropbox/client.py
+++ b/surfsense_backend/app/connectors/dropbox/client.py
@@ -225,9 +225,7 @@ class DropboxClient:
 
         return all_items, None
 
-    async def get_latest_cursor(
-        self, path: str = ""
-    ) -> tuple[str | None, str | None]:
+    async def get_latest_cursor(self, path: str = "") -> tuple[str | None, str | None]:
         """Get a cursor representing the current state of a folder.
 
         Uses /2/files/list_folder/get_latest_cursor so we can later call
@@ -251,9 +249,7 @@ class DropboxClient:
         """
         all_entries: list[dict[str, Any]] = []
 
-        resp = await self._request(
-            "/2/files/list_folder/continue", {"cursor": cursor}
-        )
+        resp = await self._request("/2/files/list_folder/continue", {"cursor": cursor})
         if resp.status_code == 401:
             return [], None, "Dropbox authentication expired (401)"
         if resp.status_code != 200:
@@ -268,7 +264,11 @@ class DropboxClient:
                 "/2/files/list_folder/continue", {"cursor": cursor}
             )
             if resp.status_code != 200:
-                return all_entries, data.get("cursor"), f"Pagination failed: {resp.status_code}"
+                return (
+                    all_entries,
+                    data.get("cursor"),
+                    f"Pagination failed: {resp.status_code}",
+                )
             data = resp.json()
             all_entries.extend(data.get("entries", []))
 
diff --git a/surfsense_backend/app/connectors/google_drive/content_extractor.py b/surfsense_backend/app/connectors/google_drive/content_extractor.py
index 025c3831a..83ff32e82 100644
--- a/surfsense_backend/app/connectors/google_drive/content_extractor.py
+++ b/surfsense_backend/app/connectors/google_drive/content_extractor.py
@@ -100,7 +100,9 @@ async def download_and_extract_content(
             if error:
                 return None, drive_metadata, error
 
-        etl_filename = file_name + extension if is_google_workspace_file(mime_type) else file_name
+        etl_filename = (
+            file_name + extension if is_google_workspace_file(mime_type) else file_name
+        )
         markdown = await _parse_file_to_markdown(temp_file_path, etl_filename)
         return markdown, drive_metadata, None
 
@@ -233,7 +235,9 @@ async def download_and_process_file(
                 "."
             )[-1]
 
-        etl_filename = file_name + extension if is_google_workspace_file(mime_type) else file_name
+        etl_filename = (
+            file_name + extension if is_google_workspace_file(mime_type) else file_name
+        )
         logger.info(f"Processing {file_name} with Surfsense's file processor")
         await process_file_in_background(
             file_path=temp_file_path,
diff --git a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py
index a0041c843..6e7ab3c4c 100644
--- a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py
+++ b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py
@@ -1,6 +1,9 @@
 from app.config import config as app_config
 from app.etl_pipeline.etl_document import EtlRequest, EtlResult
-from app.etl_pipeline.exceptions import EtlServiceUnavailableError, EtlUnsupportedFileError
+from app.etl_pipeline.exceptions import (
+    EtlServiceUnavailableError,
+    EtlUnsupportedFileError,
+)
 from app.etl_pipeline.file_classifier import FileCategory, classify_file
 from app.etl_pipeline.parsers.audio import transcribe_audio
 from app.etl_pipeline.parsers.direct_convert import convert_file_directly
@@ -78,9 +81,7 @@ class EtlPipelineService:
                 request.file_path, request.estimated_pages
             )
         else:
-            raise EtlServiceUnavailableError(
-                f"Unknown ETL_SERVICE: {etl_service}"
-            )
+            raise EtlServiceUnavailableError(f"Unknown ETL_SERVICE: {etl_service}")
 
         return EtlResult(
             markdown_content=content,
diff --git a/surfsense_backend/app/etl_pipeline/file_classifier.py b/surfsense_backend/app/etl_pipeline/file_classifier.py
index bc7b4537c..4e690bcdc 100644
--- a/surfsense_backend/app/etl_pipeline/file_classifier.py
+++ b/surfsense_backend/app/etl_pipeline/file_classifier.py
@@ -1,27 +1,96 @@
 from enum import Enum
 from pathlib import PurePosixPath
 
-from app.utils.file_extensions import DOCUMENT_EXTENSIONS, get_document_extensions_for_service
+from app.utils.file_extensions import (
+    DOCUMENT_EXTENSIONS,
+    get_document_extensions_for_service,
+)
 
 PLAINTEXT_EXTENSIONS = frozenset(
     {
-        ".md", ".markdown", ".txt", ".text",
-        ".json", ".jsonl", ".yaml", ".yml", ".toml", ".ini", ".cfg", ".conf", ".xml",
-        ".css", ".scss", ".less", ".sass",
-        ".py", ".pyw", ".pyi", ".pyx",
-        ".js", ".jsx", ".ts", ".tsx", ".mjs", ".cjs",
-        ".java", ".kt", ".kts", ".scala", ".groovy",
-        ".c", ".h", ".cpp", ".cxx", ".cc", ".hpp", ".hxx",
-        ".cs", ".fs", ".fsx",
-        ".go", ".rs", ".rb", ".php", ".pl", ".pm", ".lua", ".swift",
-        ".m", ".mm", ".r", ".jl",
-        ".sh", ".bash", ".zsh", ".fish", ".bat", ".cmd", ".ps1",
-        ".sql", ".graphql", ".gql",
-        ".env", ".gitignore", ".dockerignore", ".editorconfig",
-        ".makefile", ".cmake",
-        ".log", ".rst", ".tex", ".bib", ".org", ".adoc", ".asciidoc",
-        ".vue", ".svelte", ".astro",
-        ".tf", ".hcl", ".proto",
+        ".md",
+        ".markdown",
+        ".txt",
+        ".text",
+        ".json",
+        ".jsonl",
+        ".yaml",
+        ".yml",
+        ".toml",
+        ".ini",
+        ".cfg",
+        ".conf",
+        ".xml",
+        ".css",
+        ".scss",
+        ".less",
+        ".sass",
+        ".py",
+        ".pyw",
+        ".pyi",
+        ".pyx",
+        ".js",
+        ".jsx",
+        ".ts",
+        ".tsx",
+        ".mjs",
+        ".cjs",
+        ".java",
+        ".kt",
+        ".kts",
+        ".scala",
+        ".groovy",
+        ".c",
+        ".h",
+        ".cpp",
+        ".cxx",
+        ".cc",
+        ".hpp",
+        ".hxx",
+        ".cs",
+        ".fs",
+        ".fsx",
+        ".go",
+        ".rs",
+        ".rb",
+        ".php",
+        ".pl",
+        ".pm",
+        ".lua",
+        ".swift",
+        ".m",
+        ".mm",
+        ".r",
+        ".jl",
+        ".sh",
+        ".bash",
+        ".zsh",
+        ".fish",
+        ".bat",
+        ".cmd",
+        ".ps1",
+        ".sql",
+        ".graphql",
+        ".gql",
+        ".env",
+        ".gitignore",
+        ".dockerignore",
+        ".editorconfig",
+        ".makefile",
+        ".cmake",
+        ".log",
+        ".rst",
+        ".tex",
+        ".bib",
+        ".org",
+        ".adoc",
+        ".asciidoc",
+        ".vue",
+        ".svelte",
+        ".astro",
+        ".tf",
+        ".hcl",
+        ".proto",
     }
 )
 
diff --git a/surfsense_backend/app/etl_pipeline/parsers/llamacloud.py b/surfsense_backend/app/etl_pipeline/parsers/llamacloud.py
index 5115aebea..ae2a34234 100644
--- a/surfsense_backend/app/etl_pipeline/parsers/llamacloud.py
+++ b/surfsense_backend/app/etl_pipeline/parsers/llamacloud.py
@@ -66,16 +66,12 @@ async def parse_with_llamacloud(file_path: str, estimated_pages: int) -> str:
                     )
 
                 if hasattr(result, "get_markdown_documents"):
-                    markdown_docs = result.get_markdown_documents(
-                        split_by_page=False
-                    )
+                    markdown_docs = result.get_markdown_documents(split_by_page=False)
                     if markdown_docs and hasattr(markdown_docs[0], "text"):
                         return markdown_docs[0].text
                     if hasattr(result, "pages") and result.pages:
                         return "\n\n".join(
-                            p.md
-                            for p in result.pages
-                            if hasattr(p, "md") and p.md
+                            p.md for p in result.pages if hasattr(p, "md") and p.md
                         )
                     return str(result)
 
@@ -83,9 +79,7 @@ async def parse_with_llamacloud(file_path: str, estimated_pages: int) -> str:
                     if result and hasattr(result[0], "text"):
                         return result[0].text
                     return "\n\n".join(
-                        doc.page_content
-                        if hasattr(doc, "page_content")
-                        else str(doc)
+                        doc.page_content if hasattr(doc, "page_content") else str(doc)
                         for doc in result
                     )
 
diff --git a/surfsense_backend/app/routes/autocomplete_routes.py b/surfsense_backend/app/routes/autocomplete_routes.py
index bb56709cb..a11b7dbc1 100644
--- a/surfsense_backend/app/routes/autocomplete_routes.py
+++ b/surfsense_backend/app/routes/autocomplete_routes.py
@@ -1,4 +1,4 @@
-from fastapi import APIRouter, Depends, HTTPException
+from fastapi import APIRouter, Depends
 from fastapi.responses import StreamingResponse
 from pydantic import BaseModel, Field
 from sqlalchemy.ext.asyncio import AsyncSession
@@ -31,8 +31,11 @@ async def vision_autocomplete_stream(
 
     return StreamingResponse(
         stream_vision_autocomplete(
-            body.screenshot, body.search_space_id, session,
-            app_name=body.app_name, window_title=body.window_title,
+            body.screenshot,
+            body.search_space_id,
+            session,
+            app_name=body.app_name,
+            window_title=body.window_title,
         ),
         media_type="text/event-stream",
         headers={
diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py
index a30eb7297..bb20da65d 100644
--- a/surfsense_backend/app/routes/search_source_connectors_routes.py
+++ b/surfsense_backend/app/routes/search_source_connectors_routes.py
@@ -2647,7 +2647,12 @@ async def run_onedrive_indexing(
                 stage="fetching",
             )
 
-        total_indexed, total_skipped, error_message, total_unsupported = await index_onedrive_files(
+        (
+            total_indexed,
+            total_skipped,
+            error_message,
+            total_unsupported,
+        ) = await index_onedrive_files(
             session,
             connector_id,
             search_space_id,
@@ -2756,7 +2761,12 @@ async def run_dropbox_indexing(
                 stage="fetching",
             )
 
-        total_indexed, total_skipped, error_message, total_unsupported = await index_dropbox_files(
+        (
+            total_indexed,
+            total_skipped,
+            error_message,
+            total_unsupported,
+        ) = await index_dropbox_files(
             session,
             connector_id,
             search_space_id,
diff --git a/surfsense_backend/app/services/vision_autocomplete_service.py b/surfsense_backend/app/services/vision_autocomplete_service.py
index f24a5c848..7e9408be7 100644
--- a/surfsense_backend/app/services/vision_autocomplete_service.py
+++ b/surfsense_backend/app/services/vision_autocomplete_service.py
@@ -1,5 +1,5 @@
 import logging
-from typing import AsyncGenerator
+from collections.abc import AsyncGenerator
 
 from langchain_core.messages import HumanMessage, SystemMessage
 from sqlalchemy.ext.asyncio import AsyncSession
@@ -68,8 +68,10 @@ def _is_vision_unsupported_error(e: Exception) -> bool:
 
 
 async def _extract_query_from_screenshot(
-    llm, screenshot_data_url: str,
-    app_name: str = "", window_title: str = "",
+    llm,
+    screenshot_data_url: str,
+    app_name: str = "",
+    window_title: str = "",
 ) -> str | None:
     """Ask the Vision LLM to describe what the user is working on.
 
@@ -78,18 +80,26 @@ async def _extract_query_from_screenshot(
     """
     if app_name:
         prompt_text = EXTRACT_QUERY_PROMPT_WITH_APP.format(
-            app_name=app_name, window_title=window_title,
+            app_name=app_name,
+            window_title=window_title,
         )
     else:
         prompt_text = EXTRACT_QUERY_PROMPT
 
     try:
-        response = await llm.ainvoke([
-            HumanMessage(content=[
-                {"type": "text", "text": prompt_text},
-                {"type": "image_url", "image_url": {"url": screenshot_data_url}},
-            ]),
-        ])
+        response = await llm.ainvoke(
+            [
+                HumanMessage(
+                    content=[
+                        {"type": "text", "text": prompt_text},
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": screenshot_data_url},
+                        },
+                    ]
+                ),
+            ]
+        )
         query = response.content.strip() if hasattr(response, "content") else ""
         return query if query else None
     except Exception as e:
@@ -167,10 +177,15 @@ async def stream_vision_autocomplete(
     kb_context = ""
     try:
         query = await _extract_query_from_screenshot(
-            llm, screenshot_data_url, app_name=app_name, window_title=window_title,
+            llm,
+            screenshot_data_url,
+            app_name=app_name,
+            window_title=window_title,
         )
     except Exception as e:
-        logger.warning(f"Vision autocomplete: selected model does not support vision: {e}")
+        logger.warning(
+            f"Vision autocomplete: selected model does not support vision: {e}"
+        )
         yield streaming.format_message_start()
         yield streaming.format_error(vision_error_msg)
         yield streaming.format_done()
@@ -183,16 +198,18 @@ async def stream_vision_autocomplete(
 
     messages = [
         SystemMessage(content=system_prompt),
-        HumanMessage(content=[
-            {
-                "type": "text",
-                "text": "Analyze this screenshot. Understand the full context of what the user is working on, then generate the text they most likely want to write in the active text area.",
-            },
-            {
-                "type": "image_url",
-                "image_url": {"url": screenshot_data_url},
-            },
-        ]),
+        HumanMessage(
+            content=[
+                {
+                    "type": "text",
+                    "text": "Analyze this screenshot. Understand the full context of what the user is working on, then generate the text they most likely want to write in the active text area.",
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {"url": screenshot_data_url},
+                },
+            ]
+        ),
     ]
 
     text_started = False
@@ -217,7 +234,9 @@ async def stream_vision_autocomplete(
             yield streaming.format_text_end(text_id)
 
         if _is_vision_unsupported_error(e):
-            logger.warning(f"Vision autocomplete: selected model does not support vision: {e}")
+            logger.warning(
+                f"Vision autocomplete: selected model does not support vision: {e}"
+            )
             yield streaming.format_error(vision_error_msg)
         else:
             logger.error(f"Vision autocomplete streaming error: {e}", exc_info=True)
diff --git a/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py b/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py
index 8d2a45e03..4a49944c2 100644
--- a/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py
@@ -254,9 +254,7 @@ async def _download_and_index(
     return batch_indexed, download_failed + batch_failed
 
 
-async def _remove_document(
-    session: AsyncSession, file_id: str, search_space_id: int
-):
+async def _remove_document(session: AsyncSession, file_id: str, search_space_id: int):
     """Remove a document that was deleted in Dropbox."""
     primary_hash = compute_identifier_hash(
         DocumentType.DROPBOX_FILE.value, file_id, search_space_id
@@ -268,8 +266,7 @@ async def _remove_document(
             select(Document).where(
                 Document.search_space_id == search_space_id,
                 Document.document_type == DocumentType.DROPBOX_FILE,
-                cast(Document.document_metadata["dropbox_file_id"], String)
-                == file_id,
+                cast(Document.document_metadata["dropbox_file_id"], String) == file_id,
             )
         )
         existing = result.scalar_one_or_none()
@@ -671,9 +668,7 @@ async def index_dropbox_files(
 
             saved_cursor = folder_cursors.get(folder_path)
             can_use_delta = (
-                use_delta_sync
-                and saved_cursor
-                and connector.last_indexed_at
+                use_delta_sync and saved_cursor and connector.last_indexed_at
             )
 
             if can_use_delta:
@@ -739,7 +734,11 @@ async def index_dropbox_files(
         await task_logger.log_task_success(
             log_entry,
             f"Successfully completed Dropbox indexing for connector {connector_id}",
-            {"files_processed": total_indexed, "files_skipped": total_skipped, "files_unsupported": total_unsupported},
+            {
+                "files_processed": total_indexed,
+                "files_skipped": total_skipped,
+                "files_unsupported": total_unsupported,
+            },
         )
         logger.info(
             f"Dropbox indexing completed: {total_indexed} indexed, "
diff --git a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
index 9916e70a0..b11087fe6 100644
--- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
@@ -1010,7 +1010,11 @@ async def index_google_drive_files(
             documents_unsupported += ru
         else:
             logger.info(f"Using full scan for connector {connector_id}")
-            documents_indexed, documents_skipped, documents_unsupported = await _index_full_scan(
+            (
+                documents_indexed,
+                documents_skipped,
+                documents_unsupported,
+            ) = await _index_full_scan(
                 drive_client,
                 session,
                 connector,
@@ -1301,7 +1305,12 @@ async def index_google_drive_selected_files(
                 log_entry,
                 f"Batch file indexing completed with {len(errors)} error(s)",
                 "; ".join(errors),
-                {"indexed": indexed, "skipped": skipped, "unsupported": unsupported, "error_count": len(errors)},
+                {
+                    "indexed": indexed,
+                    "skipped": skipped,
+                    "unsupported": unsupported,
+                    "error_count": len(errors),
+                },
             )
         else:
             await task_logger.log_task_success(
diff --git a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
index f4366fb78..7f42f4638 100644
--- a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
@@ -23,7 +23,6 @@ from sqlalchemy import select
 from sqlalchemy.exc import IntegrityError, SQLAlchemyError
 from sqlalchemy.ext.asyncio import AsyncSession
 
-from app.config import config
 from app.db import (
     Document,
     DocumentStatus,
@@ -153,8 +152,6 @@ def scan_folder(
     return files
 
 
-
-
 async def _read_file_content(file_path: str, filename: str) -> str:
     """Read file content via the unified ETL pipeline.
 
diff --git a/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py
index b26442490..06517f542 100644
--- a/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py
@@ -762,7 +762,11 @@ async def index_onedrive_files(
         await task_logger.log_task_success(
             log_entry,
             f"Successfully completed OneDrive indexing for connector {connector_id}",
-            {"files_processed": total_indexed, "files_skipped": total_skipped, "files_unsupported": total_unsupported},
+            {
+                "files_processed": total_indexed,
+                "files_skipped": total_skipped,
+                "files_unsupported": total_unsupported,
+            },
         )
         logger.info(
             f"OneDrive indexing completed: {total_indexed} indexed, "
diff --git a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py
index a9a6b62be..c765dbd87 100644
--- a/surfsense_backend/app/tasks/document_processors/file_processors.py
+++ b/surfsense_backend/app/tasks/document_processors/file_processors.py
@@ -292,8 +292,10 @@ async def process_file_in_background(
     )
 
     try:
-        from app.etl_pipeline.file_classifier import FileCategory as EtlFileCategory
-        from app.etl_pipeline.file_classifier import classify_file as etl_classify
+        from app.etl_pipeline.file_classifier import (
+            FileCategory as EtlFileCategory,
+            classify_file as etl_classify,
+        )
 
         category = etl_classify(filename)
 
@@ -345,8 +347,10 @@ async def _extract_file_content(
     """
     from app.etl_pipeline.etl_document import EtlRequest
     from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
-    from app.etl_pipeline.file_classifier import FileCategory
-    from app.etl_pipeline.file_classifier import classify_file as etl_classify
+    from app.etl_pipeline.file_classifier import (
+        FileCategory,
+        classify_file as etl_classify,
+    )
 
     category = etl_classify(filename)
     estimated_pages = 0
diff --git a/surfsense_backend/app/utils/file_extensions.py b/surfsense_backend/app/utils/file_extensions.py
index 5eed36872..8d432ce56 100644
--- a/surfsense_backend/app/utils/file_extensions.py
+++ b/surfsense_backend/app/utils/file_extensions.py
@@ -15,30 +15,83 @@ from pathlib import PurePosixPath
 # Per-parser document extension sets (from official documentation)
 # ---------------------------------------------------------------------------
 
-DOCLING_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset({
-    ".pdf",
-    ".docx", ".xlsx", ".pptx",
-    ".png", ".jpg", ".jpeg", ".tiff", ".tif", ".bmp", ".webp",
-})
+DOCLING_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset(
+    {
+        ".pdf",
+        ".docx",
+        ".xlsx",
+        ".pptx",
+        ".png",
+        ".jpg",
+        ".jpeg",
+        ".tiff",
+        ".tif",
+        ".bmp",
+        ".webp",
+    }
+)
 
-LLAMAPARSE_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset({
-    ".pdf",
-    ".docx", ".doc", ".xlsx", ".xls", ".pptx", ".ppt",
-    ".docm", ".dot", ".dotm", ".pptm", ".pot", ".potx",
-    ".xlsm", ".xlsb", ".xlw",
-    ".rtf", ".epub",
-    ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".tif", ".webp", ".svg",
-    ".odt", ".ods", ".odp",
-    ".hwp", ".hwpx",
-})
+LLAMAPARSE_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset(
+    {
+        ".pdf",
+        ".docx",
+        ".doc",
+        ".xlsx",
+        ".xls",
+        ".pptx",
+        ".ppt",
+        ".docm",
+        ".dot",
+        ".dotm",
+        ".pptm",
+        ".pot",
+        ".potx",
+        ".xlsm",
+        ".xlsb",
+        ".xlw",
+        ".rtf",
+        ".epub",
+        ".png",
+        ".jpg",
+        ".jpeg",
+        ".gif",
+        ".bmp",
+        ".tiff",
+        ".tif",
+        ".webp",
+        ".svg",
+        ".odt",
+        ".ods",
+        ".odp",
+        ".hwp",
+        ".hwpx",
+    }
+)
 
-UNSTRUCTURED_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset({
-    ".pdf",
-    ".docx", ".doc", ".xlsx", ".xls", ".pptx", ".ppt",
-    ".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".tif", ".heic",
-    ".rtf", ".epub", ".odt",
-    ".eml", ".msg", ".p7s",
-})
+UNSTRUCTURED_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset(
+    {
+        ".pdf",
+        ".docx",
+        ".doc",
+        ".xlsx",
+        ".xls",
+        ".pptx",
+        ".ppt",
+        ".png",
+        ".jpg",
+        ".jpeg",
+        ".bmp",
+        ".tiff",
+        ".tif",
+        ".heic",
+        ".rtf",
+        ".epub",
+        ".odt",
+        ".eml",
+        ".msg",
+        ".p7s",
+    }
+)
 
 # ---------------------------------------------------------------------------
 # Union (used by classify_file for routing) + service lookup
diff --git a/surfsense_backend/tests/unit/connector_indexers/test_content_extraction.py b/surfsense_backend/tests/unit/connector_indexers/test_content_extraction.py
index 49f9a217a..cd112e09f 100644
--- a/surfsense_backend/tests/unit/connector_indexers/test_content_extraction.py
+++ b/surfsense_backend/tests/unit/connector_indexers/test_content_extraction.py
@@ -6,7 +6,6 @@ real so we know the full path from "cloud gives us bytes" to "we get markdown
 back" actually works.
 """
 
-import os
 from unittest.mock import AsyncMock, MagicMock
 
 import pytest
@@ -21,6 +20,7 @@ _CSV_CONTENT = "name,age\nAlice,30\nBob,25\n"
 # Helpers
 # ---------------------------------------------------------------------------
 
+
 async def _write_file(dest_path: str, content: str) -> None:
     """Simulate a cloud client writing downloaded bytes to disk."""
     with open(dest_path, "w", encoding="utf-8") as f:
@@ -43,8 +43,8 @@ def _make_download_side_effect(content: str):
 # Google Drive
 # ===================================================================
 
-class TestGoogleDriveContentExtraction:
 
+class TestGoogleDriveContentExtraction:
     async def test_txt_file_returns_markdown(self):
         from app.connectors.google_drive.content_extractor import (
             download_and_extract_content,
@@ -76,7 +76,7 @@ class TestGoogleDriveContentExtraction:
 
         file = {"id": "f2", "name": "data.csv", "mimeType": "text/csv"}
 
-        markdown, metadata, error = await download_and_extract_content(client, file)
+        markdown, _metadata, error = await download_and_extract_content(client, file)
 
         assert error is None
         assert "Alice" in markdown
@@ -93,7 +93,7 @@ class TestGoogleDriveContentExtraction:
 
         file = {"id": "f3", "name": "doc.txt", "mimeType": "text/plain"}
 
-        markdown, metadata, error = await download_and_extract_content(client, file)
+        markdown, _metadata, error = await download_and_extract_content(client, file)
 
         assert markdown is None
         assert error == "Network timeout"
@@ -103,8 +103,8 @@ class TestGoogleDriveContentExtraction:
 # OneDrive
 # ===================================================================
 
-class TestOneDriveContentExtraction:
 
+class TestOneDriveContentExtraction:
     async def test_txt_file_returns_markdown(self):
         from app.connectors.onedrive.content_extractor import (
             download_and_extract_content,
@@ -144,7 +144,7 @@ class TestOneDriveContentExtraction:
             "file": {"mimeType": "text/csv"},
         }
 
-        markdown, metadata, error = await download_and_extract_content(client, file)
+        markdown, _metadata, error = await download_and_extract_content(client, file)
 
         assert error is None
         assert "Alice" in markdown
@@ -164,7 +164,7 @@ class TestOneDriveContentExtraction:
             "file": {"mimeType": "text/plain"},
         }
 
-        markdown, metadata, error = await download_and_extract_content(client, file)
+        markdown, _metadata, error = await download_and_extract_content(client, file)
 
         assert markdown is None
         assert error == "403 Forbidden"
@@ -174,8 +174,8 @@ class TestOneDriveContentExtraction:
 # Dropbox
 # ===================================================================
 
-class TestDropboxContentExtraction:
 
+class TestDropboxContentExtraction:
     async def test_txt_file_returns_markdown(self):
         from app.connectors.dropbox.content_extractor import (
             download_and_extract_content,
@@ -217,7 +217,7 @@ class TestDropboxContentExtraction:
             "path_lower": "/data.csv",
         }
 
-        markdown, metadata, error = await download_and_extract_content(client, file)
+        markdown, _metadata, error = await download_and_extract_content(client, file)
 
         assert error is None
         assert "Alice" in markdown
@@ -238,7 +238,7 @@ class TestDropboxContentExtraction:
             "path_lower": "/big.txt",
         }
 
-        markdown, metadata, error = await download_and_extract_content(client, file)
+        markdown, _metadata, error = await download_and_extract_content(client, file)
 
         assert markdown is None
         assert error == "Rate limited"
diff --git a/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py b/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py
index adac90085..f72135d05 100644
--- a/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py
+++ b/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py
@@ -265,6 +265,7 @@ def full_scan_mocks(mock_dropbox_client, monkeypatch):
 
     async def _fake_skip(session, file, search_space_id):
         from app.connectors.dropbox.file_types import should_skip_file as _skip
+
         item_skip, unsup_ext = _skip(file)
         if item_skip:
             if unsup_ext:
@@ -468,7 +469,11 @@ async def test_selected_files_fetch_failure_isolation(selected_files_mocks):
 
     indexed, skipped, _unsupported, errors = await _run_selected(
         selected_files_mocks,
-        [("/first.txt", "first.txt"), ("/mid.txt", "mid.txt"), ("/third.txt", "third.txt")],
+        [
+            ("/first.txt", "first.txt"),
+            ("/mid.txt", "mid.txt"),
+            ("/third.txt", "third.txt"),
+        ],
     )
 
     assert indexed == 2
@@ -526,8 +531,18 @@ async def test_delta_sync_deletions_call_remove_document(monkeypatch):
     import app.tasks.connector_indexers.dropbox_indexer as _mod
 
     entries = [
-        {".tag": "deleted", "name": "gone.txt", "path_lower": "/gone.txt", "id": "id:del1"},
-        {".tag": "deleted", "name": "also_gone.pdf", "path_lower": "/also_gone.pdf", "id": "id:del2"},
+        {
+            ".tag": "deleted",
+            "name": "gone.txt",
+            "path_lower": "/gone.txt",
+            "id": "id:del1",
+        },
+        {
+            ".tag": "deleted",
+            "name": "also_gone.pdf",
+            "path_lower": "/also_gone.pdf",
+            "id": "id:del2",
+        },
     ]
 
     mock_client = MagicMock()
@@ -544,7 +559,7 @@ async def test_delta_sync_deletions_call_remove_document(monkeypatch):
     mock_task_logger = MagicMock()
     mock_task_logger.log_task_progress = AsyncMock()
 
-    indexed, skipped, unsupported, cursor = await _index_with_delta_sync(
+    _indexed, _skipped, _unsupported, cursor = await _index_with_delta_sync(
         mock_client,
         AsyncMock(),
         _CONNECTOR_ID,
@@ -573,7 +588,9 @@ async def test_delta_sync_upserts_filtered_and_downloaded(monkeypatch):
     mock_client = MagicMock()
     mock_client.get_changes = AsyncMock(return_value=(entries, "cursor-v2", None))
 
-    monkeypatch.setattr(_mod, "_should_skip_file", AsyncMock(return_value=(False, None)))
+    monkeypatch.setattr(
+        _mod, "_should_skip_file", AsyncMock(return_value=(False, None))
+    )
 
     download_mock = AsyncMock(return_value=(2, 0))
     monkeypatch.setattr(_mod, "_download_and_index", download_mock)
@@ -581,7 +598,7 @@ async def test_delta_sync_upserts_filtered_and_downloaded(monkeypatch):
     mock_task_logger = MagicMock()
     mock_task_logger.log_task_progress = AsyncMock()
 
-    indexed, skipped, unsupported, cursor = await _index_with_delta_sync(
+    indexed, skipped, _unsupported, cursor = await _index_with_delta_sync(
         mock_client,
         AsyncMock(),
         _CONNECTOR_ID,
@@ -608,8 +625,18 @@ async def test_delta_sync_mix_deletions_and_upserts(monkeypatch):
     import app.tasks.connector_indexers.dropbox_indexer as _mod
 
     entries = [
-        {".tag": "deleted", "name": "removed.txt", "path_lower": "/removed.txt", "id": "id:del1"},
-        {".tag": "deleted", "name": "trashed.pdf", "path_lower": "/trashed.pdf", "id": "id:del2"},
+        {
+            ".tag": "deleted",
+            "name": "removed.txt",
+            "path_lower": "/removed.txt",
+            "id": "id:del1",
+        },
+        {
+            ".tag": "deleted",
+            "name": "trashed.pdf",
+            "path_lower": "/trashed.pdf",
+            "id": "id:del2",
+        },
         _make_file_dict("mod1", "updated.txt"),
         _make_file_dict("new1", "brandnew.docx"),
     ]
@@ -623,7 +650,9 @@ async def test_delta_sync_mix_deletions_and_upserts(monkeypatch):
         remove_calls.append(file_id)
 
     monkeypatch.setattr(_mod, "_remove_document", _fake_remove)
-    monkeypatch.setattr(_mod, "_should_skip_file", AsyncMock(return_value=(False, None)))
+    monkeypatch.setattr(
+        _mod, "_should_skip_file", AsyncMock(return_value=(False, None))
+    )
 
     download_mock = AsyncMock(return_value=(2, 0))
     monkeypatch.setattr(_mod, "_download_and_index", download_mock)
@@ -631,7 +660,7 @@ async def test_delta_sync_mix_deletions_and_upserts(monkeypatch):
     mock_task_logger = MagicMock()
     mock_task_logger.log_task_progress = AsyncMock()
 
-    indexed, skipped, unsupported, cursor = await _index_with_delta_sync(
+    indexed, skipped, _unsupported, cursor = await _index_with_delta_sync(
         mock_client,
         AsyncMock(),
         _CONNECTOR_ID,
@@ -665,7 +694,7 @@ async def test_delta_sync_returns_new_cursor(monkeypatch):
     mock_task_logger = MagicMock()
     mock_task_logger.log_task_progress = AsyncMock()
 
-    indexed, skipped, unsupported, cursor = await _index_with_delta_sync(
+    indexed, skipped, _unsupported, cursor = await _index_with_delta_sync(
         mock_client,
         AsyncMock(),
         _CONNECTOR_ID,
@@ -723,9 +752,7 @@ def orchestrator_mocks(monkeypatch):
 
     mock_client = MagicMock()
     mock_client.get_latest_cursor = AsyncMock(return_value=("latest-cursor-abc", None))
-    monkeypatch.setattr(
-        _mod, "DropboxClient", MagicMock(return_value=mock_client)
-    )
+    monkeypatch.setattr(_mod, "DropboxClient", MagicMock(return_value=mock_client))
 
     return {
         "connector": mock_connector,
@@ -751,7 +778,7 @@ async def test_orchestrator_uses_delta_sync_when_cursor_and_last_indexed(
     mock_session = AsyncMock()
     mock_session.commit = AsyncMock()
 
-    indexed, skipped, error, _unsupported = await index_dropbox_files(
+    _indexed, _skipped, error, _unsupported = await index_dropbox_files(
         mock_session,
         _CONNECTOR_ID,
         _SEARCH_SPACE_ID,
@@ -779,7 +806,7 @@ async def test_orchestrator_falls_back_to_full_scan_without_cursor(
     mock_session = AsyncMock()
     mock_session.commit = AsyncMock()
 
-    indexed, skipped, error, _unsupported = await index_dropbox_files(
+    _indexed, _skipped, error, _unsupported = await index_dropbox_files(
         mock_session,
         _CONNECTOR_ID,
         _SEARCH_SPACE_ID,
diff --git a/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py b/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py
index 7fa92ce12..0ae096361 100644
--- a/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py
+++ b/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py
@@ -366,7 +366,7 @@ async def test_full_scan_three_phase_counts(full_scan_mocks, monkeypatch):
     full_scan_mocks["download_mock"].return_value = (mock_docs, 0)
     full_scan_mocks["batch_mock"].return_value = ([], 2, 0)
 
-    indexed, skipped, unsupported = await _run_full_scan(full_scan_mocks)
+    indexed, skipped, _unsupported = await _run_full_scan(full_scan_mocks)
 
     assert indexed == 3  # 1 renamed + 2 from batch
     assert skipped == 1  # 1 unchanged
@@ -497,7 +497,7 @@ async def test_delta_sync_removals_serial_rest_parallel(monkeypatch):
     mock_task_logger = MagicMock()
     mock_task_logger.log_task_progress = AsyncMock()
 
-    indexed, skipped, unsupported = await _index_with_delta_sync(
+    indexed, skipped, _unsupported = await _index_with_delta_sync(
         MagicMock(),
         mock_session,
         MagicMock(),
@@ -589,7 +589,7 @@ async def test_selected_files_single_file_indexed(selected_files_mocks):
     )
     selected_files_mocks["download_and_index_mock"].return_value = (1, 0)
 
-    indexed, skipped, unsup, errors = await _run_selected(
+    indexed, skipped, _unsup, errors = await _run_selected(
         selected_files_mocks,
         [("f1", "report.pdf")],
     )
@@ -613,7 +613,7 @@ async def test_selected_files_fetch_failure_isolation(selected_files_mocks):
     )
     selected_files_mocks["download_and_index_mock"].return_value = (2, 0)
 
-    indexed, skipped, unsup, errors = await _run_selected(
+    indexed, skipped, _unsup, errors = await _run_selected(
         selected_files_mocks,
         [("f1", "first.txt"), ("f2", "mid.txt"), ("f3", "third.txt")],
     )
@@ -647,7 +647,7 @@ async def test_selected_files_skip_rename_counting(selected_files_mocks):
 
     selected_files_mocks["download_and_index_mock"].return_value = (2, 0)
 
-    indexed, skipped, unsup, errors = await _run_selected(
+    indexed, skipped, _unsup, errors = await _run_selected(
         selected_files_mocks,
         [
             ("s1", "unchanged.txt"),
diff --git a/surfsense_backend/tests/unit/connector_indexers/test_page_limits.py b/surfsense_backend/tests/unit/connector_indexers/test_page_limits.py
index 58737b20b..573ee43d8 100644
--- a/surfsense_backend/tests/unit/connector_indexers/test_page_limits.py
+++ b/surfsense_backend/tests/unit/connector_indexers/test_page_limits.py
@@ -219,7 +219,9 @@ async def test_gdrive_files_exceeding_quota_rejected(gdrive_selected_mocks):
         None,
     )
 
-    indexed, _skipped, _unsup, errors = await _run_gdrive_selected(m, [("big", "huge.pdf")])
+    indexed, _skipped, _unsup, errors = await _run_gdrive_selected(
+        m, [("big", "huge.pdf")]
+    )
 
     assert indexed == 0
     assert len(errors) == 1
@@ -552,7 +554,9 @@ async def test_onedrive_over_quota_rejected(onedrive_selected_mocks):
         None,
     )
 
-    indexed, _skipped, _unsup, errors = await _run_onedrive_selected(m, [("big", "huge.pdf")])
+    indexed, _skipped, _unsup, errors = await _run_onedrive_selected(
+        m, [("big", "huge.pdf")]
+    )
 
     assert indexed == 0
     assert len(errors) == 1
diff --git a/surfsense_backend/tests/unit/connectors/test_dropbox_client.py b/surfsense_backend/tests/unit/connectors/test_dropbox_client.py
index efacbcf72..31cafe550 100644
--- a/surfsense_backend/tests/unit/connectors/test_dropbox_client.py
+++ b/surfsense_backend/tests/unit/connectors/test_dropbox_client.py
@@ -19,6 +19,7 @@ def _make_client() -> DropboxClient:
 
 # ---------- C1: get_latest_cursor ----------
 
+
 async def test_get_latest_cursor_returns_cursor_string(monkeypatch):
     client = _make_client()
 
@@ -34,12 +35,17 @@ async def test_get_latest_cursor_returns_cursor_string(monkeypatch):
     assert error is None
     client._request.assert_called_once_with(
         "/2/files/list_folder/get_latest_cursor",
-        {"path": "/my-folder", "recursive": False, "include_non_downloadable_files": True},
+        {
+            "path": "/my-folder",
+            "recursive": False,
+            "include_non_downloadable_files": True,
+        },
     )
 
 
 # ---------- C2: get_changes returns entries and new cursor ----------
 
+
 async def test_get_changes_returns_entries_and_cursor(monkeypatch):
     client = _make_client()
 
@@ -66,6 +72,7 @@ async def test_get_changes_returns_entries_and_cursor(monkeypatch):
 
 # ---------- C3: get_changes handles pagination ----------
 
+
 async def test_get_changes_handles_pagination(monkeypatch):
     client = _make_client()
 
@@ -98,6 +105,7 @@ async def test_get_changes_handles_pagination(monkeypatch):
 
 # ---------- C4: get_changes raises on 401 ----------
 
+
 async def test_get_changes_returns_error_on_401(monkeypatch):
     client = _make_client()
 
diff --git a/surfsense_backend/tests/unit/connectors/test_dropbox_file_types.py b/surfsense_backend/tests/unit/connectors/test_dropbox_file_types.py
index 74277d47c..b4715e083 100644
--- a/surfsense_backend/tests/unit/connectors/test_dropbox_file_types.py
+++ b/surfsense_backend/tests/unit/connectors/test_dropbox_file_types.py
@@ -41,15 +41,40 @@ def test_non_downloadable_item_is_skipped():
 @pytest.mark.parametrize(
     "filename",
     [
-        "archive.zip", "backup.tar", "data.gz", "stuff.rar", "pack.7z",
-        "program.exe", "lib.dll", "module.so", "image.dmg", "disk.iso",
-        "movie.mov", "clip.avi", "video.mkv", "film.wmv", "stream.flv",
+        "archive.zip",
+        "backup.tar",
+        "data.gz",
+        "stuff.rar",
+        "pack.7z",
+        "program.exe",
+        "lib.dll",
+        "module.so",
+        "image.dmg",
+        "disk.iso",
+        "movie.mov",
+        "clip.avi",
+        "video.mkv",
+        "film.wmv",
+        "stream.flv",
         "favicon.ico",
-        "raw.cr2", "photo.nef", "image.arw", "pic.dng",
-        "design.psd", "vector.ai", "mockup.sketch", "proto.fig",
-        "font.ttf", "font.otf", "font.woff", "font.woff2",
-        "model.stl", "scene.fbx", "mesh.blend",
-        "local.db", "data.sqlite", "access.mdb",
+        "raw.cr2",
+        "photo.nef",
+        "image.arw",
+        "pic.dng",
+        "design.psd",
+        "vector.ai",
+        "mockup.sketch",
+        "proto.fig",
+        "font.ttf",
+        "font.otf",
+        "font.woff",
+        "font.woff2",
+        "model.stl",
+        "scene.fbx",
+        "mesh.blend",
+        "local.db",
+        "data.sqlite",
+        "access.mdb",
     ],
 )
 def test_non_parseable_extensions_are_skipped(filename, mocker):
@@ -63,9 +88,16 @@ def test_non_parseable_extensions_are_skipped(filename, mocker):
 @pytest.mark.parametrize(
     "filename",
     [
-        "report.pdf", "document.docx", "sheet.xlsx", "slides.pptx",
-        "readme.txt", "data.csv", "page.html", "notes.md",
-        "config.json", "feed.xml",
+        "report.pdf",
+        "document.docx",
+        "sheet.xlsx",
+        "slides.pptx",
+        "readme.txt",
+        "data.csv",
+        "page.html",
+        "notes.md",
+        "config.json",
+        "feed.xml",
     ],
 )
 def test_parseable_documents_are_not_skipped(filename, mocker):
@@ -92,30 +124,33 @@ def test_universal_images_are_not_skipped(filename, mocker):
         assert ext is None
 
 
-@pytest.mark.parametrize("filename,service,expected_skip", [
-    ("old.doc", "DOCLING", True),
-    ("old.doc", "LLAMACLOUD", False),
-    ("old.doc", "UNSTRUCTURED", False),
-    ("legacy.xls", "DOCLING", True),
-    ("legacy.xls", "LLAMACLOUD", False),
-    ("legacy.xls", "UNSTRUCTURED", False),
-    ("deck.ppt", "DOCLING", True),
-    ("deck.ppt", "LLAMACLOUD", False),
-    ("deck.ppt", "UNSTRUCTURED", False),
-    ("icon.svg", "DOCLING", True),
-    ("icon.svg", "LLAMACLOUD", False),
-    ("anim.gif", "DOCLING", True),
-    ("anim.gif", "LLAMACLOUD", False),
-    ("photo.webp", "DOCLING", False),
-    ("photo.webp", "LLAMACLOUD", False),
-    ("photo.webp", "UNSTRUCTURED", True),
-    ("live.heic", "DOCLING", True),
-    ("live.heic", "UNSTRUCTURED", False),
-    ("macro.docm", "DOCLING", True),
-    ("macro.docm", "LLAMACLOUD", False),
-    ("mail.eml", "DOCLING", True),
-    ("mail.eml", "UNSTRUCTURED", False),
-])
+@pytest.mark.parametrize(
+    "filename,service,expected_skip",
+    [
+        ("old.doc", "DOCLING", True),
+        ("old.doc", "LLAMACLOUD", False),
+        ("old.doc", "UNSTRUCTURED", False),
+        ("legacy.xls", "DOCLING", True),
+        ("legacy.xls", "LLAMACLOUD", False),
+        ("legacy.xls", "UNSTRUCTURED", False),
+        ("deck.ppt", "DOCLING", True),
+        ("deck.ppt", "LLAMACLOUD", False),
+        ("deck.ppt", "UNSTRUCTURED", False),
+        ("icon.svg", "DOCLING", True),
+        ("icon.svg", "LLAMACLOUD", False),
+        ("anim.gif", "DOCLING", True),
+        ("anim.gif", "LLAMACLOUD", False),
+        ("photo.webp", "DOCLING", False),
+        ("photo.webp", "LLAMACLOUD", False),
+        ("photo.webp", "UNSTRUCTURED", True),
+        ("live.heic", "DOCLING", True),
+        ("live.heic", "UNSTRUCTURED", False),
+        ("macro.docm", "DOCLING", True),
+        ("macro.docm", "LLAMACLOUD", False),
+        ("mail.eml", "DOCLING", True),
+        ("mail.eml", "UNSTRUCTURED", False),
+    ],
+)
 def test_parser_specific_extensions(filename, service, expected_skip, mocker):
     mocker.patch("app.config.config.ETL_SERVICE", service)
     item = {".tag": "file", "name": filename}
diff --git a/surfsense_backend/tests/unit/connectors/test_google_drive_file_types.py b/surfsense_backend/tests/unit/connectors/test_google_drive_file_types.py
index 5cd43736b..ab602468d 100644
--- a/surfsense_backend/tests/unit/connectors/test_google_drive_file_types.py
+++ b/surfsense_backend/tests/unit/connectors/test_google_drive_file_types.py
@@ -7,21 +7,37 @@ from app.connectors.google_drive.file_types import should_skip_by_extension
 pytestmark = pytest.mark.unit
 
 
-@pytest.mark.parametrize("filename", [
-    "malware.exe", "archive.zip", "video.mov", "font.woff2", "model.blend",
-])
+@pytest.mark.parametrize(
+    "filename",
+    [
+        "malware.exe",
+        "archive.zip",
+        "video.mov",
+        "font.woff2",
+        "model.blend",
+    ],
+)
 def test_unsupported_extensions_are_skipped_regardless_of_service(filename, mocker):
     """Truly unsupported files are skipped no matter which ETL service is configured."""
     for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
         mocker.patch("app.config.config.ETL_SERVICE", service)
-        skip, ext = should_skip_by_extension(filename)
+        skip, _ext = should_skip_by_extension(filename)
         assert skip is True
 
 
-@pytest.mark.parametrize("filename", [
-    "report.pdf", "doc.docx", "sheet.xlsx", "slides.pptx",
-    "readme.txt", "data.csv", "photo.png", "notes.md",
-])
+@pytest.mark.parametrize(
+    "filename",
+    [
+        "report.pdf",
+        "doc.docx",
+        "sheet.xlsx",
+        "slides.pptx",
+        "readme.txt",
+        "data.csv",
+        "photo.png",
+        "notes.md",
+    ],
+)
 def test_universal_extensions_are_not_skipped(filename, mocker):
     """Files supported by all parsers (or handled by plaintext/direct_convert) are never skipped."""
     for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
@@ -31,16 +47,19 @@ def test_universal_extensions_are_not_skipped(filename, mocker):
         assert ext is None
 
 
-@pytest.mark.parametrize("filename,service,expected_skip", [
-    ("macro.docm", "DOCLING", True),
-    ("macro.docm", "LLAMACLOUD", False),
-    ("mail.eml", "DOCLING", True),
-    ("mail.eml", "UNSTRUCTURED", False),
-    ("photo.gif", "DOCLING", True),
-    ("photo.gif", "LLAMACLOUD", False),
-    ("photo.heic", "UNSTRUCTURED", False),
-    ("photo.heic", "DOCLING", True),
-])
+@pytest.mark.parametrize(
+    "filename,service,expected_skip",
+    [
+        ("macro.docm", "DOCLING", True),
+        ("macro.docm", "LLAMACLOUD", False),
+        ("mail.eml", "DOCLING", True),
+        ("mail.eml", "UNSTRUCTURED", False),
+        ("photo.gif", "DOCLING", True),
+        ("photo.gif", "LLAMACLOUD", False),
+        ("photo.heic", "UNSTRUCTURED", False),
+        ("photo.heic", "DOCLING", True),
+    ],
+)
 def test_parser_specific_extensions(filename, service, expected_skip, mocker):
     mocker.patch("app.config.config.ETL_SERVICE", service)
     skip, ext = should_skip_by_extension(filename)
diff --git a/surfsense_backend/tests/unit/connectors/test_onedrive_file_types.py b/surfsense_backend/tests/unit/connectors/test_onedrive_file_types.py
index 61212b340..1d9124c47 100644
--- a/surfsense_backend/tests/unit/connectors/test_onedrive_file_types.py
+++ b/surfsense_backend/tests/unit/connectors/test_onedrive_file_types.py
@@ -45,9 +45,16 @@ def test_onenote_is_skipped():
 # ---------------------------------------------------------------------------
 
 
-@pytest.mark.parametrize("filename", [
-    "malware.exe", "archive.zip", "video.mov", "font.woff2", "model.blend",
-])
+@pytest.mark.parametrize(
+    "filename",
+    [
+        "malware.exe",
+        "archive.zip",
+        "video.mov",
+        "font.woff2",
+        "model.blend",
+    ],
+)
 def test_unsupported_extensions_are_skipped(filename, mocker):
     mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
     item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
@@ -56,10 +63,19 @@ def test_unsupported_extensions_are_skipped(filename, mocker):
     assert ext is not None
 
 
-@pytest.mark.parametrize("filename", [
-    "report.pdf", "doc.docx", "sheet.xlsx", "slides.pptx",
-    "readme.txt", "data.csv", "photo.png", "notes.md",
-])
+@pytest.mark.parametrize(
+    "filename",
+    [
+        "report.pdf",
+        "doc.docx",
+        "sheet.xlsx",
+        "slides.pptx",
+        "readme.txt",
+        "data.csv",
+        "photo.png",
+        "notes.md",
+    ],
+)
 def test_universal_files_are_not_skipped(filename, mocker):
     for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
         mocker.patch("app.config.config.ETL_SERVICE", service)
@@ -69,14 +85,17 @@ def test_universal_files_are_not_skipped(filename, mocker):
         assert ext is None
 
 
-@pytest.mark.parametrize("filename,service,expected_skip", [
-    ("macro.docm", "DOCLING", True),
-    ("macro.docm", "LLAMACLOUD", False),
-    ("mail.eml", "DOCLING", True),
-    ("mail.eml", "UNSTRUCTURED", False),
-    ("photo.heic", "UNSTRUCTURED", False),
-    ("photo.heic", "DOCLING", True),
-])
+@pytest.mark.parametrize(
+    "filename,service,expected_skip",
+    [
+        ("macro.docm", "DOCLING", True),
+        ("macro.docm", "LLAMACLOUD", False),
+        ("mail.eml", "DOCLING", True),
+        ("mail.eml", "UNSTRUCTURED", False),
+        ("photo.heic", "UNSTRUCTURED", False),
+        ("photo.heic", "DOCLING", True),
+    ],
+)
 def test_parser_specific_extensions(filename, service, expected_skip, mocker):
     mocker.patch("app.config.config.ETL_SERVICE", service)
     item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
diff --git a/surfsense_backend/tests/unit/etl_pipeline/conftest.py b/surfsense_backend/tests/unit/etl_pipeline/conftest.py
index 6059caa01..082ab9771 100644
--- a/surfsense_backend/tests/unit/etl_pipeline/conftest.py
+++ b/surfsense_backend/tests/unit/etl_pipeline/conftest.py
@@ -24,6 +24,4 @@ def _stub_package(dotted: str, fs_dir: Path) -> None:
 
 _stub_package("app", _BACKEND / "app")
 _stub_package("app.etl_pipeline", _BACKEND / "app" / "etl_pipeline")
-_stub_package(
-    "app.etl_pipeline.parsers", _BACKEND / "app" / "etl_pipeline" / "parsers"
-)
+_stub_package("app.etl_pipeline.parsers", _BACKEND / "app" / "etl_pipeline" / "parsers")
diff --git a/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py b/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py
index e90847e3a..769b1dc53 100644
--- a/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py
+++ b/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py
@@ -144,7 +144,7 @@ async def test_extract_mp3_returns_transcription(tmp_path, mocker):
 
 
 # ---------------------------------------------------------------------------
-# Slice 7 – DOCLING document parsing
+# Slice 7 - DOCLING document parsing
 # ---------------------------------------------------------------------------
 
 
@@ -172,7 +172,7 @@ async def test_extract_pdf_with_docling(tmp_path, mocker):
 
 
 # ---------------------------------------------------------------------------
-# Slice 8 – UNSTRUCTURED document parsing
+# Slice 8 - UNSTRUCTURED document parsing
 # ---------------------------------------------------------------------------
 
 
@@ -208,7 +208,7 @@ async def test_extract_pdf_with_unstructured(tmp_path, mocker):
 
 
 # ---------------------------------------------------------------------------
-# Slice 9 – LLAMACLOUD document parsing
+# Slice 9 - LLAMACLOUD document parsing
 # ---------------------------------------------------------------------------
 
 
@@ -241,9 +241,7 @@ async def test_extract_pdf_with_llamacloud(tmp_path, mocker):
     )
 
     result = await EtlPipelineService().extract(
-        EtlRequest(
-            file_path=str(pdf_file), filename="report.pdf", estimated_pages=5
-        )
+        EtlRequest(file_path=str(pdf_file), filename="report.pdf", estimated_pages=5)
     )
 
     assert result.markdown_content == "# LlamaCloud parsed"
@@ -252,7 +250,7 @@ async def test_extract_pdf_with_llamacloud(tmp_path, mocker):
 
 
 # ---------------------------------------------------------------------------
-# Slice 10 – unknown extension falls through to document ETL
+# Slice 10 - unknown extension falls through to document ETL
 # ---------------------------------------------------------------------------
 
 
@@ -279,18 +277,18 @@ async def test_unknown_extension_uses_document_etl(tmp_path, mocker):
 
 
 # ---------------------------------------------------------------------------
-# Slice 11 – EtlRequest validation
+# Slice 11 - EtlRequest validation
 # ---------------------------------------------------------------------------
 
 
 def test_etl_request_requires_filename():
     """EtlRequest rejects missing filename."""
-    with pytest.raises(Exception):
+    with pytest.raises(ValueError, match="filename must not be empty"):
         EtlRequest(file_path="/tmp/some.txt", filename="")
 
 
 # ---------------------------------------------------------------------------
-# Slice 12 – unknown ETL_SERVICE raises EtlServiceUnavailableError
+# Slice 12 - unknown ETL_SERVICE raises EtlServiceUnavailableError
 # ---------------------------------------------------------------------------
 
 
@@ -310,7 +308,7 @@ async def test_unknown_etl_service_raises(tmp_path, mocker):
 
 
 # ---------------------------------------------------------------------------
-# Slice 13 – unsupported file types are rejected before reaching any parser
+# Slice 13 - unsupported file types are rejected before reaching any parser
 # ---------------------------------------------------------------------------
 
 
@@ -321,10 +319,19 @@ def test_unknown_extension_classified_as_unsupported():
     assert classify_file("random.xyz") == FileCategory.UNSUPPORTED
 
 
-@pytest.mark.parametrize("filename", [
-    "malware.exe", "archive.zip", "video.mov", "font.woff2",
-    "model.blend", "data.parquet", "package.deb", "firmware.bin",
-])
+@pytest.mark.parametrize(
+    "filename",
+    [
+        "malware.exe",
+        "archive.zip",
+        "video.mov",
+        "font.woff2",
+        "model.blend",
+        "data.parquet",
+        "package.deb",
+        "firmware.bin",
+    ],
+)
 def test_unsupported_extensions_classified_correctly(filename):
     """Extensions not in any allowlist are classified as UNSUPPORTED."""
     from app.etl_pipeline.file_classifier import FileCategory, classify_file
@@ -332,18 +339,21 @@ def test_unsupported_extensions_classified_correctly(filename):
     assert classify_file(filename) == FileCategory.UNSUPPORTED
 
 
-@pytest.mark.parametrize("filename,expected", [
-    ("report.pdf", "document"),
-    ("doc.docx", "document"),
-    ("slides.pptx", "document"),
-    ("sheet.xlsx", "document"),
-    ("photo.png", "document"),
-    ("photo.jpg", "document"),
-    ("book.epub", "document"),
-    ("letter.odt", "document"),
-    ("readme.md", "plaintext"),
-    ("data.csv", "direct_convert"),
-])
+@pytest.mark.parametrize(
+    "filename,expected",
+    [
+        ("report.pdf", "document"),
+        ("doc.docx", "document"),
+        ("slides.pptx", "document"),
+        ("sheet.xlsx", "document"),
+        ("photo.png", "document"),
+        ("photo.jpg", "document"),
+        ("book.epub", "document"),
+        ("letter.odt", "document"),
+        ("readme.md", "plaintext"),
+        ("data.csv", "direct_convert"),
+    ],
+)
 def test_parseable_extensions_classified_correctly(filename, expected):
     """Parseable files are classified into their correct category."""
     from app.etl_pipeline.file_classifier import FileCategory, classify_file
@@ -380,31 +390,34 @@ async def test_extract_zip_raises_unsupported_error(tmp_path):
 
 
 # ---------------------------------------------------------------------------
-# Slice 14 – should_skip_for_service (per-parser document filtering)
+# Slice 14 - should_skip_for_service (per-parser document filtering)
 # ---------------------------------------------------------------------------
 
 
-@pytest.mark.parametrize("filename,etl_service,expected_skip", [
-    ("file.eml", "DOCLING", True),
-    ("file.eml", "UNSTRUCTURED", False),
-    ("file.docm", "LLAMACLOUD", False),
-    ("file.docm", "DOCLING", True),
-    ("file.txt", "DOCLING", False),
-    ("file.csv", "LLAMACLOUD", False),
-    ("file.mp3", "UNSTRUCTURED", False),
-    ("file.exe", "LLAMACLOUD", True),
-    ("file.pdf", "DOCLING", False),
-    ("file.webp", "DOCLING", False),
-    ("file.webp", "UNSTRUCTURED", True),
-    ("file.gif", "LLAMACLOUD", False),
-    ("file.gif", "DOCLING", True),
-    ("file.heic", "UNSTRUCTURED", False),
-    ("file.heic", "DOCLING", True),
-    ("file.svg", "LLAMACLOUD", False),
-    ("file.svg", "DOCLING", True),
-    ("file.p7s", "UNSTRUCTURED", False),
-    ("file.p7s", "LLAMACLOUD", True),
-])
+@pytest.mark.parametrize(
+    "filename,etl_service,expected_skip",
+    [
+        ("file.eml", "DOCLING", True),
+        ("file.eml", "UNSTRUCTURED", False),
+        ("file.docm", "LLAMACLOUD", False),
+        ("file.docm", "DOCLING", True),
+        ("file.txt", "DOCLING", False),
+        ("file.csv", "LLAMACLOUD", False),
+        ("file.mp3", "UNSTRUCTURED", False),
+        ("file.exe", "LLAMACLOUD", True),
+        ("file.pdf", "DOCLING", False),
+        ("file.webp", "DOCLING", False),
+        ("file.webp", "UNSTRUCTURED", True),
+        ("file.gif", "LLAMACLOUD", False),
+        ("file.gif", "DOCLING", True),
+        ("file.heic", "UNSTRUCTURED", False),
+        ("file.heic", "DOCLING", True),
+        ("file.svg", "LLAMACLOUD", False),
+        ("file.svg", "DOCLING", True),
+        ("file.p7s", "UNSTRUCTURED", False),
+        ("file.p7s", "LLAMACLOUD", True),
+    ],
+)
 def test_should_skip_for_service(filename, etl_service, expected_skip):
     from app.etl_pipeline.file_classifier import should_skip_for_service
 
@@ -414,7 +427,7 @@ def test_should_skip_for_service(filename, etl_service, expected_skip):
 
 
 # ---------------------------------------------------------------------------
-# Slice 14b – ETL pipeline rejects per-parser incompatible documents
+# Slice 14b - ETL pipeline rejects per-parser incompatible documents
 # ---------------------------------------------------------------------------
 
 
diff --git a/surfsense_backend/tests/unit/services/test_docling_image_support.py b/surfsense_backend/tests/unit/services/test_docling_image_support.py
index 430adbaf2..11ffc0ed1 100644
--- a/surfsense_backend/tests/unit/services/test_docling_image_support.py
+++ b/surfsense_backend/tests/unit/services/test_docling_image_support.py
@@ -30,26 +30,29 @@ def test_docling_service_does_not_restrict_allowed_formats():
 
     fake_pdf_format_option_cls = MagicMock()
 
-    with patch.dict("sys.modules", {
-        "docling": MagicMock(),
-        "docling.backend": MagicMock(),
-        "docling.backend.pypdfium2_backend": MagicMock(
-            PyPdfiumDocumentBackend=mock_backend
-        ),
-        "docling.datamodel": MagicMock(),
-        "docling.datamodel.base_models": MagicMock(
-            InputFormat=_FakeInputFormat
-        ),
-        "docling.datamodel.pipeline_options": MagicMock(
-            PdfPipelineOptions=fake_pipeline_options_cls
-        ),
-        "docling.document_converter": MagicMock(
-            DocumentConverter=mock_converter_cls,
-            PdfFormatOption=fake_pdf_format_option_cls,
-        ),
-    }):
-        import app.services.docling_service as mod
+    with patch.dict(
+        "sys.modules",
+        {
+            "docling": MagicMock(),
+            "docling.backend": MagicMock(),
+            "docling.backend.pypdfium2_backend": MagicMock(
+                PyPdfiumDocumentBackend=mock_backend
+            ),
+            "docling.datamodel": MagicMock(),
+            "docling.datamodel.base_models": MagicMock(InputFormat=_FakeInputFormat),
+            "docling.datamodel.pipeline_options": MagicMock(
+                PdfPipelineOptions=fake_pipeline_options_cls
+            ),
+            "docling.document_converter": MagicMock(
+                DocumentConverter=mock_converter_cls,
+                PdfFormatOption=fake_pdf_format_option_cls,
+            ),
+        },
+    ):
         from importlib import reload
+
+        import app.services.docling_service as mod
+
         reload(mod)
 
         mod.DoclingService()
diff --git a/surfsense_backend/tests/unit/utils/test_file_extensions.py b/surfsense_backend/tests/unit/utils/test_file_extensions.py
index acd8945ce..c33b39f05 100644
--- a/surfsense_backend/tests/unit/utils/test_file_extensions.py
+++ b/surfsense_backend/tests/unit/utils/test_file_extensions.py
@@ -17,36 +17,74 @@ def test_exe_is_not_supported_document():
     assert is_supported_document_extension("malware.exe") is False
 
 
-@pytest.mark.parametrize("filename", [
-    "report.pdf", "doc.docx", "old.doc",
-    "sheet.xlsx", "legacy.xls",
-    "slides.pptx", "deck.ppt",
-    "macro.docm", "macro.xlsm", "macro.pptm",
-    "photo.png", "photo.jpg", "photo.jpeg", "scan.bmp", "scan.tiff", "scan.tif",
-    "photo.webp", "anim.gif", "iphone.heic",
-    "manual.rtf", "book.epub",
-    "letter.odt", "data.ods", "presentation.odp",
-    "inbox.eml", "outlook.msg",
-    "korean.hwpx", "korean.hwp",
-    "template.dot", "template.dotm",
-    "template.pot", "template.potx",
-    "binary.xlsb", "workspace.xlw",
-    "vector.svg", "signature.p7s",
-])
+@pytest.mark.parametrize(
+    "filename",
+    [
+        "report.pdf",
+        "doc.docx",
+        "old.doc",
+        "sheet.xlsx",
+        "legacy.xls",
+        "slides.pptx",
+        "deck.ppt",
+        "macro.docm",
+        "macro.xlsm",
+        "macro.pptm",
+        "photo.png",
+        "photo.jpg",
+        "photo.jpeg",
+        "scan.bmp",
+        "scan.tiff",
+        "scan.tif",
+        "photo.webp",
+        "anim.gif",
+        "iphone.heic",
+        "manual.rtf",
+        "book.epub",
+        "letter.odt",
+        "data.ods",
+        "presentation.odp",
+        "inbox.eml",
+        "outlook.msg",
+        "korean.hwpx",
+        "korean.hwp",
+        "template.dot",
+        "template.dotm",
+        "template.pot",
+        "template.potx",
+        "binary.xlsb",
+        "workspace.xlw",
+        "vector.svg",
+        "signature.p7s",
+    ],
+)
 def test_document_extensions_are_supported(filename):
     from app.utils.file_extensions import is_supported_document_extension
 
-    assert is_supported_document_extension(filename) is True, f"{filename} should be supported"
+    assert is_supported_document_extension(filename) is True, (
+        f"{filename} should be supported"
+    )
 
 
-@pytest.mark.parametrize("filename", [
-    "malware.exe", "archive.zip", "video.mov", "font.woff2",
-    "model.blend", "random.xyz", "data.parquet", "package.deb",
-])
+@pytest.mark.parametrize(
+    "filename",
+    [
+        "malware.exe",
+        "archive.zip",
+        "video.mov",
+        "font.woff2",
+        "model.blend",
+        "random.xyz",
+        "data.parquet",
+        "package.deb",
+    ],
+)
 def test_non_document_extensions_are_not_supported(filename):
     from app.utils.file_extensions import is_supported_document_extension
 
-    assert is_supported_document_extension(filename) is False, f"{filename} should NOT be supported"
+    assert is_supported_document_extension(filename) is False, (
+        f"{filename} should NOT be supported"
+    )
 
 
 # ---------------------------------------------------------------------------
@@ -67,7 +105,7 @@ def test_union_equals_all_three_sets():
         | LLAMAPARSE_DOCUMENT_EXTENSIONS
         | UNSTRUCTURED_DOCUMENT_EXTENSIONS
     )
-    assert DOCUMENT_EXTENSIONS == expected
+    assert expected == DOCUMENT_EXTENSIONS
 
 
 def test_get_extensions_for_docling():
diff --git a/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/DesktopContent.tsx b/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/DesktopContent.tsx
index 1522e153f..957ae9dae 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/DesktopContent.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/DesktopContent.tsx
@@ -3,8 +3,8 @@
 import { useEffect, useState } from "react";
 import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card";
 import { Label } from "@/components/ui/label";
-import { Switch } from "@/components/ui/switch";
 import { Spinner } from "@/components/ui/spinner";
+import { Switch } from "@/components/ui/switch";
 
 export function DesktopContent() {
 	const [isElectron, setIsElectron] = useState(false);
@@ -66,11 +66,7 @@ export function DesktopContent() {
 								Show suggestions while typing in other applications.
 							</p>
 						</div>
-						<Switch
-							id="autocomplete-toggle"
-							checked={enabled}
-							onCheckedChange={handleToggle}
-						/>
+						<Switch id="autocomplete-toggle" checked={enabled} onCheckedChange={handleToggle} />
 					</div>
 				</CardContent>
 			</Card>
diff --git a/surfsense_web/app/desktop/permissions/page.tsx b/surfsense_web/app/desktop/permissions/page.tsx
index 6c08e35b5..37cfe826f 100644
--- a/surfsense_web/app/desktop/permissions/page.tsx
+++ b/surfsense_web/app/desktop/permissions/page.tsx
@@ -1,7 +1,7 @@
 "use client";
 
-import { useEffect, useState } from "react";
 import { useRouter } from "next/navigation";
+import { useEffect, useState } from "react";
 import { Logo } from "@/components/Logo";
 import { Button } from "@/components/ui/button";
 import { Spinner } from "@/components/ui/spinner";
@@ -17,7 +17,8 @@ const STEPS = [
 	{
 		id: "screen-recording",
 		title: "Screen Recording",
-		description: "Lets SurfSense capture your screen to understand context and provide smart writing suggestions.",
+		description:
+			"Lets SurfSense capture your screen to understand context and provide smart writing suggestions.",
 		action: "requestScreenRecording",
 		field: "screenRecording" as const,
 	},
@@ -79,7 +80,9 @@ export default function DesktopPermissionsPage() {
 
 		poll();
 		interval = setInterval(poll, 2000);
-		return () => { if (interval) clearInterval(interval); };
+		return () => {
+			if (interval) clearInterval(interval);
+		};
 	}, []);
 
 	if (!isElectron) {
@@ -98,7 +101,8 @@ export default function DesktopPermissionsPage() {
 		);
 	}
 
-	const allGranted = permissions.accessibility === "authorized" && permissions.screenRecording === "authorized";
+	const allGranted =
+		permissions.accessibility === "authorized" && permissions.screenRecording === "authorized";
 
 	const handleRequest = async (action: string) => {
 		if (action === "requestScreenRecording") {
@@ -175,7 +179,8 @@ export default function DesktopPermissionsPage() {
 											</p>
 										)}
 										<p className="text-xs text-muted-foreground">
-											If SurfSense doesn&apos;t appear in the list, click <strong>+</strong> and select it from Applications.
+											If SurfSense doesn&apos;t appear in the list, click <strong>+</strong> and
+											select it from Applications.
 										</p>
 									</div>
 								)}
diff --git a/surfsense_web/app/desktop/suggestion/layout.tsx b/surfsense_web/app/desktop/suggestion/layout.tsx
index 36b7e037b..fd8faf099 100644
--- a/surfsense_web/app/desktop/suggestion/layout.tsx
+++ b/surfsense_web/app/desktop/suggestion/layout.tsx
@@ -4,10 +4,6 @@ export const metadata = {
 	title: "SurfSense Suggestion",
 };
 
-export default function SuggestionLayout({
-	children,
-}: {
-	children: React.ReactNode;
-}) {
+export default function SuggestionLayout({ children }: { children: React.ReactNode }) {
 	return <div className="suggestion-body">{children}</div>;
 }
diff --git a/surfsense_web/app/desktop/suggestion/page.tsx b/surfsense_web/app/desktop/suggestion/page.tsx
index 03944867f..6ade64883 100644
--- a/surfsense_web/app/desktop/suggestion/page.tsx
+++ b/surfsense_web/app/desktop/suggestion/page.tsx
@@ -72,27 +72,23 @@ export default function SuggestionPage() {
 				return;
 			}
 
-			const backendUrl =
-				process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL || "http://localhost:8000";
+			const backendUrl = process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL || "http://localhost:8000";
 
 			try {
-				const response = await fetch(
-					`${backendUrl}/api/v1/autocomplete/vision/stream`,
-					{
-						method: "POST",
-						headers: {
-							Authorization: `Bearer ${token}`,
-							"Content-Type": "application/json",
-						},
-						body: JSON.stringify({
-							screenshot,
-							search_space_id: parseInt(searchSpaceId, 10),
-							app_name: appName || "",
-							window_title: windowTitle || "",
-						}),
-						signal: controller.signal,
+				const response = await fetch(`${backendUrl}/api/v1/autocomplete/vision/stream`, {
+					method: "POST",
+					headers: {
+						Authorization: `Bearer ${token}`,
+						"Content-Type": "application/json",
 					},
-				);
+					body: JSON.stringify({
+						screenshot,
+						search_space_id: parseInt(searchSpaceId, 10),
+						app_name: appName || "",
+						window_title: windowTitle || "",
+					}),
+					signal: controller.signal,
+				});
 
 				if (!response.ok) {
 					setError(friendlyError(response.status));
@@ -132,9 +128,7 @@ export default function SuggestionPage() {
 								} else if (parsed.type === "error") {
 									setError(friendlyError(parsed.errorText));
 								}
-							} catch {
-								continue;
-							}
+							} catch {}
 						}
 					}
 				}
@@ -145,7 +139,7 @@ export default function SuggestionPage() {
 				setIsLoading(false);
 			}
 		},
-		[],
+		[]
 	);
 
 	useEffect(() => {
@@ -207,10 +201,18 @@ export default function SuggestionPage() {
 		<div className="suggestion-tooltip">
 			<p className="suggestion-text">{suggestion}</p>
 			<div className="suggestion-actions">
-				<button className="suggestion-btn suggestion-btn-accept" onClick={handleAccept}>
+				<button
+					type="button"
+					className="suggestion-btn suggestion-btn-accept"
+					onClick={handleAccept}
+				>
 					Accept
 				</button>
-				<button className="suggestion-btn suggestion-btn-dismiss" onClick={handleDismiss}>
+				<button
+					type="button"
+					className="suggestion-btn suggestion-btn-dismiss"
+					onClick={handleDismiss}
+				>
 					Dismiss
 				</button>
 			</div>
diff --git a/surfsense_web/app/desktop/suggestion/suggestion.css b/surfsense_web/app/desktop/suggestion/suggestion.css
index 62f4d2ea7..ef578059c 100644
--- a/surfsense_web/app/desktop/suggestion/suggestion.css
+++ b/surfsense_web/app/desktop/suggestion/suggestion.css
@@ -1,121 +1,125 @@
 html:has(.suggestion-body),
 body:has(.suggestion-body) {
-  margin: 0 !important;
-  padding: 0 !important;
-  background: transparent !important;
-  overflow: hidden !important;
-  height: auto !important;
-  width: 100% !important;
+	margin: 0 !important;
+	padding: 0 !important;
+	background: transparent !important;
+	overflow: hidden !important;
+	height: auto !important;
+	width: 100% !important;
 }
 
 .suggestion-body {
-  margin: 0;
-  padding: 0;
-  background: transparent;
-  font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
-  -webkit-font-smoothing: antialiased;
-  user-select: none;
-  -webkit-app-region: no-drag;
+	margin: 0;
+	padding: 0;
+	background: transparent;
+	font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
+	-webkit-font-smoothing: antialiased;
+	user-select: none;
+	-webkit-app-region: no-drag;
 }
 
 .suggestion-tooltip {
-  background: #1e1e1e;
-  border: 1px solid #3c3c3c;
-  border-radius: 8px;
-  padding: 8px 12px;
-  margin: 4px;
-  max-width: 400px;
-  box-shadow: 0 4px 16px rgba(0, 0, 0, 0.5);
+	background: #1e1e1e;
+	border: 1px solid #3c3c3c;
+	border-radius: 8px;
+	padding: 8px 12px;
+	margin: 4px;
+	max-width: 400px;
+	box-shadow: 0 4px 16px rgba(0, 0, 0, 0.5);
 }
 
 .suggestion-text {
-  color: #d4d4d4;
-  font-size: 13px;
-  line-height: 1.45;
-  margin: 0 0 6px 0;
-  word-wrap: break-word;
-  white-space: pre-wrap;
+	color: #d4d4d4;
+	font-size: 13px;
+	line-height: 1.45;
+	margin: 0 0 6px 0;
+	word-wrap: break-word;
+	white-space: pre-wrap;
 }
 
 .suggestion-actions {
-  display: flex;
-  justify-content: flex-end;
-  gap: 4px;
-  border-top: 1px solid #2a2a2a;
-  padding-top: 6px;
+	display: flex;
+	justify-content: flex-end;
+	gap: 4px;
+	border-top: 1px solid #2a2a2a;
+	padding-top: 6px;
 }
 
 .suggestion-btn {
-  padding: 2px 8px;
-  border-radius: 3px;
-  border: 1px solid #3c3c3c;
-  font-family: inherit;
-  font-size: 10px;
-  font-weight: 500;
-  cursor: pointer;
-  line-height: 16px;
-  transition: background 0.15s, border-color 0.15s;
+	padding: 2px 8px;
+	border-radius: 3px;
+	border: 1px solid #3c3c3c;
+	font-family: inherit;
+	font-size: 10px;
+	font-weight: 500;
+	cursor: pointer;
+	line-height: 16px;
+	transition:
+		background 0.15s,
+		border-color 0.15s;
 }
 
 .suggestion-btn-accept {
-  background: #2563eb;
-  border-color: #3b82f6;
-  color: #fff;
+	background: #2563eb;
+	border-color: #3b82f6;
+	color: #fff;
 }
 
 .suggestion-btn-accept:hover {
-  background: #1d4ed8;
+	background: #1d4ed8;
 }
 
 .suggestion-btn-dismiss {
-  background: #2a2a2a;
-  color: #999;
+	background: #2a2a2a;
+	color: #999;
 }
 
 .suggestion-btn-dismiss:hover {
-  background: #333;
-  color: #ccc;
+	background: #333;
+	color: #ccc;
 }
 
 .suggestion-error {
-  border-color: #5c2626;
+	border-color: #5c2626;
 }
 
 .suggestion-error-text {
-  color: #f48771;
-  font-size: 12px;
+	color: #f48771;
+	font-size: 12px;
 }
 
 .suggestion-loading {
-  display: flex;
-  gap: 5px;
-  padding: 2px 0;
-  justify-content: center;
+	display: flex;
+	gap: 5px;
+	padding: 2px 0;
+	justify-content: center;
 }
 
 .suggestion-dot {
-  width: 4px;
-  height: 4px;
-  border-radius: 50%;
-  background: #666;
-  animation: suggestion-pulse 1.2s infinite ease-in-out;
+	width: 4px;
+	height: 4px;
+	border-radius: 50%;
+	background: #666;
+	animation: suggestion-pulse 1.2s infinite ease-in-out;
 }
 
 .suggestion-dot:nth-child(2) {
-  animation-delay: 0.15s;
+	animation-delay: 0.15s;
 }
 
 .suggestion-dot:nth-child(3) {
-  animation-delay: 0.3s;
+	animation-delay: 0.3s;
 }
 
 @keyframes suggestion-pulse {
-  0%, 80%, 100% {
-    opacity: 0.3;
-    transform: scale(0.8);
-  }
-  40% {
-    opacity: 1;
-    transform: scale(1.1);
-  }
+	0%,
+	80%,
+	100% {
+		opacity: 0.3;
+		transform: scale(0.8);
+	}
+	40% {
+		opacity: 1;
+		transform: scale(1.1);
+	}
 }
diff --git a/surfsense_web/components/assistant-ui/connector-popup/views/connector-accounts-list-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/views/connector-accounts-list-view.tsx
index 5e2b8452b..b4c049c5c 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/views/connector-accounts-list-view.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/views/connector-accounts-list-view.tsx
@@ -173,9 +173,7 @@ export const ConnectorAccountsListView: FC<ConnectorAccountsListViewProps> = ({
 								<Plus className="size-3 text-primary" />
 							)}
 						</div>
-						<span className="text-xs sm:text-sm font-medium">
-							{buttonText}
-						</span>
+						<span className="text-xs sm:text-sm font-medium">{buttonText}</span>
 					</button>
 				</div>
 			</div>
diff --git a/surfsense_web/components/assistant-ui/connector-popup/views/youtube-crawler-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/views/youtube-crawler-view.tsx
index c16072bca..8982b16a8 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/views/youtube-crawler-view.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/views/youtube-crawler-view.tsx
@@ -337,9 +337,7 @@ export const YouTubeCrawlerView: FC<YouTubeCrawlerViewProps> = ({ searchSpaceId,
 					disabled={isSubmitting || isFetchingPlaylist || videoTags.length === 0}
 					className="relative text-xs sm:text-sm min-w-[140px] disabled:opacity-50 disabled:cursor-not-allowed disabled:pointer-events-none"
 				>
-					<span className={isSubmitting ? "opacity-0" : ""}>
-						{t("submit")}
-					</span>
+					<span className={isSubmitting ? "opacity-0" : ""}>{t("submit")}</span>
 					{isSubmitting && <Spinner size="sm" className="absolute" />}
 				</Button>
 			</div>
diff --git a/surfsense_web/components/assistant-ui/document-upload-popup.tsx b/surfsense_web/components/assistant-ui/document-upload-popup.tsx
index 7b0409345..0b38979a5 100644
--- a/surfsense_web/components/assistant-ui/document-upload-popup.tsx
+++ b/surfsense_web/components/assistant-ui/document-upload-popup.tsx
@@ -132,9 +132,7 @@ const DocumentUploadPopupContent: FC<{
 				<div className="flex-1 min-h-0 overflow-y-auto overscroll-contain">
 					<div className="sticky top-0 z-20 bg-muted px-4 sm:px-6 pt-6 sm:pt-8 pb-10">
 						<div className="flex items-center gap-2 mb-1 pr-8 sm:pr-0">
-							<h2 className="text-xl sm:text-3xl font-semibold tracking-tight">
-								Upload Documents
-							</h2>
+							<h2 className="text-xl sm:text-3xl font-semibold tracking-tight">Upload Documents</h2>
 						</div>
 						<p className="text-xs sm:text-base text-muted-foreground/80 line-clamp-1">
 							Upload and sync your documents to your search space
diff --git a/surfsense_web/components/assistant-ui/image.tsx b/surfsense_web/components/assistant-ui/image.tsx
index c147eede4..59781abcf 100644
--- a/surfsense_web/components/assistant-ui/image.tsx
+++ b/surfsense_web/components/assistant-ui/image.tsx
@@ -3,10 +3,10 @@
 import type { ImageMessagePartComponent } from "@assistant-ui/react";
 import { cva, type VariantProps } from "class-variance-authority";
 import { ImageIcon, ImageOffIcon } from "lucide-react";
+import NextImage from "next/image";
 import { memo, type PropsWithChildren, useEffect, useRef, useState } from "react";
 import { createPortal } from "react-dom";
 import { cn } from "@/lib/utils";
-import NextImage from 'next/image';
 
 const imageVariants = cva("aui-image-root relative overflow-hidden rounded-lg", {
 	variants: {
@@ -88,23 +88,23 @@ function ImagePreview({
 					<ImageOffIcon className="size-8 text-muted-foreground" />
 				</div>
 			) : isDataOrBlobUrl(src) ? (
-                // biome-ignore lint/performance/noImgElement: data/blob URLs need plain img
-                <img
-                    ref={imgRef}
-                    src={src}
-                    alt={alt}
-                    className={cn("block h-auto w-full object-contain", !loaded && "invisible", className)}
-                    onLoad={(e) => {
-                        if (typeof src === "string") setLoadedSrc(src);
-                        onLoad?.(e);
-                    }}
-                    onError={(e) => {
-                        if (typeof src === "string") setErrorSrc(src);
-                        onError?.(e);
-                    }}
-                    {...props}
-                />
-            ) : (
+				// biome-ignore lint/performance/noImgElement: data/blob URLs need plain img
+				<img
+					ref={imgRef}
+					src={src}
+					alt={alt}
+					className={cn("block h-auto w-full object-contain", !loaded && "invisible", className)}
+					onLoad={(e) => {
+						if (typeof src === "string") setLoadedSrc(src);
+						onLoad?.(e);
+					}}
+					onError={(e) => {
+						if (typeof src === "string") setErrorSrc(src);
+						onError?.(e);
+					}}
+					{...props}
+				/>
+			) : (
 				// biome-ignore lint/performance/noImgElement: intentional for dynamic external URLs
 				// <img
 				// 	ref={imgRef}
@@ -122,22 +122,22 @@ function ImagePreview({
 				// 	{...props}
 				// />
 				<NextImage
-				fill
-				src={src || ""}
-				alt={alt}
-				sizes="(max-width: 768px) 100vw, (max-width: 1200px) 80vw, 60vw"
-				className={cn("block object-contain", !loaded && "invisible", className)}
-				onLoad={() => {
-					if (typeof src === "string") setLoadedSrc(src);
-					onLoad?.();
-				}}
-				onError={() => {
-					if (typeof src === "string") setErrorSrc(src);
-					onError?.();
-				}}
-				unoptimized={false}
-				{...props}
-			/>
+					fill
+					src={src || ""}
+					alt={alt}
+					sizes="(max-width: 768px) 100vw, (max-width: 1200px) 80vw, 60vw"
+					className={cn("block object-contain", !loaded && "invisible", className)}
+					onLoad={() => {
+						if (typeof src === "string") setLoadedSrc(src);
+						onLoad?.();
+					}}
+					onError={() => {
+						if (typeof src === "string") setErrorSrc(src);
+						onError?.();
+					}}
+					unoptimized={false}
+					{...props}
+				/>
 			)}
 		</div>
 	);
@@ -162,8 +162,8 @@ type ImageZoomProps = PropsWithChildren<{
 	alt?: string;
 }>;
 function isDataOrBlobUrl(src: string | undefined): boolean {
-    if (!src || typeof src !== "string") return false;
-    return src.startsWith("data:") || src.startsWith("blob:");
+	if (!src || typeof src !== "string") return false;
+	return src.startsWith("data:") || src.startsWith("blob:");
 }
 function ImageZoom({ src, alt = "Image preview", children }: ImageZoomProps) {
 	const [isMounted, setIsMounted] = useState(false);
@@ -216,38 +216,38 @@ function ImageZoom({ src, alt = "Image preview", children }: ImageZoomProps) {
 					>
 						{/** biome-ignore lint/performance/noImgElement: <explanation> */}
 						{isDataOrBlobUrl(src) ? (
-                            // biome-ignore lint/performance/noImgElement: data/blob URLs need plain img
-                            <img
-                                data-slot="image-zoom-content"
-                                src={src}
-                                alt={alt}
-                                className="aui-image-zoom-content fade-in zoom-in-95 max-h-[90vh] max-w-[90vw] animate-in object-contain duration-200"
-                                onClick={(e) => {
-                                    e.stopPropagation();
-                                    handleClose();
-                                }}
-                                onKeyDown={(e) => {
-                                    if (e.key === "Enter") {
-                                        e.stopPropagation();
-                                        handleClose();
-                                    }
-                                }}
-                            />
-                        ) : (
+							// biome-ignore lint/performance/noImgElement: data/blob URLs need plain img
+							<img
+								data-slot="image-zoom-content"
+								src={src}
+								alt={alt}
+								className="aui-image-zoom-content fade-in zoom-in-95 max-h-[90vh] max-w-[90vw] animate-in object-contain duration-200"
+								onClick={(e) => {
+									e.stopPropagation();
+									handleClose();
+								}}
+								onKeyDown={(e) => {
+									if (e.key === "Enter") {
+										e.stopPropagation();
+										handleClose();
+									}
+								}}
+							/>
+						) : (
 							<NextImage
-                                data-slot="image-zoom-content"
-                                fill
-                                src={src}
-                                alt={alt}
-                                sizes="90vw"
-                                className="aui-image-zoom-content fade-in zoom-in-95 object-contain duration-200"
-                                onClick={(e) => {
-                                    e.stopPropagation();
-                                    handleClose();
-                                }}
-                                unoptimized={false}
-                            />
-                        )}
+								data-slot="image-zoom-content"
+								fill
+								src={src}
+								alt={alt}
+								sizes="90vw"
+								className="aui-image-zoom-content fade-in zoom-in-95 object-contain duration-200"
+								onClick={(e) => {
+									e.stopPropagation();
+									handleClose();
+								}}
+								unoptimized={false}
+							/>
+						)}
 					</button>,
 					document.body
 				)}
diff --git a/surfsense_web/components/assistant-ui/thread-list.tsx b/surfsense_web/components/assistant-ui/thread-list.tsx
index e8b8db6fe..bca36c037 100644
--- a/surfsense_web/components/assistant-ui/thread-list.tsx
+++ b/surfsense_web/components/assistant-ui/thread-list.tsx
@@ -241,9 +241,7 @@ const ThreadListItemComponent = memo(function ThreadListItemComponent({
 			<MessageSquareIcon className="size-4 shrink-0 text-muted-foreground" />
 			<div className="flex-1 min-w-0">
 				<p className="truncate text-sm font-medium">{thread.title || "New Chat"}</p>
-				<p className="truncate text-xs text-muted-foreground">
-					{relativeTime}
-				</p>
+				<p className="truncate text-xs text-muted-foreground">{relativeTime}</p>
 			</div>
 			<DropdownMenu>
 				<DropdownMenuTrigger asChild>
diff --git a/surfsense_web/components/assistant-ui/tool-fallback.tsx b/surfsense_web/components/assistant-ui/tool-fallback.tsx
index 40118d2e4..b658dba6d 100644
--- a/surfsense_web/components/assistant-ui/tool-fallback.tsx
+++ b/surfsense_web/components/assistant-ui/tool-fallback.tsx
@@ -26,7 +26,8 @@ export const ToolFallback: ToolCallMessagePartComponent = ({
 	);
 
 	const serializedResult = useMemo(
-		() => (result !== undefined && typeof result !== "string" ? JSON.stringify(result, null, 2) : null),
+		() =>
+			result !== undefined && typeof result !== "string" ? JSON.stringify(result, null, 2) : null,
 		[result]
 	);
 
diff --git a/surfsense_web/components/chat-comments/comment-composer/comment-composer.tsx b/surfsense_web/components/chat-comments/comment-composer/comment-composer.tsx
index e14022f5c..1c4383388 100644
--- a/surfsense_web/components/chat-comments/comment-composer/comment-composer.tsx
+++ b/surfsense_web/components/chat-comments/comment-composer/comment-composer.tsx
@@ -300,15 +300,15 @@ export function CommentComposer({
 
 			<div className={cn("flex items-center gap-2", !compact && "justify-end")}>
 				{onCancel && (
-				<Button
-					type="button"
-					variant="ghost"
-					size="sm"
-					onClick={onCancel}
-					disabled={isSubmitting}
-				>
-					Cancel
-				</Button>
+					<Button
+						type="button"
+						variant="ghost"
+						size="sm"
+						onClick={onCancel}
+						disabled={isSubmitting}
+					>
+						Cancel
+					</Button>
 				)}
 				<Button
 					type="button"
@@ -317,11 +317,7 @@ export function CommentComposer({
 					disabled={!canSubmit}
 					className={cn(!canSubmit && "opacity-50", compact && "size-8 shrink-0 rounded-full")}
 				>
-					{compact ? (
-						<ArrowUp className="size-4" />
-					) : (
-						submitLabel
-					)}
+					{compact ? <ArrowUp className="size-4" /> : submitLabel}
 				</Button>
 			</div>
 		</div>
diff --git a/surfsense_web/components/documents/DocumentNode.tsx b/surfsense_web/components/documents/DocumentNode.tsx
index fe796b5be..d8e37df1c 100644
--- a/surfsense_web/components/documents/DocumentNode.tsx
+++ b/surfsense_web/components/documents/DocumentNode.tsx
@@ -207,9 +207,15 @@ export const DocumentNode = React.memo(function DocumentNode({
 						);
 					})()}
 
-					<Tooltip delayDuration={600} open={titleTooltipOpen} onOpenChange={handleTitleTooltipOpenChange}>
+					<Tooltip
+						delayDuration={600}
+						open={titleTooltipOpen}
+						onOpenChange={handleTitleTooltipOpenChange}
+					>
 						<TooltipTrigger asChild>
-							<span ref={titleRef} className="flex-1 min-w-0 truncate">{doc.title}</span>
+							<span ref={titleRef} className="flex-1 min-w-0 truncate">
+								{doc.title}
+							</span>
 						</TooltipTrigger>
 						<TooltipContent side="bottom" className="max-w-xs break-words">
 							{doc.title}
@@ -276,10 +282,7 @@ export const DocumentNode = React.memo(function DocumentNode({
 									Versions
 								</DropdownMenuItem>
 							)}
-							<DropdownMenuItem
-								disabled={isProcessing}
-								onClick={() => onDelete(doc)}
-							>
+							<DropdownMenuItem disabled={isProcessing} onClick={() => onDelete(doc)}>
 								<Trash2 className="mr-2 h-4 w-4" />
 								Delete
 							</DropdownMenuItem>
@@ -321,10 +324,7 @@ export const DocumentNode = React.memo(function DocumentNode({
 							Versions
 						</ContextMenuItem>
 					)}
-					<ContextMenuItem
-						disabled={isProcessing}
-						onClick={() => onDelete(doc)}
-					>
+					<ContextMenuItem disabled={isProcessing} onClick={() => onDelete(doc)}>
 						<Trash2 className="mr-2 h-4 w-4" />
 						Delete
 					</ContextMenuItem>
diff --git a/surfsense_web/components/documents/FolderTreeView.tsx b/surfsense_web/components/documents/FolderTreeView.tsx
index 371d00f42..47cd17596 100644
--- a/surfsense_web/components/documents/FolderTreeView.tsx
+++ b/surfsense_web/components/documents/FolderTreeView.tsx
@@ -97,7 +97,10 @@ export function FolderTreeView({
 	const handleCancelRename = useCallback(() => setRenamingFolderId(null), [setRenamingFolderId]);
 
 	const effectiveActiveTypes = useMemo(() => {
-		if (activeTypes.includes("FILE" as DocumentTypeEnum) && !activeTypes.includes("LOCAL_FOLDER_FILE" as DocumentTypeEnum)) {
+		if (
+			activeTypes.includes("FILE" as DocumentTypeEnum) &&
+			!activeTypes.includes("LOCAL_FOLDER_FILE" as DocumentTypeEnum)
+		) {
 			return [...activeTypes, "LOCAL_FOLDER_FILE" as DocumentTypeEnum];
 		}
 		return activeTypes;
@@ -110,7 +113,9 @@ export function FolderTreeView({
 		function check(folderId: number): boolean {
 			if (match[folderId] !== undefined) return match[folderId];
 			const childDocs = (docsByFolder[folderId] ?? []).some(
-				(d) => effectiveActiveTypes.length === 0 || effectiveActiveTypes.includes(d.document_type as DocumentTypeEnum)
+				(d) =>
+					effectiveActiveTypes.length === 0 ||
+					effectiveActiveTypes.includes(d.document_type as DocumentTypeEnum)
 			);
 			if (childDocs) {
 				match[folderId] = true;
@@ -201,7 +206,9 @@ export function FolderTreeView({
 			? childFolders.filter((f) => hasDescendantMatch[f.id])
 			: childFolders;
 		const childDocs = (docsByFolder[key] ?? []).filter(
-			(d) => effectiveActiveTypes.length === 0 || effectiveActiveTypes.includes(d.document_type as DocumentTypeEnum)
+			(d) =>
+				effectiveActiveTypes.length === 0 ||
+				effectiveActiveTypes.includes(d.document_type as DocumentTypeEnum)
 		);
 
 		const nodes: React.ReactNode[] = [];
@@ -223,7 +230,7 @@ export function FolderTreeView({
 					depth={depth}
 					isExpanded={isExpanded}
 					isRenaming={renamingFolderId === f.id}
-				selectionState={folderSelectionStates[f.id] ?? "none"}
+					selectionState={folderSelectionStates[f.id] ?? "none"}
 					processingState={folderProcessingStates[f.id] ?? "idle"}
 					onToggleSelect={onToggleFolderSelect}
 					onToggleExpand={onToggleExpand}
diff --git a/surfsense_web/components/editor/plate-editor.tsx b/surfsense_web/components/editor/plate-editor.tsx
index ed239ffa3..66e9a0e5e 100644
--- a/surfsense_web/components/editor/plate-editor.tsx
+++ b/surfsense_web/components/editor/plate-editor.tsx
@@ -158,17 +158,18 @@ export function PlateEditor({
 	// When not forced read-only, the user can toggle between editing/viewing.
 	const canToggleMode = !readOnly;
 
-	const contextProviderValue = useMemo(()=> ({
-		onSave,
-		hasUnsavedChanges,
-		isSaving,
-		canToggleMode,
-	}), [onSave, hasUnsavedChanges, isSaving, canToggleMode]);
+	const contextProviderValue = useMemo(
+		() => ({
+			onSave,
+			hasUnsavedChanges,
+			isSaving,
+			canToggleMode,
+		}),
+		[onSave, hasUnsavedChanges, isSaving, canToggleMode]
+	);
 
 	return (
-		<EditorSaveContext.Provider
-			value={contextProviderValue}
-		>
+		<EditorSaveContext.Provider value={contextProviderValue}>
 			<Plate
 				editor={editor}
 				// Only pass readOnly as a controlled prop when forced (permanently read-only).
diff --git a/surfsense_web/components/homepage/use-cases-grid.tsx b/surfsense_web/components/homepage/use-cases-grid.tsx
index f9d315b49..7aa272a5c 100644
--- a/surfsense_web/components/homepage/use-cases-grid.tsx
+++ b/surfsense_web/components/homepage/use-cases-grid.tsx
@@ -1,7 +1,7 @@
 "use client";
-import Image from 'next/image';
 
 import { AnimatePresence, motion } from "motion/react";
+import Image from "next/image";
 import { ExpandedGifOverlay, useExpandedGif } from "@/components/ui/expanded-gif-overlay";
 
 const useCases = [
@@ -83,13 +83,13 @@ function UseCaseCard({
 						className="w-full rounded-xl object-cover transition-transform duration-500 group-hover:scale-[1.02]"
 					/>
 					<div className="relative w-full h-48">
-					<Image
-						src={src}
-						alt={title}
-						fill
-						className="rounded-xl object-cover transition-transform duration-500 group-hover:scale-[1.02]"
-						unoptimized={src.endsWith('.gif')}
-					/>
+						<Image
+							src={src}
+							alt={title}
+							fill
+							className="rounded-xl object-cover transition-transform duration-500 group-hover:scale-[1.02]"
+							unoptimized={src.endsWith(".gif")}
+						/>
 					</div>
 				</div>
 				<div className="px-5 py-4">
diff --git a/surfsense_web/components/layout/providers/LayoutDataProvider.tsx b/surfsense_web/components/layout/providers/LayoutDataProvider.tsx
index 74c3c64de..7d4cd9901 100644
--- a/surfsense_web/components/layout/providers/LayoutDataProvider.tsx
+++ b/surfsense_web/components/layout/providers/LayoutDataProvider.tsx
@@ -370,7 +370,8 @@ export function LayoutDataProvider({ searchSpaceId, children }: LayoutDataProvid
 						url: "#announcements",
 						icon: Megaphone,
 						isActive: isAnnouncementsSidebarOpen,
-						badge: announcementUnreadCount > 0 ? formatInboxCount(announcementUnreadCount) : undefined,
+						badge:
+							announcementUnreadCount > 0 ? formatInboxCount(announcementUnreadCount) : undefined,
 					},
 				] as (NavItem | null)[]
 			).filter((item): item is NavItem => item !== null),
diff --git a/surfsense_web/components/layout/ui/sidebar/AllPrivateChatsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/AllPrivateChatsSidebar.tsx
index 0e2163dd4..3459fccf6 100644
--- a/surfsense_web/components/layout/ui/sidebar/AllPrivateChatsSidebar.tsx
+++ b/surfsense_web/components/layout/ui/sidebar/AllPrivateChatsSidebar.tsx
@@ -376,24 +376,24 @@ export function AllPrivateChatsSidebarContent({
 											<span className="truncate">{thread.title || "New Chat"}</span>
 										</button>
 									) : (
-									<Tooltip delayDuration={600}>
-										<TooltipTrigger asChild>
-											<button
-												type="button"
-												onClick={() => handleThreadClick(thread.id)}
-												disabled={isBusy}
-												className="flex items-center gap-2 flex-1 min-w-0 text-left overflow-hidden"
-											>
-												<span className="truncate">{thread.title || "New Chat"}</span>
-											</button>
-										</TooltipTrigger>
-										<TooltipContent side="bottom" align="start">
-											<p>
-												{t("updated") || "Updated"}:{" "}
-												{format(new Date(thread.updatedAt), "MMM d, yyyy 'at' h:mm a")}
-											</p>
-										</TooltipContent>
-									</Tooltip>
+										<Tooltip delayDuration={600}>
+											<TooltipTrigger asChild>
+												<button
+													type="button"
+													onClick={() => handleThreadClick(thread.id)}
+													disabled={isBusy}
+													className="flex items-center gap-2 flex-1 min-w-0 text-left overflow-hidden"
+												>
+													<span className="truncate">{thread.title || "New Chat"}</span>
+												</button>
+											</TooltipTrigger>
+											<TooltipContent side="bottom" align="start">
+												<p>
+													{t("updated") || "Updated"}:{" "}
+													{format(new Date(thread.updatedAt), "MMM d, yyyy 'at' h:mm a")}
+												</p>
+											</TooltipContent>
+										</Tooltip>
 									)}
 
 									<DropdownMenu
diff --git a/surfsense_web/components/layout/ui/sidebar/AllSharedChatsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/AllSharedChatsSidebar.tsx
index 9cc1da1e4..097d10121 100644
--- a/surfsense_web/components/layout/ui/sidebar/AllSharedChatsSidebar.tsx
+++ b/surfsense_web/components/layout/ui/sidebar/AllSharedChatsSidebar.tsx
@@ -375,24 +375,24 @@ export function AllSharedChatsSidebarContent({
 											<span className="truncate">{thread.title || "New Chat"}</span>
 										</button>
 									) : (
-									<Tooltip delayDuration={600}>
-										<TooltipTrigger asChild>
-											<button
-												type="button"
-												onClick={() => handleThreadClick(thread.id)}
-												disabled={isBusy}
-												className="flex items-center gap-2 flex-1 min-w-0 text-left overflow-hidden"
-											>
-												<span className="truncate">{thread.title || "New Chat"}</span>
-											</button>
-										</TooltipTrigger>
-										<TooltipContent side="bottom" align="start">
-											<p>
-												{t("updated") || "Updated"}:{" "}
-												{format(new Date(thread.updatedAt), "MMM d, yyyy 'at' h:mm a")}
-											</p>
-										</TooltipContent>
-									</Tooltip>
+										<Tooltip delayDuration={600}>
+											<TooltipTrigger asChild>
+												<button
+													type="button"
+													onClick={() => handleThreadClick(thread.id)}
+													disabled={isBusy}
+													className="flex items-center gap-2 flex-1 min-w-0 text-left overflow-hidden"
+												>
+													<span className="truncate">{thread.title || "New Chat"}</span>
+												</button>
+											</TooltipTrigger>
+											<TooltipContent side="bottom" align="start">
+												<p>
+													{t("updated") || "Updated"}:{" "}
+													{format(new Date(thread.updatedAt), "MMM d, yyyy 'at' h:mm a")}
+												</p>
+											</TooltipContent>
+										</Tooltip>
 									)}
 
 									<DropdownMenu
diff --git a/surfsense_web/components/markdown-viewer.tsx b/surfsense_web/components/markdown-viewer.tsx
index 1c39f03a0..2acf68e94 100644
--- a/surfsense_web/components/markdown-viewer.tsx
+++ b/surfsense_web/components/markdown-viewer.tsx
@@ -2,9 +2,9 @@ import { createCodePlugin } from "@streamdown/code";
 import { createMathPlugin } from "@streamdown/math";
 import { Streamdown, type StreamdownProps } from "streamdown";
 import "katex/dist/katex.min.css";
-import { cn } from "@/lib/utils";
-import Image from 'next/image';
 import { is } from "drizzle-orm";
+import Image from "next/image";
+import { cn } from "@/lib/utils";
 
 const code = createCodePlugin({
 	themes: ["nord", "nord"],
@@ -130,30 +130,31 @@ export function MarkdownViewer({ content, className, maxLength }: MarkdownViewer
 		),
 		hr: ({ ...props }) => <hr className="my-4 border-muted" {...props} />,
 		img: ({ src, alt, width: _w, height: _h, ...props }) => {
-    	const isDataOrUnknownUrl = typeof src === "string" && (src.startsWith("data:") || !src.startsWith("http"));
+			const isDataOrUnknownUrl =
+				typeof src === "string" && (src.startsWith("data:") || !src.startsWith("http"));
 
-    return isDataOrUnknownUrl ? (
-        // eslint-disable-next-line @next/next/no-img-element
-        <img
-            className="max-w-full h-auto my-4 rounded"
-            alt={alt || "markdown image"}
-            src={src}
-            loading="lazy"
-            {...props}
-        />
-    ) : (
-        <Image
-            className="max-w-full h-auto my-4 rounded"
-            alt={alt || "markdown image"}
-            src={typeof src === "string" ? src : ""}
-            width={_w || 800}
-            height={_h || 600}
-            sizes="(max-width: 768px) 100vw, (max-width: 1200px) 75vw, 60vw"
-            unoptimized={isDataOrUnknownUrl}
-            {...props}
-        />
-    );
-},
+			return isDataOrUnknownUrl ? (
+				// eslint-disable-next-line @next/next/no-img-element
+				<img
+					className="max-w-full h-auto my-4 rounded"
+					alt={alt || "markdown image"}
+					src={src}
+					loading="lazy"
+					{...props}
+				/>
+			) : (
+				<Image
+					className="max-w-full h-auto my-4 rounded"
+					alt={alt || "markdown image"}
+					src={typeof src === "string" ? src : ""}
+					width={_w || 800}
+					height={_h || 600}
+					sizes="(max-width: 768px) 100vw, (max-width: 1200px) 75vw, 60vw"
+					unoptimized={isDataOrUnknownUrl}
+					{...props}
+				/>
+			);
+		},
 		table: ({ ...props }) => (
 			<div className="overflow-x-auto my-4 rounded-lg border border-border w-full">
 				<table className="w-full divide-y divide-border" {...props} />
diff --git a/surfsense_web/components/settings/user-settings-dialog.tsx b/surfsense_web/components/settings/user-settings-dialog.tsx
index b74ff973b..0afdfb2b7 100644
--- a/surfsense_web/components/settings/user-settings-dialog.tsx
+++ b/surfsense_web/components/settings/user-settings-dialog.tsx
@@ -5,10 +5,10 @@ import { Globe, KeyRound, Monitor, Receipt, Sparkles, User } from "lucide-react"
 import { useTranslations } from "next-intl";
 import { ApiKeyContent } from "@/app/dashboard/[search_space_id]/user-settings/components/ApiKeyContent";
 import { CommunityPromptsContent } from "@/app/dashboard/[search_space_id]/user-settings/components/CommunityPromptsContent";
+import { DesktopContent } from "@/app/dashboard/[search_space_id]/user-settings/components/DesktopContent";
 import { ProfileContent } from "@/app/dashboard/[search_space_id]/user-settings/components/ProfileContent";
 import { PromptsContent } from "@/app/dashboard/[search_space_id]/user-settings/components/PromptsContent";
 import { PurchaseHistoryContent } from "@/app/dashboard/[search_space_id]/user-settings/components/PurchaseHistoryContent";
-import { DesktopContent } from "@/app/dashboard/[search_space_id]/user-settings/components/DesktopContent";
 import { userSettingsDialogAtom } from "@/atoms/settings/settings-dialog.atoms";
 import { SettingsDialog } from "@/components/settings/settings-dialog";
 
diff --git a/surfsense_web/components/sources/DocumentUploadTab.tsx b/surfsense_web/components/sources/DocumentUploadTab.tsx
index c8ce195aa..36a24e299 100644
--- a/surfsense_web/components/sources/DocumentUploadTab.tsx
+++ b/surfsense_web/components/sources/DocumentUploadTab.tsx
@@ -471,13 +471,13 @@ export function DocumentUploadTab({
 						</button>
 					))
 				) : (
-				<button
-					type="button"
-					className="flex flex-col items-center gap-4 py-12 px-4 cursor-pointer w-full bg-transparent border-none"
-					onClick={() => {
-						if (!isElectron) fileInputRef.current?.click();
-					}}
-				>
+					<button
+						type="button"
+						className="flex flex-col items-center gap-4 py-12 px-4 cursor-pointer w-full bg-transparent border-none"
+						onClick={() => {
+							if (!isElectron) fileInputRef.current?.click();
+						}}
+					>
 						<Upload className="h-10 w-10 text-muted-foreground" />
 						<div className="text-center space-y-1.5">
 							<p className="text-base font-medium">
@@ -485,10 +485,15 @@ export function DocumentUploadTab({
 							</p>
 							<p className="text-sm text-muted-foreground">{t("file_size_limit")}</p>
 						</div>
-					{/* biome-ignore lint/a11y/useSemanticElements: wrapper to stop click propagation to parent button */}
-					<div className="w-full mt-1" onClick={(e) => e.stopPropagation()} onKeyDown={(e) => e.stopPropagation()} role="group">
-						{renderBrowseButton({ fullWidth: true })}
-					</div>
+						{/* biome-ignore lint/a11y/useSemanticElements: wrapper to stop click propagation to parent button */}
+						<div
+							className="w-full mt-1"
+							onClick={(e) => e.stopPropagation()}
+							onKeyDown={(e) => e.stopPropagation()}
+							role="group"
+						>
+							{renderBrowseButton({ fullWidth: true })}
+						</div>
 					</button>
 				)}
 			</div>
@@ -684,17 +689,17 @@ export function DocumentUploadTab({
 						</span>
 					</AccordionTrigger>
 					<AccordionContent className="px-3 pb-3">
-					<div className="flex flex-wrap gap-1.5">
-						{supportedExtensions.map((ext) => (
-							<Badge
-								key={ext}
-								variant="secondary"
-								className="rounded border-0 bg-neutral-200/80 dark:bg-neutral-700/60 text-muted-foreground text-[10px] px-2 py-0.5 font-normal"
-							>
-								{ext}
-							</Badge>
-						))}
-					</div>
+						<div className="flex flex-wrap gap-1.5">
+							{supportedExtensions.map((ext) => (
+								<Badge
+									key={ext}
+									variant="secondary"
+									className="rounded border-0 bg-neutral-200/80 dark:bg-neutral-700/60 text-muted-foreground text-[10px] px-2 py-0.5 font-normal"
+								>
+									{ext}
+								</Badge>
+							))}
+						</div>
 					</AccordionContent>
 				</AccordionItem>
 			</Accordion>
diff --git a/surfsense_web/components/tool-ui/citation/citation-list.tsx b/surfsense_web/components/tool-ui/citation/citation-list.tsx
index 75b02bf3d..bbe869a09 100644
--- a/surfsense_web/components/tool-ui/citation/citation-list.tsx
+++ b/surfsense_web/components/tool-ui/citation/citation-list.tsx
@@ -2,13 +2,12 @@
 
 import type { LucideIcon } from "lucide-react";
 import { Code2, Database, ExternalLink, File, FileText, Globe, Newspaper } from "lucide-react";
+import NextImage from "next/image";
 import * as React from "react";
 import { openSafeNavigationHref, resolveSafeNavigationHref } from "../shared/media";
 import { cn, Popover, PopoverContent, PopoverTrigger } from "./_adapter";
 import { Citation } from "./citation";
 import type { CitationType, CitationVariant, SerializableCitation } from "./schema";
-import NextImage from 'next/image';
-
 
 const TYPE_ICONS: Record<CitationType, LucideIcon> = {
 	webpage: Globe,
@@ -264,9 +263,9 @@ function OverflowItem({ citation, onClick }: OverflowItemProps) {
 					className="size-4.5 rounded-full object-cover"
 					unoptimized={true}
 				/>
-				) : (
+			) : (
 				<TypeIcon className="text-muted-foreground size-3" aria-hidden="true" />
-				)}
+			)}
 			<div className="min-w-0 flex-1">
 				<p className="group-hover:decoration-foreground/30 truncate text-sm font-medium group-hover:underline group-hover:underline-offset-2">
 					{citation.title}
@@ -341,18 +340,18 @@ function StackedCitations({ id, citations, className, onNavigate }: StackedCitat
 										style={{ zIndex: maxIcons - index }}
 									>
 										{citation.favicon ? (
-										<NextImage
-											src={citation.favicon}
-											alt=""
-											aria-hidden="true"
-											width={18}
-											height={18}
-											className="size-4.5 rounded-full object-cover"
-											unoptimized={true}
-										/>
-									) : (
-										<TypeIcon className="text-muted-foreground size-3" aria-hidden="true" />
-									)}	
+											<NextImage
+												src={citation.favicon}
+												alt=""
+												aria-hidden="true"
+												width={18}
+												height={18}
+												className="size-4.5 rounded-full object-cover"
+												unoptimized={true}
+											/>
+										) : (
+											<TypeIcon className="text-muted-foreground size-3" aria-hidden="true" />
+										)}
 									</div>
 								);
 							})}
diff --git a/surfsense_web/components/tool-ui/citation/citation.tsx b/surfsense_web/components/tool-ui/citation/citation.tsx
index fa5d4d165..c60034a0a 100644
--- a/surfsense_web/components/tool-ui/citation/citation.tsx
+++ b/surfsense_web/components/tool-ui/citation/citation.tsx
@@ -2,11 +2,11 @@
 
 import type { LucideIcon } from "lucide-react";
 import { Code2, Database, ExternalLink, File, FileText, Globe, Newspaper } from "lucide-react";
+import NextImage from "next/image";
 import * as React from "react";
 import { openSafeNavigationHref, sanitizeHref } from "../shared/media";
 import { cn, Popover, PopoverContent, PopoverTrigger } from "./_adapter";
 import type { CitationType, CitationVariant, SerializableCitation } from "./schema";
-import NextImage from 'next/image';
 
 const FALLBACK_LOCALE = "en-US";
 
@@ -115,18 +115,18 @@ export function Citation(props: CitationProps) {
 	};
 
 	const iconElement = favicon ? (
-    <NextImage
-        src={favicon}
-        alt=""
-        aria-hidden="true"
-        width={16}
-        height={16}
-        className="bg-muted size-3.5 shrink-0 rounded object-cover"
-        unoptimized={true}
-    />
-) : (
-    <TypeIcon className="size-3.5 shrink-0 opacity-60" aria-hidden="true" />
-);
+		<NextImage
+			src={favicon}
+			alt=""
+			aria-hidden="true"
+			width={16}
+			height={16}
+			className="bg-muted size-3.5 shrink-0 rounded object-cover"
+			unoptimized={true}
+		/>
+	) : (
+		<TypeIcon className="size-3.5 shrink-0 opacity-60" aria-hidden="true" />
+	);
 
 	const { open, handleMouseEnter, handleMouseLeave } = useHoverPopover();
 
diff --git a/surfsense_web/components/ui/animated-tabs.tsx b/surfsense_web/components/ui/animated-tabs.tsx
index 0cd8dd54c..f26e5af8d 100644
--- a/surfsense_web/components/ui/animated-tabs.tsx
+++ b/surfsense_web/components/ui/animated-tabs.tsx
@@ -202,7 +202,10 @@ const Tabs = forwardRef<
 		},
 		[onValueChange, value]
 	);
-	const contextValue = useMemo(() => ({ activeValue, onValueChange: handleValueChange }), [activeValue, handleValueChange]);
+	const contextValue = useMemo(
+		() => ({ activeValue, onValueChange: handleValueChange }),
+		[activeValue, handleValueChange]
+	);
 	return (
 		<TabsContext.Provider value={contextValue}>
 			<div ref={ref} className={cn("tabs-container", className)} {...props}>
diff --git a/surfsense_web/components/ui/toggle-group.tsx b/surfsense_web/components/ui/toggle-group.tsx
index 860fa7a52..44a94234a 100644
--- a/surfsense_web/components/ui/toggle-group.tsx
+++ b/surfsense_web/components/ui/toggle-group.tsx
@@ -3,9 +3,9 @@
 import * as ToggleGroupPrimitive from "@radix-ui/react-toggle-group";
 import type { VariantProps } from "class-variance-authority";
 import * as React from "react";
+import { useMemo } from "react";
 import { toggleVariants } from "@/components/ui/toggle";
 import { cn } from "@/lib/utils";
-import { useMemo } from "react";
 
 const ToggleGroupContext = React.createContext<
 	VariantProps<typeof toggleVariants> & {
@@ -28,8 +28,8 @@ function ToggleGroup({
 	VariantProps<typeof toggleVariants> & {
 		spacing?: number;
 	}) {
-	const contextValue = useMemo(() => ({variant, size, spacing }), [variant, size, spacing]);
-	
+	const contextValue = useMemo(() => ({ variant, size, spacing }), [variant, size, spacing]);
+
 	return (
 		<ToggleGroupPrimitive.Root
 			data-slot="toggle-group"
@@ -43,9 +43,7 @@ function ToggleGroup({
 			)}
 			{...props}
 		>
-			<ToggleGroupContext.Provider value={contextValue}>
-				{children}
-			</ToggleGroupContext.Provider>
+			<ToggleGroupContext.Provider value={contextValue}>{children}</ToggleGroupContext.Provider>
 		</ToggleGroupPrimitive.Root>
 	);
 }
diff --git a/surfsense_web/contexts/LocaleContext.tsx b/surfsense_web/contexts/LocaleContext.tsx
index e67e9c2a5..484721cbc 100644
--- a/surfsense_web/contexts/LocaleContext.tsx
+++ b/surfsense_web/contexts/LocaleContext.tsx
@@ -2,12 +2,12 @@
 
 import type React from "react";
 import { createContext, useCallback, useContext, useEffect, useMemo, useState } from "react";
+import { set } from "zod";
 import enMessages from "../messages/en.json";
 import esMessages from "../messages/es.json";
 import hiMessages from "../messages/hi.json";
 import ptMessages from "../messages/pt.json";
 import zhMessages from "../messages/zh.json";
-import { set } from "zod";
 
 type Locale = "en" | "es" | "pt" | "hi" | "zh";
 
@@ -66,13 +66,12 @@ export function LocaleProvider({ children }: { children: React.ReactNode }) {
 		}
 	}, [locale, mounted]);
 
-	const contextValue = useMemo(() => ({ locale, messages, setLocale }), [locale, messages, setLocale]);
-
-	return (
-		<LocaleContext.Provider value={contextValue}>
-			{children}
-		</LocaleContext.Provider>
+	const contextValue = useMemo(
+		() => ({ locale, messages, setLocale }),
+		[locale, messages, setLocale]
 	);
+
+	return <LocaleContext.Provider value={contextValue}>{children}</LocaleContext.Provider>;
 }
 
 export function useLocaleContext() {
diff --git a/surfsense_web/types/window.d.ts b/surfsense_web/types/window.d.ts
index 6feb75463..a6959c32d 100644
--- a/surfsense_web/types/window.d.ts
+++ b/surfsense_web/types/window.d.ts
@@ -50,14 +50,21 @@ interface ElectronAPI {
 	replaceText: (text: string) => Promise<void>;
 	// Permissions
 	getPermissionsStatus: () => Promise<{
-		accessibility: 'authorized' | 'denied' | 'not determined' | 'restricted' | 'limited';
-		screenRecording: 'authorized' | 'denied' | 'not determined' | 'restricted' | 'limited';
+		accessibility: "authorized" | "denied" | "not determined" | "restricted" | "limited";
+		screenRecording: "authorized" | "denied" | "not determined" | "restricted" | "limited";
 	}>;
 	requestAccessibility: () => Promise<void>;
 	requestScreenRecording: () => Promise<void>;
 	restartApp: () => Promise<void>;
 	// Autocomplete
-	onAutocompleteContext: (callback: (data: { screenshot: string; searchSpaceId?: string; appName?: string; windowTitle?: string }) => void) => () => void;
+	onAutocompleteContext: (
+		callback: (data: {
+			screenshot: string;
+			searchSpaceId?: string;
+			appName?: string;
+			windowTitle?: string;
+		}) => void
+	) => () => void;
 	acceptSuggestion: (text: string) => Promise<void>;
 	dismissSuggestion: () => Promise<void>;
 	setAutocompleteEnabled: (enabled: boolean) => Promise<void>;

From 8d810467dd95a2cb7e34fa83324dc07d35801a04 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Tue, 7 Apr 2026 05:57:13 +0530
Subject: [PATCH 37/37] feat: add support for XHTML file conversion to
 markdown in document processors

---
 .../app/tasks/document_processors/_direct_converters.py      | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/surfsense_backend/app/tasks/document_processors/_direct_converters.py b/surfsense_backend/app/tasks/document_processors/_direct_converters.py
index b1a69ef4f..bbff4838e 100644
--- a/surfsense_backend/app/tasks/document_processors/_direct_converters.py
+++ b/surfsense_backend/app/tasks/document_processors/_direct_converters.py
@@ -4,8 +4,8 @@ Lossless file-to-markdown converters for text-based formats.
 These converters handle file types that can be faithfully represented as
 markdown without any external ETL/OCR service:
 
-- CSV / TSV  → markdown table  (stdlib ``csv``)
-- HTML / HTM → markdown        (``markdownify``)
+- CSV / TSV          → markdown table  (stdlib ``csv``)
+- HTML / HTM / XHTML → markdown        (``markdownify``)
 """
 
 from __future__ import annotations
@@ -73,6 +73,7 @@ _CONVERTER_MAP: dict[str, Callable[..., str]] = {
     ".tsv": tsv_to_markdown,
     ".html": html_to_markdown,
     ".htm": html_to_markdown,
+    ".xhtml": html_to_markdown,
 }