Mirror of https://github.com/MODSetter/SurfSense.git (synced 2026-04-30 03:16:25 +02:00)

Commit 82b5c7f19e: Merge commit '056fc0e7ff' into dev_mod
111 changed files with 4056 additions and 2219 deletions

@@ -225,6 +225,55 @@ class DropboxClient:
         return all_items, None

+    async def get_latest_cursor(self, path: str = "") -> tuple[str | None, str | None]:
+        """Get a cursor representing the current state of a folder.
+
+        Uses /2/files/list_folder/get_latest_cursor so we can later call
+        get_changes to receive only incremental updates.
+        """
+        resp = await self._request(
+            "/2/files/list_folder/get_latest_cursor",
+            {"path": path, "recursive": False, "include_non_downloadable_files": True},
+        )
+        if resp.status_code != 200:
+            return None, f"Failed to get cursor: {resp.status_code} - {resp.text}"
+        return resp.json().get("cursor"), None
+
+    async def get_changes(
+        self, cursor: str
+    ) -> tuple[list[dict[str, Any]], str | None, str | None]:
+        """Fetch incremental changes since the given cursor.
+
+        Calls /2/files/list_folder/continue and handles pagination.
+        Returns (entries, new_cursor, error).
+        """
+        all_entries: list[dict[str, Any]] = []
+
+        resp = await self._request("/2/files/list_folder/continue", {"cursor": cursor})
+        if resp.status_code == 401:
+            return [], None, "Dropbox authentication expired (401)"
+        if resp.status_code != 200:
+            return [], None, f"Failed to get changes: {resp.status_code} - {resp.text}"
+
+        data = resp.json()
+        all_entries.extend(data.get("entries", []))
+
+        while data.get("has_more"):
+            cursor = data["cursor"]
+            resp = await self._request(
+                "/2/files/list_folder/continue", {"cursor": cursor}
+            )
+            if resp.status_code != 200:
+                return (
+                    all_entries,
+                    data.get("cursor"),
+                    f"Pagination failed: {resp.status_code}",
+                )
+            data = resp.json()
+            all_entries.extend(data.get("entries", []))
+
+        return all_entries, data.get("cursor"), None
+
     async def get_metadata(self, path: str) -> tuple[dict[str, Any] | None, str | None]:
         resp = await self._request("/2/files/get_metadata", {"path": path})
         if resp.status_code != 200:
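A minimal sketch of the cursor flow these two methods enable (illustrative helper, assumes an authenticated DropboxClient; it mirrors the delta-sync task added later in this commit):

async def sync_incrementally(client, stored_cursor: str | None, path: str = "") -> str:
    # First sync: record the folder's current state without walking it again.
    if stored_cursor is None:
        cursor, err = await client.get_latest_cursor(path)
        if err:
            raise RuntimeError(err)
        return cursor

    # Subsequent syncs: fetch only what changed since the stored cursor.
    entries, new_cursor, err = await client.get_changes(stored_cursor)
    if err:
        raise RuntimeError(err)
    for entry in entries:
        if entry.get(".tag") == "deleted":
            ...  # remove the matching indexed document
        elif entry.get(".tag") == "file":
            ...  # download and (re-)index this file
    return new_cursor or stored_cursor  # persist for the next run
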
@@ -53,7 +53,8 @@ async def download_and_extract_content(
     file_name = file.get("name", "Unknown")
     file_id = file.get("id", "")

-    if should_skip_file(file):
+    skip, _unsup_ext = should_skip_file(file)
+    if skip:
         return None, {}, "Skipping non-indexable item"

     logger.info(f"Downloading file for content extraction: {file_name}")
@@ -87,9 +88,13 @@ async def download_and_extract_content(
         if error:
             return None, metadata, error

-        from app.connectors.onedrive.content_extractor import _parse_file_to_markdown
+        from app.etl_pipeline.etl_document import EtlRequest
+        from app.etl_pipeline.etl_pipeline_service import EtlPipelineService

-        markdown = await _parse_file_to_markdown(temp_file_path, file_name)
+        result = await EtlPipelineService().extract(
+            EtlRequest(file_path=temp_file_path, filename=file_name)
+        )
+        markdown = result.markdown_content
         return markdown, metadata, None

     except Exception as e:
@@ -1,8 +1,8 @@
 """File type handlers for Dropbox."""

-PAPER_EXTENSION = ".paper"
+from app.etl_pipeline.file_classifier import should_skip_for_service

-SKIP_EXTENSIONS: frozenset[str] = frozenset()
+PAPER_EXTENSION = ".paper"

 MIME_TO_EXTENSION: dict[str, str] = {
     "application/pdf": ".pdf",
@@ -42,17 +42,25 @@ def is_paper_file(item: dict) -> bool:
     return ext == PAPER_EXTENSION


-def should_skip_file(item: dict) -> bool:
+def should_skip_file(item: dict) -> tuple[bool, str | None]:
     """Skip folders and truly non-indexable files.

     Paper docs are non-downloadable but exportable, so they are NOT skipped.
+
+    Returns (should_skip, unsupported_extension_or_None).
     """
     if is_folder(item):
-        return True
+        return True, None
     if is_paper_file(item):
-        return False
+        return False, None
     if not item.get("is_downloadable", True):
-        return True
+        return True, None
+
+    from pathlib import PurePosixPath
+
+    from app.config import config as app_config

     name = item.get("name", "")
-    ext = get_extension_from_name(name).lower()
-    return ext in SKIP_EXTENSIONS
+    if should_skip_for_service(name, app_config.ETL_SERVICE):
+        ext = PurePosixPath(name).suffix.lower()
+        return True, ext
+    return False, None
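To make the new contract concrete, a few hypothetical calls (the item dicts are illustrative, and which extensions count as unsupported depends on the configured ETL_SERVICE):

should_skip_file({"name": "spec.paper", "is_downloadable": False})  # (False, None): Paper docs are exported, not skipped
should_skip_file({"name": "scan.pdf", "is_downloadable": True})     # (False, None): assuming .pdf is supported by the parser
should_skip_file({"name": "clip.avi", "is_downloadable": True})     # (True, ".avi"): extension no parser handles
should_skip_file({"name": "raw.bin", "is_downloadable": False})     # (True, None): not downloadable and not a Paper doc
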
@@ -64,7 +64,9 @@ async def get_files_in_folder(
             )
                 continue
             files.extend(sub_files)
-        elif not should_skip_file(item):
+        else:
+            skip, _unsup_ext = should_skip_file(item)
+            if not skip:
                 files.append(item)

     return files, None
@@ -1,12 +1,9 @@
 """Content extraction for Google Drive files."""

-import asyncio
 import contextlib
 import logging
 import os
 import tempfile
-import threading
-import time
 from pathlib import Path
 from typing import Any
@@ -20,6 +17,7 @@ from .file_types import (
     get_export_mime_type,
     get_extension_from_mime,
     is_google_workspace_file,
+    should_skip_by_extension,
     should_skip_file,
 )
@@ -45,6 +43,11 @@ async def download_and_extract_content(
     if should_skip_file(mime_type):
         return None, {}, f"Skipping {mime_type}"

+    if not is_google_workspace_file(mime_type):
+        ext_skip, _unsup_ext = should_skip_by_extension(file_name)
+        if ext_skip:
+            return None, {}, f"Skipping unsupported extension: {file_name}"
+
     logger.info(f"Downloading file for content extraction: {file_name} ({mime_type})")

     drive_metadata: dict[str, Any] = {
@@ -97,7 +100,10 @@ async def download_and_extract_content(
         if error:
             return None, drive_metadata, error

-        markdown = await _parse_file_to_markdown(temp_file_path, file_name)
+        etl_filename = (
+            file_name + extension if is_google_workspace_file(mime_type) else file_name
+        )
+        markdown = await _parse_file_to_markdown(temp_file_path, etl_filename)
         return markdown, drive_metadata, None

     except Exception as e:
@@ -110,99 +116,14 @@ async def download_and_extract_content(


 async def _parse_file_to_markdown(file_path: str, filename: str) -> str:
-    """Parse a local file to markdown using the configured ETL service."""
-    lower = filename.lower()
-
-    if lower.endswith((".md", ".markdown", ".txt")):
-        with open(file_path, encoding="utf-8") as f:
-            return f.read()
-
-    if lower.endswith((".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm")):
-        from litellm import atranscription
-
-        from app.config import config as app_config
-
-        stt_service_type = (
-            "local"
-            if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/")
-            else "external"
-        )
-        if stt_service_type == "local":
-            from app.services.stt_service import stt_service
-
-            t0 = time.monotonic()
-            logger.info(
-                f"[local-stt] START file={filename} thread={threading.current_thread().name}"
-            )
-            result = await asyncio.to_thread(stt_service.transcribe_file, file_path)
-            logger.info(
-                f"[local-stt] END file={filename} elapsed={time.monotonic() - t0:.2f}s"
-            )
-            text = result.get("text", "")
-        else:
-            with open(file_path, "rb") as audio_file:
-                kwargs: dict[str, Any] = {
-                    "model": app_config.STT_SERVICE,
-                    "file": audio_file,
-                    "api_key": app_config.STT_SERVICE_API_KEY,
-                }
-                if app_config.STT_SERVICE_API_BASE:
-                    kwargs["api_base"] = app_config.STT_SERVICE_API_BASE
-                resp = await atranscription(**kwargs)
-            text = resp.get("text", "")
-
-        if not text:
-            raise ValueError("Transcription returned empty text")
-        return f"# Transcription of {filename}\n\n{text}"
-
-    # Document files -- use configured ETL service
-    from app.config import config as app_config
-
-    if app_config.ETL_SERVICE == "UNSTRUCTURED":
-        from langchain_unstructured import UnstructuredLoader
-
-        from app.utils.document_converters import convert_document_to_markdown
-
-        loader = UnstructuredLoader(
-            file_path,
-            mode="elements",
-            post_processors=[],
-            languages=["eng"],
-            include_orig_elements=False,
-            include_metadata=False,
-            strategy="auto",
-        )
-        docs = await loader.aload()
-        return await convert_document_to_markdown(docs)
-
-    if app_config.ETL_SERVICE == "LLAMACLOUD":
-        from app.tasks.document_processors.file_processors import (
-            parse_with_llamacloud_retry,
-        )
-
-        result = await parse_with_llamacloud_retry(
-            file_path=file_path, estimated_pages=50
-        )
-        markdown_documents = await result.aget_markdown_documents(split_by_page=False)
-        if not markdown_documents:
-            raise RuntimeError(f"LlamaCloud returned no documents for {filename}")
-        return markdown_documents[0].text
-
-    if app_config.ETL_SERVICE == "DOCLING":
-        from docling.document_converter import DocumentConverter
-
-        converter = DocumentConverter()
-        t0 = time.monotonic()
-        logger.info(
-            f"[docling] START file={filename} thread={threading.current_thread().name}"
-        )
-        result = await asyncio.to_thread(converter.convert, file_path)
-        logger.info(
-            f"[docling] END file={filename} elapsed={time.monotonic() - t0:.2f}s"
-        )
-        return result.document.export_to_markdown()
-
-    raise RuntimeError(f"Unknown ETL_SERVICE: {app_config.ETL_SERVICE}")
+    """Parse a local file to markdown using the unified ETL pipeline."""
+    from app.etl_pipeline.etl_document import EtlRequest
+    from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
+
+    result = await EtlPipelineService().extract(
+        EtlRequest(file_path=file_path, filename=filename)
+    )
+    return result.markdown_content


 async def download_and_process_file(
@@ -236,10 +157,14 @@ async def download_and_process_file(
     file_name = file.get("name", "Unknown")
     mime_type = file.get("mimeType", "")

-    # Skip folders and shortcuts
     if should_skip_file(mime_type):
         return None, f"Skipping {mime_type}", None

+    if not is_google_workspace_file(mime_type):
+        ext_skip, _unsup_ext = should_skip_by_extension(file_name)
+        if ext_skip:
+            return None, f"Skipping unsupported extension: {file_name}", None
+
     logger.info(f"Downloading file: {file_name} ({mime_type})")

     temp_file_path = None
@@ -310,10 +235,13 @@ async def download_and_process_file(
             "."
         )[-1]

+        etl_filename = (
+            file_name + extension if is_google_workspace_file(mime_type) else file_name
+        )
         logger.info(f"Processing {file_name} with Surfsense's file processor")
         await process_file_in_background(
             file_path=temp_file_path,
-            filename=file_name,
+            filename=etl_filename,
             search_space_id=search_space_id,
             user_id=user_id,
             session=session,
@@ -1,5 +1,7 @@
 """File type handlers for Google Drive."""

+from app.etl_pipeline.file_classifier import should_skip_for_service
+
 GOOGLE_DOC = "application/vnd.google-apps.document"
 GOOGLE_SHEET = "application/vnd.google-apps.spreadsheet"
 GOOGLE_SLIDE = "application/vnd.google-apps.presentation"
@@ -46,6 +48,21 @@ def should_skip_file(mime_type: str) -> bool:
     return mime_type in [GOOGLE_FOLDER, GOOGLE_SHORTCUT]


+def should_skip_by_extension(filename: str) -> tuple[bool, str | None]:
+    """Check if the file extension is not parseable by the configured ETL service.
+
+    Returns (should_skip, unsupported_extension_or_None).
+    """
+    from pathlib import PurePosixPath
+
+    from app.config import config as app_config
+
+    if should_skip_for_service(filename, app_config.ETL_SERVICE):
+        ext = PurePosixPath(filename).suffix.lower()
+        return True, ext
+    return False, None
+
+
 def get_export_mime_type(mime_type: str) -> str | None:
     """Get export MIME type for Google Workspace files."""
     return EXPORT_FORMATS.get(mime_type)
@@ -1,16 +1,9 @@
-"""Content extraction for OneDrive files.
-
-Reuses the same ETL parsing logic as Google Drive since file parsing is
-extension-based, not provider-specific.
-"""
+"""Content extraction for OneDrive files."""

-import asyncio
 import contextlib
 import logging
 import os
 import tempfile
-import threading
-import time
 from pathlib import Path
 from typing import Any
@@ -31,7 +24,8 @@ async def download_and_extract_content(
     item_id = file.get("id")
     file_name = file.get("name", "Unknown")

-    if should_skip_file(file):
+    skip, _unsup_ext = should_skip_file(file)
+    if skip:
         return None, {}, "Skipping non-indexable item"

     file_info = file.get("file", {})
@@ -84,98 +78,11 @@ async def download_and_extract_content(


 async def _parse_file_to_markdown(file_path: str, filename: str) -> str:
-    """Parse a local file to markdown using the configured ETL service.
-
-    Same logic as Google Drive -- file parsing is extension-based.
-    """
-    lower = filename.lower()
-
-    if lower.endswith((".md", ".markdown", ".txt")):
-        with open(file_path, encoding="utf-8") as f:
-            return f.read()
-
-    if lower.endswith((".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm")):
-        from litellm import atranscription
-
-        from app.config import config as app_config
-
-        stt_service_type = (
-            "local"
-            if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/")
-            else "external"
-        )
-        if stt_service_type == "local":
-            from app.services.stt_service import stt_service
-
-            t0 = time.monotonic()
-            logger.info(
-                f"[local-stt] START file={filename} thread={threading.current_thread().name}"
-            )
-            result = await asyncio.to_thread(stt_service.transcribe_file, file_path)
-            logger.info(
-                f"[local-stt] END file={filename} elapsed={time.monotonic() - t0:.2f}s"
-            )
-            text = result.get("text", "")
-        else:
-            with open(file_path, "rb") as audio_file:
-                kwargs: dict[str, Any] = {
-                    "model": app_config.STT_SERVICE,
-                    "file": audio_file,
-                    "api_key": app_config.STT_SERVICE_API_KEY,
-                }
-                if app_config.STT_SERVICE_API_BASE:
-                    kwargs["api_base"] = app_config.STT_SERVICE_API_BASE
-                resp = await atranscription(**kwargs)
-            text = resp.get("text", "")
-
-        if not text:
-            raise ValueError("Transcription returned empty text")
-        return f"# Transcription of {filename}\n\n{text}"
-
-    from app.config import config as app_config
-
-    if app_config.ETL_SERVICE == "UNSTRUCTURED":
-        from langchain_unstructured import UnstructuredLoader
-
-        from app.utils.document_converters import convert_document_to_markdown
-
-        loader = UnstructuredLoader(
-            file_path,
-            mode="elements",
-            post_processors=[],
-            languages=["eng"],
-            include_orig_elements=False,
-            include_metadata=False,
-            strategy="auto",
-        )
-        docs = await loader.aload()
-        return await convert_document_to_markdown(docs)
-
-    if app_config.ETL_SERVICE == "LLAMACLOUD":
-        from app.tasks.document_processors.file_processors import (
-            parse_with_llamacloud_retry,
-        )
-
-        result = await parse_with_llamacloud_retry(
-            file_path=file_path, estimated_pages=50
-        )
-        markdown_documents = await result.aget_markdown_documents(split_by_page=False)
-        if not markdown_documents:
-            raise RuntimeError(f"LlamaCloud returned no documents for {filename}")
-        return markdown_documents[0].text
-
-    if app_config.ETL_SERVICE == "DOCLING":
-        from docling.document_converter import DocumentConverter
-
-        converter = DocumentConverter()
-        t0 = time.monotonic()
-        logger.info(
-            f"[docling] START file={filename} thread={threading.current_thread().name}"
-        )
-        result = await asyncio.to_thread(converter.convert, file_path)
-        logger.info(
-            f"[docling] END file={filename} elapsed={time.monotonic() - t0:.2f}s"
-        )
-        return result.document.export_to_markdown()
-
-    raise RuntimeError(f"Unknown ETL_SERVICE: {app_config.ETL_SERVICE}")
+    """Parse a local file to markdown using the unified ETL pipeline."""
+    from app.etl_pipeline.etl_document import EtlRequest
+    from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
+
+    result = await EtlPipelineService().extract(
+        EtlRequest(file_path=file_path, filename=filename)
+    )
+    return result.markdown_content
@@ -1,5 +1,7 @@
 """File type handlers for Microsoft OneDrive."""

+from app.etl_pipeline.file_classifier import should_skip_for_service
+
 ONEDRIVE_FOLDER_FACET = "folder"
 ONENOTE_MIME = "application/msonenote"
@@ -38,13 +40,28 @@ def is_folder(item: dict) -> bool:
     return ONEDRIVE_FOLDER_FACET in item


-def should_skip_file(item: dict) -> bool:
-    """Skip folders, OneNote files, remote items (shared links), and packages."""
+def should_skip_file(item: dict) -> tuple[bool, str | None]:
+    """Skip folders, OneNote files, remote items, packages, and unsupported extensions.
+
+    Returns (should_skip, unsupported_extension_or_None).
+    The second element is only set when the skip is due to an unsupported extension.
+    """
     if is_folder(item):
-        return True
+        return True, None
     if "remoteItem" in item:
-        return True
+        return True, None
     if "package" in item:
-        return True
+        return True, None
     mime = item.get("file", {}).get("mimeType", "")
-    return mime in SKIP_MIME_TYPES
+    if mime in SKIP_MIME_TYPES:
+        return True, None
+
+    from pathlib import PurePosixPath
+
+    from app.config import config as app_config
+
+    name = item.get("name", "")
+    if should_skip_for_service(name, app_config.ETL_SERVICE):
+        ext = PurePosixPath(name).suffix.lower()
+        return True, ext
+    return False, None
@@ -71,7 +71,9 @@ async def get_files_in_folder(
             )
                 continue
             files.extend(sub_files)
-        elif not should_skip_file(item):
+        else:
+            skip, _unsup_ext = should_skip_file(item)
+            if not skip:
                 files.append(item)

     return files, None
surfsense_backend/app/etl_pipeline/__init__.py (new file, 0 lines)

surfsense_backend/app/etl_pipeline/constants.py (new file, 39 lines)
@@ -0,0 +1,39 @@
+import ssl
+
+import httpx
+
+LLAMACLOUD_MAX_RETRIES = 5
+LLAMACLOUD_BASE_DELAY = 10
+LLAMACLOUD_MAX_DELAY = 120
+LLAMACLOUD_RETRYABLE_EXCEPTIONS = (
+    ssl.SSLError,
+    httpx.ConnectError,
+    httpx.ConnectTimeout,
+    httpx.ReadError,
+    httpx.ReadTimeout,
+    httpx.WriteError,
+    httpx.WriteTimeout,
+    httpx.RemoteProtocolError,
+    httpx.LocalProtocolError,
+    ConnectionError,
+    ConnectionResetError,
+    TimeoutError,
+    OSError,
+)
+
+UPLOAD_BYTES_PER_SECOND_SLOW = 100 * 1024
+MIN_UPLOAD_TIMEOUT = 120
+MAX_UPLOAD_TIMEOUT = 1800
+BASE_JOB_TIMEOUT = 600
+PER_PAGE_JOB_TIMEOUT = 60
+
+
+def calculate_upload_timeout(file_size_bytes: int) -> float:
+    estimated_time = (file_size_bytes / UPLOAD_BYTES_PER_SECOND_SLOW) * 1.5
+    return max(MIN_UPLOAD_TIMEOUT, min(estimated_time, MAX_UPLOAD_TIMEOUT))
+
+
+def calculate_job_timeout(estimated_pages: int, file_size_bytes: int) -> float:
+    page_based_timeout = BASE_JOB_TIMEOUT + (estimated_pages * PER_PAGE_JOB_TIMEOUT)
+    size_based_timeout = BASE_JOB_TIMEOUT + (file_size_bytes / (10 * 1024 * 1024)) * 60
+    return max(page_based_timeout, size_based_timeout)
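A rough worked example of the two timeout helpers above (the numbers follow directly from the constants; illustration only):

file_size_bytes = 50 * 1024 * 1024   # a 50 MB upload
estimated_pages = 100

# 52,428,800 B / 102,400 B/s * 1.5 = 768 s, clamped to [120, 1800] -> 768 s
upload_timeout = calculate_upload_timeout(file_size_bytes)

# max(600 + 100 * 60, 600 + (50 / 10) * 60) = max(6600, 900) -> 6600 s
job_timeout = calculate_job_timeout(estimated_pages, file_size_bytes)
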
surfsense_backend/app/etl_pipeline/etl_document.py (new file, 21 lines)
@@ -0,0 +1,21 @@
+from pydantic import BaseModel, field_validator
+
+
+class EtlRequest(BaseModel):
+    file_path: str
+    filename: str
+    estimated_pages: int = 0
+
+    @field_validator("filename")
+    @classmethod
+    def filename_must_not_be_empty(cls, v: str) -> str:
+        if not v.strip():
+            raise ValueError("filename must not be empty")
+        return v
+
+
+class EtlResult(BaseModel):
+    markdown_content: str
+    etl_service: str
+    actual_pages: int = 0
+    content_type: str
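A small sanity check of the request model (standard pydantic behaviour; the values are made up):

from pydantic import ValidationError

EtlRequest(file_path="/tmp/report.pdf", filename="report.pdf")  # ok, estimated_pages defaults to 0

try:
    EtlRequest(file_path="/tmp/report.pdf", filename="   ")
except ValidationError:
    pass  # the filename validator rejects blank names before any parsing starts
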
surfsense_backend/app/etl_pipeline/etl_pipeline_service.py (new file, 90 lines)
@@ -0,0 +1,90 @@
+from app.config import config as app_config
+from app.etl_pipeline.etl_document import EtlRequest, EtlResult
+from app.etl_pipeline.exceptions import (
+    EtlServiceUnavailableError,
+    EtlUnsupportedFileError,
+)
+from app.etl_pipeline.file_classifier import FileCategory, classify_file
+from app.etl_pipeline.parsers.audio import transcribe_audio
+from app.etl_pipeline.parsers.direct_convert import convert_file_directly
+from app.etl_pipeline.parsers.plaintext import read_plaintext
+
+
+class EtlPipelineService:
+    """Single pipeline for extracting markdown from files. All callers use this."""
+
+    async def extract(self, request: EtlRequest) -> EtlResult:
+        category = classify_file(request.filename)
+
+        if category == FileCategory.UNSUPPORTED:
+            raise EtlUnsupportedFileError(
+                f"File type not supported for parsing: {request.filename}"
+            )
+
+        if category == FileCategory.PLAINTEXT:
+            content = read_plaintext(request.file_path)
+            return EtlResult(
+                markdown_content=content,
+                etl_service="PLAINTEXT",
+                content_type="plaintext",
+            )
+
+        if category == FileCategory.DIRECT_CONVERT:
+            content = convert_file_directly(request.file_path, request.filename)
+            return EtlResult(
+                markdown_content=content,
+                etl_service="DIRECT_CONVERT",
+                content_type="direct_convert",
+            )
+
+        if category == FileCategory.AUDIO:
+            content = await transcribe_audio(request.file_path, request.filename)
+            return EtlResult(
+                markdown_content=content,
+                etl_service="AUDIO",
+                content_type="audio",
+            )
+
+        return await self._extract_document(request)
+
+    async def _extract_document(self, request: EtlRequest) -> EtlResult:
+        from pathlib import PurePosixPath
+
+        from app.utils.file_extensions import get_document_extensions_for_service
+
+        etl_service = app_config.ETL_SERVICE
+        if not etl_service:
+            raise EtlServiceUnavailableError(
+                "No ETL_SERVICE configured. "
+                "Set ETL_SERVICE to UNSTRUCTURED, LLAMACLOUD, or DOCLING in your .env"
+            )
+
+        ext = PurePosixPath(request.filename).suffix.lower()
+        supported = get_document_extensions_for_service(etl_service)
+        if ext not in supported:
+            raise EtlUnsupportedFileError(
+                f"File type {ext} is not supported by {etl_service}"
+            )
+
+        if etl_service == "DOCLING":
+            from app.etl_pipeline.parsers.docling import parse_with_docling
+
+            content = await parse_with_docling(request.file_path, request.filename)
+        elif etl_service == "UNSTRUCTURED":
+            from app.etl_pipeline.parsers.unstructured import parse_with_unstructured
+
+            content = await parse_with_unstructured(request.file_path)
+        elif etl_service == "LLAMACLOUD":
+            from app.etl_pipeline.parsers.llamacloud import parse_with_llamacloud
+
+            content = await parse_with_llamacloud(
+                request.file_path, request.estimated_pages
+            )
+        else:
+            raise EtlServiceUnavailableError(f"Unknown ETL_SERVICE: {etl_service}")
+
+        return EtlResult(
+            markdown_content=content,
+            etl_service=etl_service,
+            content_type="document",
+        )
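A minimal caller sketch (an illustrative helper, not part of the commit; it mirrors how the connector code above uses the pipeline):

from app.etl_pipeline.etl_document import EtlRequest
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
from app.etl_pipeline.exceptions import EtlUnsupportedFileError


async def extract_markdown(path: str, name: str) -> str | None:
    try:
        result = await EtlPipelineService().extract(
            EtlRequest(file_path=path, filename=name, estimated_pages=10)
        )
    except EtlUnsupportedFileError:
        return None  # callers count this toward unsupported_count
    return result.markdown_content
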
surfsense_backend/app/etl_pipeline/exceptions.py (new file, 10 lines)
@@ -0,0 +1,10 @@
+class EtlParseError(Exception):
+    """Raised when an ETL parser fails to produce content."""
+
+
+class EtlServiceUnavailableError(Exception):
+    """Raised when the configured ETL_SERVICE is not recognised."""
+
+
+class EtlUnsupportedFileError(Exception):
+    """Raised when a file type cannot be parsed by any ETL pipeline."""
surfsense_backend/app/etl_pipeline/file_classifier.py (new file, 137 lines)
@@ -0,0 +1,137 @@
+from enum import Enum
+from pathlib import PurePosixPath
+
+from app.utils.file_extensions import (
+    DOCUMENT_EXTENSIONS,
+    get_document_extensions_for_service,
+)
+
+PLAINTEXT_EXTENSIONS = frozenset(
+    {
+        ".md",
+        ".markdown",
+        ".txt",
+        ".text",
+        ".json",
+        ".jsonl",
+        ".yaml",
+        ".yml",
+        ".toml",
+        ".ini",
+        ".cfg",
+        ".conf",
+        ".xml",
+        ".css",
+        ".scss",
+        ".less",
+        ".sass",
+        ".py",
+        ".pyw",
+        ".pyi",
+        ".pyx",
+        ".js",
+        ".jsx",
+        ".ts",
+        ".tsx",
+        ".mjs",
+        ".cjs",
+        ".java",
+        ".kt",
+        ".kts",
+        ".scala",
+        ".groovy",
+        ".c",
+        ".h",
+        ".cpp",
+        ".cxx",
+        ".cc",
+        ".hpp",
+        ".hxx",
+        ".cs",
+        ".fs",
+        ".fsx",
+        ".go",
+        ".rs",
+        ".rb",
+        ".php",
+        ".pl",
+        ".pm",
+        ".lua",
+        ".swift",
+        ".m",
+        ".mm",
+        ".r",
+        ".jl",
+        ".sh",
+        ".bash",
+        ".zsh",
+        ".fish",
+        ".bat",
+        ".cmd",
+        ".ps1",
+        ".sql",
+        ".graphql",
+        ".gql",
+        ".env",
+        ".gitignore",
+        ".dockerignore",
+        ".editorconfig",
+        ".makefile",
+        ".cmake",
+        ".log",
+        ".rst",
+        ".tex",
+        ".bib",
+        ".org",
+        ".adoc",
+        ".asciidoc",
+        ".vue",
+        ".svelte",
+        ".astro",
+        ".tf",
+        ".hcl",
+        ".proto",
+    }
+)
+
+AUDIO_EXTENSIONS = frozenset(
+    {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm"}
+)
+
+DIRECT_CONVERT_EXTENSIONS = frozenset({".csv", ".tsv", ".html", ".htm", ".xhtml"})
+
+
+class FileCategory(Enum):
+    PLAINTEXT = "plaintext"
+    AUDIO = "audio"
+    DIRECT_CONVERT = "direct_convert"
+    UNSUPPORTED = "unsupported"
+    DOCUMENT = "document"
+
+
+def classify_file(filename: str) -> FileCategory:
+    suffix = PurePosixPath(filename).suffix.lower()
+    if suffix in PLAINTEXT_EXTENSIONS:
+        return FileCategory.PLAINTEXT
+    if suffix in AUDIO_EXTENSIONS:
+        return FileCategory.AUDIO
+    if suffix in DIRECT_CONVERT_EXTENSIONS:
+        return FileCategory.DIRECT_CONVERT
+    if suffix in DOCUMENT_EXTENSIONS:
+        return FileCategory.DOCUMENT
+    return FileCategory.UNSUPPORTED
+
+
+def should_skip_for_service(filename: str, etl_service: str | None) -> bool:
+    """Return True if *filename* cannot be processed by *etl_service*.
+
+    Plaintext, audio, and direct-convert files are parser-agnostic and never
+    skipped. Document files are checked against the per-parser extension set.
+    """
+    category = classify_file(filename)
+    if category == FileCategory.UNSUPPORTED:
+        return True
+    if category == FileCategory.DOCUMENT:
+        suffix = PurePosixPath(filename).suffix.lower()
+        return suffix not in get_document_extensions_for_service(etl_service)
+    return False
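A few illustrative classifications (assuming .pdf is in DOCUMENT_EXTENSIONS and in the configured parser's set):

classify_file("README.md")   # FileCategory.PLAINTEXT: read as-is
classify_file("call.mp3")    # FileCategory.AUDIO: transcribed
classify_file("table.csv")   # FileCategory.DIRECT_CONVERT
classify_file("report.pdf")  # FileCategory.DOCUMENT: handed to the ETL_SERVICE parser
classify_file("binary.exe")  # FileCategory.UNSUPPORTED

should_skip_for_service("README.md", "DOCLING")   # False: plaintext is parser-agnostic
should_skip_for_service("binary.exe", "DOCLING")  # True: no parser handles it
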
surfsense_backend/app/etl_pipeline/parsers/__init__.py (new file, 0 lines)

surfsense_backend/app/etl_pipeline/parsers/audio.py (new file, 34 lines)
@@ -0,0 +1,34 @@
+from litellm import atranscription
+
+from app.config import config as app_config
+
+
+async def transcribe_audio(file_path: str, filename: str) -> str:
+    stt_service_type = (
+        "local"
+        if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/")
+        else "external"
+    )
+
+    if stt_service_type == "local":
+        from app.services.stt_service import stt_service
+
+        result = stt_service.transcribe_file(file_path)
+        text = result.get("text", "")
+        if not text:
+            raise ValueError("Transcription returned empty text")
+    else:
+        with open(file_path, "rb") as audio_file:
+            kwargs: dict = {
+                "model": app_config.STT_SERVICE,
+                "file": audio_file,
+                "api_key": app_config.STT_SERVICE_API_KEY,
+            }
+            if app_config.STT_SERVICE_API_BASE:
+                kwargs["api_base"] = app_config.STT_SERVICE_API_BASE
+            response = await atranscription(**kwargs)
+        text = response.get("text", "")
+        if not text:
+            raise ValueError("Transcription returned empty text")
+
+    return f"# Transcription of {filename}\n\n{text}"
surfsense_backend/app/etl_pipeline/parsers/direct_convert.py (new file, 3 lines)
@@ -0,0 +1,3 @@
+from app.tasks.document_processors._direct_converters import convert_file_directly
+
+__all__ = ["convert_file_directly"]
surfsense_backend/app/etl_pipeline/parsers/docling.py (new file, 26 lines)
@@ -0,0 +1,26 @@
+import warnings
+from logging import ERROR, getLogger
+
+
+async def parse_with_docling(file_path: str, filename: str) -> str:
+    from app.services.docling_service import create_docling_service
+
+    docling_service = create_docling_service()
+
+    pdfminer_logger = getLogger("pdfminer")
+    original_level = pdfminer_logger.level
+
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", category=UserWarning, module="pdfminer")
+        warnings.filterwarnings(
+            "ignore", message=".*Cannot set gray non-stroke color.*"
+        )
+        warnings.filterwarnings("ignore", message=".*invalid float value.*")
+        pdfminer_logger.setLevel(ERROR)
+
+        try:
+            result = await docling_service.process_document(file_path, filename)
+        finally:
+            pdfminer_logger.setLevel(original_level)
+
+    return result["content"]
surfsense_backend/app/etl_pipeline/parsers/llamacloud.py (new file, 123 lines)
@@ -0,0 +1,123 @@
+import asyncio
+import logging
+import os
+import random
+
+import httpx
+
+from app.config import config as app_config
+from app.etl_pipeline.constants import (
+    LLAMACLOUD_BASE_DELAY,
+    LLAMACLOUD_MAX_DELAY,
+    LLAMACLOUD_MAX_RETRIES,
+    LLAMACLOUD_RETRYABLE_EXCEPTIONS,
+    PER_PAGE_JOB_TIMEOUT,
+    calculate_job_timeout,
+    calculate_upload_timeout,
+)
+
+
+async def parse_with_llamacloud(file_path: str, estimated_pages: int) -> str:
+    from llama_cloud_services import LlamaParse
+    from llama_cloud_services.parse.utils import ResultType
+
+    file_size_bytes = os.path.getsize(file_path)
+    file_size_mb = file_size_bytes / (1024 * 1024)
+
+    upload_timeout = calculate_upload_timeout(file_size_bytes)
+    job_timeout = calculate_job_timeout(estimated_pages, file_size_bytes)
+
+    custom_timeout = httpx.Timeout(
+        connect=120.0,
+        read=upload_timeout,
+        write=upload_timeout,
+        pool=120.0,
+    )
+
+    logging.info(
+        f"LlamaCloud upload configured: file_size={file_size_mb:.1f}MB, "
+        f"pages={estimated_pages}, upload_timeout={upload_timeout:.0f}s, "
+        f"job_timeout={job_timeout:.0f}s"
+    )
+
+    last_exception = None
+    attempt_errors: list[str] = []
+
+    for attempt in range(1, LLAMACLOUD_MAX_RETRIES + 1):
+        try:
+            async with httpx.AsyncClient(timeout=custom_timeout) as custom_client:
+                parser = LlamaParse(
+                    api_key=app_config.LLAMA_CLOUD_API_KEY,
+                    num_workers=1,
+                    verbose=True,
+                    language="en",
+                    result_type=ResultType.MD,
+                    max_timeout=int(max(2000, job_timeout + upload_timeout)),
+                    job_timeout_in_seconds=job_timeout,
+                    job_timeout_extra_time_per_page_in_seconds=PER_PAGE_JOB_TIMEOUT,
+                    custom_client=custom_client,
+                )
+                result = await parser.aparse(file_path)
+
+            if attempt > 1:
+                logging.info(
+                    f"LlamaCloud upload succeeded on attempt {attempt} after "
+                    f"{len(attempt_errors)} failures"
+                )
+
+            if hasattr(result, "get_markdown_documents"):
+                markdown_docs = result.get_markdown_documents(split_by_page=False)
+                if markdown_docs and hasattr(markdown_docs[0], "text"):
+                    return markdown_docs[0].text
+                if hasattr(result, "pages") and result.pages:
+                    return "\n\n".join(
+                        p.md for p in result.pages if hasattr(p, "md") and p.md
+                    )
+                return str(result)
+
+            if isinstance(result, list):
+                if result and hasattr(result[0], "text"):
+                    return result[0].text
+                return "\n\n".join(
+                    doc.page_content if hasattr(doc, "page_content") else str(doc)
+                    for doc in result
+                )
+
+            return str(result)
+
+        except LLAMACLOUD_RETRYABLE_EXCEPTIONS as e:
+            last_exception = e
+            error_type = type(e).__name__
+            error_msg = str(e)[:200]
+            attempt_errors.append(f"Attempt {attempt}: {error_type} - {error_msg}")
+
+            if attempt < LLAMACLOUD_MAX_RETRIES:
+                base_delay = min(
+                    LLAMACLOUD_BASE_DELAY * (2 ** (attempt - 1)),
+                    LLAMACLOUD_MAX_DELAY,
+                )
+                jitter = base_delay * 0.25 * (2 * random.random() - 1)
+                delay = base_delay + jitter
+
+                logging.warning(
+                    f"LlamaCloud upload failed "
+                    f"(attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}): "
+                    f"{error_type}. File: {file_size_mb:.1f}MB. "
+                    f"Retrying in {delay:.0f}s..."
+                )
+                await asyncio.sleep(delay)
+            else:
+                logging.error(
+                    f"LlamaCloud upload failed after {LLAMACLOUD_MAX_RETRIES} "
+                    f"attempts. File size: {file_size_mb:.1f}MB, "
+                    f"Pages: {estimated_pages}. "
+                    f"Errors: {'; '.join(attempt_errors)}"
+                )
+
+        except Exception:
+            raise
+
+    raise last_exception or RuntimeError(
+        f"LlamaCloud parsing failed after {LLAMACLOUD_MAX_RETRIES} retries. "
+        f"File size: {file_size_mb:.1f}MB"
+    )
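For intuition, the retry schedule implied by the constants, before the random ±25% jitter is applied (a quick worked example, not program output):

for attempt in range(1, LLAMACLOUD_MAX_RETRIES):
    base_delay = min(LLAMACLOUD_BASE_DELAY * (2 ** (attempt - 1)), LLAMACLOUD_MAX_DELAY)
    print(attempt, base_delay)  # 1 -> 10 s, 2 -> 20 s, 3 -> 40 s, 4 -> 80 s; attempt 5 re-raises instead of waiting
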
surfsense_backend/app/etl_pipeline/parsers/plaintext.py (new file, 8 lines)
@@ -0,0 +1,8 @@
+def read_plaintext(file_path: str) -> str:
+    with open(file_path, encoding="utf-8", errors="replace") as f:
+        content = f.read()
+    if "\x00" in content:
+        raise ValueError(
+            f"File contains null bytes — likely a binary file opened as text: {file_path}"
+        )
+    return content
surfsense_backend/app/etl_pipeline/parsers/unstructured.py (new file, 14 lines)
@@ -0,0 +1,14 @@
+async def parse_with_unstructured(file_path: str) -> str:
+    from langchain_unstructured import UnstructuredLoader
+
+    loader = UnstructuredLoader(
+        file_path,
+        mode="elements",
+        post_processors=[],
+        languages=["eng"],
+        include_orig_elements=False,
+        include_metadata=False,
+        strategy="auto",
+    )
+    docs = await loader.aload()
+    return "\n\n".join(doc.page_content for doc in docs if doc.page_content)
@@ -1,4 +1,4 @@
-from fastapi import APIRouter, Depends, HTTPException
+from fastapi import APIRouter, Depends
 from fastapi.responses import StreamingResponse
 from pydantic import BaseModel, Field
 from sqlalchemy.ext.asyncio import AsyncSession
@@ -31,8 +31,11 @@ async def vision_autocomplete_stream(
     return StreamingResponse(
         stream_vision_autocomplete(
-            body.screenshot, body.search_space_id, session,
-            app_name=body.app_name, window_title=body.window_title,
+            body.screenshot,
+            body.search_space_id,
+            session,
+            app_name=body.app_name,
+            window_title=body.window_title,
         ),
         media_type="text/event-stream",
         headers={
@@ -311,9 +311,11 @@ async def dropbox_callback(
        )

    existing_cursor = db_connector.config.get("cursor")
+    existing_folder_cursors = db_connector.config.get("folder_cursors")
    db_connector.config = {
        **connector_config,
        "cursor": existing_cursor,
+        "folder_cursors": existing_folder_cursors,
        "auth_expired": False,
    }
    flag_modified(db_connector, "config")
@@ -2477,6 +2477,8 @@ async def run_google_drive_indexing(
            stage="fetching",
        )

+        total_unsupported = 0
+
        # Index each folder with indexing options
        for folder in items.folders:
            try:
@@ -2484,6 +2486,7 @@ async def run_google_drive_indexing(
                    indexed_count,
                    skipped_count,
                    error_message,
+                    unsupported_count,
                ) = await index_google_drive_files(
                    session,
                    connector_id,
@@ -2497,6 +2500,7 @@ async def run_google_drive_indexing(
                    include_subfolders=indexing_options.include_subfolders,
                )
                total_skipped += skipped_count
+                total_unsupported += unsupported_count
                if error_message:
                    errors.append(f"Folder '{folder.name}': {error_message}")
                else:
@@ -2572,6 +2576,7 @@ async def run_google_drive_indexing(
            indexed_count=total_indexed,
            error_message=error_message,
            skipped_count=total_skipped,
+            unsupported_count=total_unsupported,
        )

    except Exception as e:
@@ -2642,7 +2647,12 @@ async def run_onedrive_indexing(
            stage="fetching",
        )

-        total_indexed, total_skipped, error_message = await index_onedrive_files(
+        (
+            total_indexed,
+            total_skipped,
+            error_message,
+            total_unsupported,
+        ) = await index_onedrive_files(
            session,
            connector_id,
            search_space_id,
@@ -2683,6 +2693,7 @@ async def run_onedrive_indexing(
            indexed_count=total_indexed,
            error_message=error_message,
            skipped_count=total_skipped,
+            unsupported_count=total_unsupported,
        )

    except Exception as e:
@@ -2750,7 +2761,12 @@ async def run_dropbox_indexing(
            stage="fetching",
        )

-        total_indexed, total_skipped, error_message = await index_dropbox_files(
+        (
+            total_indexed,
+            total_skipped,
+            error_message,
+            total_unsupported,
+        ) = await index_dropbox_files(
            session,
            connector_id,
            search_space_id,
@@ -2791,6 +2807,7 @@ async def run_dropbox_indexing(
            indexed_count=total_indexed,
            error_message=error_message,
            skipped_count=total_skipped,
+            unsupported_count=total_unsupported,
        )

    except Exception as e:
@@ -111,9 +111,8 @@ class DoclingService:
            pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
        )

-        # Initialize DocumentConverter
        self.converter = DocumentConverter(
-            format_options={InputFormat.PDF: pdf_format_option}
+            format_options={InputFormat.PDF: pdf_format_option},
        )

        acceleration_type = "GPU (WSL2)" if self.use_gpu else "CPU"
@@ -421,6 +421,7 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler):
        error_message: str | None = None,
        is_warning: bool = False,
        skipped_count: int | None = None,
+        unsupported_count: int | None = None,
    ) -> Notification:
        """
        Update notification when connector indexing completes.
@@ -428,10 +429,11 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler):
        Args:
            session: Database session
            notification: Notification to update
-            indexed_count: Total number of items indexed
+            indexed_count: Total number of files indexed
            error_message: Error message if indexing failed, or warning message (optional)
            is_warning: If True, treat error_message as a warning (success case) rather than an error
-            skipped_count: Number of items skipped (e.g., duplicates) - optional
+            skipped_count: Number of files skipped (e.g., unchanged) - optional
+            unsupported_count: Number of files skipped because the ETL parser doesn't support them

        Returns:
            Updated notification
@@ -440,52 +442,45 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler):
            "connector_name", "Connector"
        )

-        # Build the skipped text if there are skipped items
-        skipped_text = ""
-        if skipped_count and skipped_count > 0:
-            skipped_item_text = "item" if skipped_count == 1 else "items"
-            skipped_text = (
-                f" ({skipped_count} {skipped_item_text} skipped - already indexed)"
-            )
+        unsupported_text = ""
+        if unsupported_count and unsupported_count > 0:
+            file_word = "file was" if unsupported_count == 1 else "files were"
+            unsupported_text = f" {unsupported_count} {file_word} not supported."

-        # If there's an error message but items were indexed, treat it as a warning (partial success)
-        # If is_warning is True, treat it as success even with 0 items (e.g., duplicates found)
-        # Otherwise, treat it as a failure
        if error_message:
            if indexed_count > 0:
-                # Partial success with warnings (e.g., duplicate content from other connectors)
                title = f"Ready: {connector_name}"
-                item_text = "item" if indexed_count == 1 else "items"
-                message = f"Now searchable! {indexed_count} {item_text} synced{skipped_text}. Note: {error_message}"
+                file_text = "file" if indexed_count == 1 else "files"
+                message = f"Now searchable! {indexed_count} {file_text} synced.{unsupported_text} Note: {error_message}"
                status = "completed"
            elif is_warning:
-                # Warning case (e.g., duplicates found) - treat as success
                title = f"Ready: {connector_name}"
-                message = f"Sync completed{skipped_text}. {error_message}"
+                message = f"Sync complete.{unsupported_text} {error_message}"
                status = "completed"
            else:
-                # Complete failure
                title = f"Failed: {connector_name}"
                message = f"Sync failed: {error_message}"
+                if unsupported_text:
+                    message += unsupported_text
                status = "failed"
        else:
            title = f"Ready: {connector_name}"
            if indexed_count == 0:
-                if skipped_count and skipped_count > 0:
-                    skipped_item_text = "item" if skipped_count == 1 else "items"
-                    message = f"Already up to date! {skipped_count} {skipped_item_text} skipped (already indexed)."
+                if unsupported_count and unsupported_count > 0:
+                    message = f"Sync complete.{unsupported_text}"
                else:
-                    message = "Already up to date! No new items to sync."
+                    message = "Already up to date!"
            else:
-                item_text = "item" if indexed_count == 1 else "items"
-                message = (
-                    f"Now searchable! {indexed_count} {item_text} synced{skipped_text}."
-                )
+                file_text = "file" if indexed_count == 1 else "files"
+                message = f"Now searchable! {indexed_count} {file_text} synced."
+                if unsupported_text:
+                    message += unsupported_text
            status = "completed"

        metadata_updates = {
            "indexed_count": indexed_count,
            "skipped_count": skipped_count or 0,
+            "unsupported_count": unsupported_count or 0,
            "sync_stage": "completed"
            if (not error_message or is_warning or indexed_count > 0)
            else "failed",
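For a sense of the resulting copy, a few hypothetical outcomes of the message construction above (counts invented for illustration):

# indexed_count=12, unsupported_count=2, no error:
#   "Now searchable! 12 files synced. 2 files were not supported."
# indexed_count=0, unsupported_count=1, no error:
#   "Sync complete. 1 file was not supported."
# indexed_count=0, unsupported_count=0, no error:
#   "Already up to date!"
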
@@ -8,7 +8,7 @@ Optimized pipeline:
 """

 import logging
-from typing import AsyncGenerator
+from collections.abc import AsyncGenerator

 from langchain_core.messages import HumanMessage
 from sqlalchemy.ext.asyncio import AsyncSession
@ -51,7 +51,10 @@ async def _should_skip_file(
|
||||||
file_id = file.get("id", "")
|
file_id = file.get("id", "")
|
||||||
file_name = file.get("name", "Unknown")
|
file_name = file.get("name", "Unknown")
|
||||||
|
|
||||||
if skip_item(file):
|
skip, unsup_ext = skip_item(file)
|
||||||
|
if skip:
|
||||||
|
if unsup_ext:
|
||||||
|
return True, f"unsupported:{unsup_ext}"
|
||||||
return True, "folder/non-downloadable"
|
return True, "folder/non-downloadable"
|
||||||
if not file_id:
|
if not file_id:
|
||||||
return True, "missing file_id"
|
return True, "missing file_id"
|
||||||
|
|
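The call sites above assume skip_item now returns a (skip, unsupported_extension) pair rather than a bare boolean. A hedged sketch of that contract, with an assumed extension set; the real helper and its list live in the Dropbox connector package.

from pathlib import Path

SUPPORTED_EXTENSIONS = {".pdf", ".docx", ".txt", ".md"}  # assumed, not exhaustive

def skip_item_example(item: dict) -> tuple[bool, str | None]:
    """Return (skip, unsupported_ext): skip folders, flag unsupported extensions."""
    if item.get(".tag") == "folder" or not item.get("is_downloadable", True):
        return True, None
    ext = Path(item.get("name", "")).suffix.lower()
    if ext and ext not in SUPPORTED_EXTENSIONS:
        return True, ext  # caller counts this as "unsupported", not merely "skipped"
    return False, None

skip, unsup = skip_item_example({"name": "report.exe", ".tag": "file"})
assert skip and unsup == ".exe"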
@@ -251,6 +254,121 @@ async def _download_and_index(
     return batch_indexed, download_failed + batch_failed


+async def _remove_document(session: AsyncSession, file_id: str, search_space_id: int):
+    """Remove a document that was deleted in Dropbox."""
+    primary_hash = compute_identifier_hash(
+        DocumentType.DROPBOX_FILE.value, file_id, search_space_id
+    )
+    existing = await check_document_by_unique_identifier(session, primary_hash)
+
+    if not existing:
+        result = await session.execute(
+            select(Document).where(
+                Document.search_space_id == search_space_id,
+                Document.document_type == DocumentType.DROPBOX_FILE,
+                cast(Document.document_metadata["dropbox_file_id"], String) == file_id,
+            )
+        )
+        existing = result.scalar_one_or_none()
+
+    if existing:
+        await session.delete(existing)
+
+
+async def _index_with_delta_sync(
+    dropbox_client: DropboxClient,
+    session: AsyncSession,
+    connector_id: int,
+    search_space_id: int,
+    user_id: str,
+    cursor: str,
+    task_logger: TaskLoggingService,
+    log_entry: object,
+    max_files: int,
+    on_heartbeat_callback: HeartbeatCallbackType | None = None,
+    enable_summary: bool = True,
+) -> tuple[int, int, int, str]:
+    """Delta sync using Dropbox cursor-based change tracking.
+
+    Returns (indexed_count, skipped_count, unsupported_count, new_cursor).
+    """
+    await task_logger.log_task_progress(
+        log_entry,
+        f"Starting delta sync from cursor: {cursor[:20]}...",
+        {"stage": "delta_sync", "cursor_prefix": cursor[:20]},
+    )
+
+    entries, new_cursor, error = await dropbox_client.get_changes(cursor)
+    if error:
+        err_lower = error.lower()
+        if "401" in error or "authentication expired" in err_lower:
+            raise Exception(
+                f"Dropbox authentication failed. Please re-authenticate. (Error: {error})"
+            )
+        raise Exception(f"Failed to fetch Dropbox changes: {error}")
+
+    if not entries:
+        logger.info("No changes detected since last sync")
+        return 0, 0, 0, new_cursor or cursor
+
+    logger.info(f"Processing {len(entries)} change entries")
+
+    renamed_count = 0
+    skipped = 0
+    unsupported_count = 0
+    files_to_download: list[dict] = []
+    files_processed = 0
+
+    for entry in entries:
+        if files_processed >= max_files:
+            break
+        files_processed += 1
+
+        tag = entry.get(".tag")
+
+        if tag == "deleted":
+            path_lower = entry.get("path_lower", "")
+            name = entry.get("name", "")
+            file_id = entry.get("id", "")
+            if file_id:
+                await _remove_document(session, file_id, search_space_id)
+            logger.debug(f"Processed deletion: {name or path_lower}")
+            continue
+
+        if tag != "file":
+            continue
+
+        skip, msg = await _should_skip_file(session, entry, search_space_id)
+        if skip:
+            if msg and msg.startswith("unsupported:"):
+                unsupported_count += 1
+            elif msg and "renamed" in msg.lower():
+                renamed_count += 1
+            else:
+                skipped += 1
+            continue
+
+        files_to_download.append(entry)
+
+    batch_indexed, failed = await _download_and_index(
+        dropbox_client,
+        session,
+        files_to_download,
+        connector_id=connector_id,
+        search_space_id=search_space_id,
+        user_id=user_id,
+        enable_summary=enable_summary,
+        on_heartbeat=on_heartbeat_callback,
+    )
+
+    indexed = renamed_count + batch_indexed
+    logger.info(
+        f"Delta sync complete: {indexed} indexed, {skipped} skipped, "
+        f"{unsupported_count} unsupported, {failed} failed"
+    )
+    return indexed, skipped, unsupported_count, new_cursor or cursor
+
+
 async def _index_full_scan(
     dropbox_client: DropboxClient,
     session: AsyncSession,
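A minimal sketch of how the cursor plumbing fits together across runs, assuming only the DropboxClient.get_latest_cursor and get_changes calls referenced in this diff; saved_cursors stands in for the folder_cursors mapping persisted to connector.config further down.

async def sync_folder_changes(client, folder_path: str, saved_cursors: dict) -> list[dict]:
    """Return changed entries for folder_path and advance the stored cursor."""
    cursor = saved_cursors.get(folder_path)
    if cursor is None:
        # First run: record the current state so the next run can be incremental;
        # the initial full scan happens elsewhere.
        cursor, err = await client.get_latest_cursor(folder_path)
        if err:
            raise RuntimeError(err)
        saved_cursors[folder_path] = cursor
        return []

    # Later runs: fetch only what changed since the stored cursor.
    entries, new_cursor, err = await client.get_changes(cursor)
    if err:
        raise RuntimeError(err)
    saved_cursors[folder_path] = new_cursor or cursor
    return entries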
@@ -266,8 +384,11 @@ async def _index_full_scan(
     incremental_sync: bool = True,
     on_heartbeat_callback: HeartbeatCallbackType | None = None,
     enable_summary: bool = True,
-) -> tuple[int, int]:
-    """Full scan indexing of a folder."""
+) -> tuple[int, int, int]:
+    """Full scan indexing of a folder.
+
+    Returns (indexed, skipped, unsupported_count).
+    """
     await task_logger.log_task_progress(
         log_entry,
         f"Starting full scan of folder: {folder_name}",

@@ -287,6 +408,7 @@ async def _index_full_scan(

     renamed_count = 0
     skipped = 0
+    unsupported_count = 0
     files_to_download: list[dict] = []

     all_files, error = await get_files_in_folder(

@@ -306,12 +428,19 @@ async def _index_full_scan(
         if incremental_sync:
             skip, msg = await _should_skip_file(session, file, search_space_id)
             if skip:
-                if msg and "renamed" in msg.lower():
+                if msg and msg.startswith("unsupported:"):
+                    unsupported_count += 1
+                elif msg and "renamed" in msg.lower():
                     renamed_count += 1
                 else:
                     skipped += 1
                 continue
-        elif skip_item(file):
+        else:
+            item_skip, item_unsup = skip_item(file)
+            if item_skip:
+                if item_unsup:
+                    unsupported_count += 1
+                else:
                     skipped += 1
                 continue

@@ -352,9 +481,10 @@ async def _index_full_scan(
     indexed = renamed_count + batch_indexed
     logger.info(
-        f"Full scan complete: {indexed} indexed, {skipped} skipped, {failed} failed"
+        f"Full scan complete: {indexed} indexed, {skipped} skipped, "
+        f"{unsupported_count} unsupported, {failed} failed"
     )
-    return indexed, skipped
+    return indexed, skipped, unsupported_count


 async def _index_selected_files(
@ -368,7 +498,7 @@ async def _index_selected_files(
|
||||||
enable_summary: bool,
|
enable_summary: bool,
|
||||||
incremental_sync: bool = True,
|
incremental_sync: bool = True,
|
||||||
on_heartbeat: HeartbeatCallbackType | None = None,
|
on_heartbeat: HeartbeatCallbackType | None = None,
|
||||||
) -> tuple[int, int, list[str]]:
|
) -> tuple[int, int, int, list[str]]:
|
||||||
"""Index user-selected files using the parallel pipeline."""
|
"""Index user-selected files using the parallel pipeline."""
|
||||||
page_limit_service = PageLimitService(session)
|
page_limit_service = PageLimitService(session)
|
||||||
pages_used, pages_limit = await page_limit_service.get_page_usage(user_id)
|
pages_used, pages_limit = await page_limit_service.get_page_usage(user_id)
|
||||||
|
|
@ -379,6 +509,7 @@ async def _index_selected_files(
|
||||||
errors: list[str] = []
|
errors: list[str] = []
|
||||||
renamed_count = 0
|
renamed_count = 0
|
||||||
skipped = 0
|
skipped = 0
|
||||||
|
unsupported_count = 0
|
||||||
|
|
||||||
for file_path, file_name in file_paths:
|
for file_path, file_name in file_paths:
|
||||||
file, error = await get_file_by_path(dropbox_client, file_path)
|
file, error = await get_file_by_path(dropbox_client, file_path)
|
||||||
|
|
@ -390,12 +521,19 @@ async def _index_selected_files(
|
||||||
if incremental_sync:
|
if incremental_sync:
|
||||||
skip, msg = await _should_skip_file(session, file, search_space_id)
|
skip, msg = await _should_skip_file(session, file, search_space_id)
|
||||||
if skip:
|
if skip:
|
||||||
if msg and "renamed" in msg.lower():
|
if msg and msg.startswith("unsupported:"):
|
||||||
|
unsupported_count += 1
|
||||||
|
elif msg and "renamed" in msg.lower():
|
||||||
renamed_count += 1
|
renamed_count += 1
|
||||||
else:
|
else:
|
||||||
skipped += 1
|
skipped += 1
|
||||||
continue
|
continue
|
||||||
elif skip_item(file):
|
else:
|
||||||
|
item_skip, item_unsup = skip_item(file)
|
||||||
|
if item_skip:
|
||||||
|
if item_unsup:
|
||||||
|
unsupported_count += 1
|
||||||
|
else:
|
||||||
skipped += 1
|
skipped += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|
@ -429,7 +567,7 @@ async def _index_selected_files(
|
||||||
user_id, pages_to_deduct, allow_exceed=True
|
user_id, pages_to_deduct, allow_exceed=True
|
||||||
)
|
)
|
||||||
|
|
||||||
return renamed_count + batch_indexed, skipped, errors
|
return renamed_count + batch_indexed, skipped, unsupported_count, errors
|
||||||
|
|
||||||
|
|
||||||
async def index_dropbox_files(
|
async def index_dropbox_files(
|
||||||
|
|
@ -438,7 +576,7 @@ async def index_dropbox_files(
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
items_dict: dict,
|
items_dict: dict,
|
||||||
) -> tuple[int, int, str | None]:
|
) -> tuple[int, int, str | None, int]:
|
||||||
"""Index Dropbox files for a specific connector.
|
"""Index Dropbox files for a specific connector.
|
||||||
|
|
||||||
items_dict format:
|
items_dict format:
|
||||||
|
|
@ -469,7 +607,7 @@ async def index_dropbox_files(
|
||||||
await task_logger.log_task_failure(
|
await task_logger.log_task_failure(
|
||||||
log_entry, error_msg, None, {"error_type": "ConnectorNotFound"}
|
log_entry, error_msg, None, {"error_type": "ConnectorNotFound"}
|
||||||
)
|
)
|
||||||
return 0, 0, error_msg
|
return 0, 0, error_msg, 0
|
||||||
|
|
||||||
token_encrypted = connector.config.get("_token_encrypted", False)
|
token_encrypted = connector.config.get("_token_encrypted", False)
|
||||||
if token_encrypted and not config.SECRET_KEY:
|
if token_encrypted and not config.SECRET_KEY:
|
||||||
|
|
@ -480,7 +618,7 @@ async def index_dropbox_files(
|
||||||
"Missing SECRET_KEY",
|
"Missing SECRET_KEY",
|
||||||
{"error_type": "MissingSecretKey"},
|
{"error_type": "MissingSecretKey"},
|
||||||
)
|
)
|
||||||
return 0, 0, error_msg
|
return 0, 0, error_msg, 0
|
||||||
|
|
||||||
connector_enable_summary = getattr(connector, "enable_summary", True)
|
connector_enable_summary = getattr(connector, "enable_summary", True)
|
||||||
dropbox_client = DropboxClient(session, connector_id)
|
dropbox_client = DropboxClient(session, connector_id)
|
||||||
|
|
@ -489,9 +627,13 @@ async def index_dropbox_files(
|
||||||
max_files = indexing_options.get("max_files", 500)
|
max_files = indexing_options.get("max_files", 500)
|
||||||
incremental_sync = indexing_options.get("incremental_sync", True)
|
incremental_sync = indexing_options.get("incremental_sync", True)
|
||||||
include_subfolders = indexing_options.get("include_subfolders", True)
|
include_subfolders = indexing_options.get("include_subfolders", True)
|
||||||
|
use_delta_sync = indexing_options.get("use_delta_sync", True)
|
||||||
|
|
||||||
|
folder_cursors: dict = connector.config.get("folder_cursors", {})
|
||||||
|
|
||||||
total_indexed = 0
|
total_indexed = 0
|
||||||
total_skipped = 0
|
total_skipped = 0
|
||||||
|
total_unsupported = 0
|
||||||
|
|
||||||
selected_files = items_dict.get("files", [])
|
selected_files = items_dict.get("files", [])
|
||||||
if selected_files:
|
if selected_files:
|
||||||
|
|
@ -499,7 +641,7 @@ async def index_dropbox_files(
|
||||||
(f.get("path", f.get("path_lower", f.get("id", ""))), f.get("name"))
|
(f.get("path", f.get("path_lower", f.get("id", ""))), f.get("name"))
|
||||||
for f in selected_files
|
for f in selected_files
|
||||||
]
|
]
|
||||||
indexed, skipped, file_errors = await _index_selected_files(
|
indexed, skipped, unsupported, file_errors = await _index_selected_files(
|
||||||
dropbox_client,
|
dropbox_client,
|
||||||
session,
|
session,
|
||||||
file_tuples,
|
file_tuples,
|
||||||
|
|
@ -511,6 +653,7 @@ async def index_dropbox_files(
|
||||||
)
|
)
|
||||||
total_indexed += indexed
|
total_indexed += indexed
|
||||||
total_skipped += skipped
|
total_skipped += skipped
|
||||||
|
total_unsupported += unsupported
|
||||||
if file_errors:
|
if file_errors:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"File indexing errors for connector {connector_id}: {file_errors}"
|
f"File indexing errors for connector {connector_id}: {file_errors}"
|
||||||
|
|
@ -523,8 +666,30 @@ async def index_dropbox_files(
|
||||||
)
|
)
|
||||||
folder_name = folder.get("name", "Root")
|
folder_name = folder.get("name", "Root")
|
||||||
|
|
||||||
|
saved_cursor = folder_cursors.get(folder_path)
|
||||||
|
can_use_delta = (
|
||||||
|
use_delta_sync and saved_cursor and connector.last_indexed_at
|
||||||
|
)
|
||||||
|
|
||||||
|
if can_use_delta:
|
||||||
|
logger.info(f"Using delta sync for folder {folder_name}")
|
||||||
|
indexed, skipped, unsup, new_cursor = await _index_with_delta_sync(
|
||||||
|
dropbox_client,
|
||||||
|
session,
|
||||||
|
connector_id,
|
||||||
|
search_space_id,
|
||||||
|
user_id,
|
||||||
|
saved_cursor,
|
||||||
|
task_logger,
|
||||||
|
log_entry,
|
||||||
|
max_files,
|
||||||
|
enable_summary=connector_enable_summary,
|
||||||
|
)
|
||||||
|
folder_cursors[folder_path] = new_cursor
|
||||||
|
total_unsupported += unsup
|
||||||
|
else:
|
||||||
logger.info(f"Using full scan for folder {folder_name}")
|
logger.info(f"Using full scan for folder {folder_name}")
|
||||||
indexed, skipped = await _index_full_scan(
|
indexed, skipped, unsup = await _index_full_scan(
|
||||||
dropbox_client,
|
dropbox_client,
|
||||||
session,
|
session,
|
||||||
connector_id,
|
connector_id,
|
||||||
|
|
@@ -539,9 +704,28 @@ async def index_dropbox_files(
                     incremental_sync=incremental_sync,
                     enable_summary=connector_enable_summary,
                 )
+                total_unsupported += unsup

             total_indexed += indexed
             total_skipped += skipped

+            # Persist latest cursor for this folder
+            try:
+                latest_cursor, cursor_err = await dropbox_client.get_latest_cursor(
+                    folder_path
+                )
+                if latest_cursor and not cursor_err:
+                    folder_cursors[folder_path] = latest_cursor
+            except Exception as e:
+                logger.warning(f"Failed to get latest cursor for {folder_path}: {e}")
+
+        # Persist folder cursors to connector config
+        if folders:
+            cfg = dict(connector.config)
+            cfg["folder_cursors"] = folder_cursors
+            connector.config = cfg
+            flag_modified(connector, "config")
+
         if total_indexed > 0 or folders:
             await update_connector_last_indexed(session, connector, True)
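The dict copy plus flag_modified above is deliberate: SQLAlchemy does not detect in-place mutation of a JSON column, so the config is reassigned and the attribute flagged to guarantee the cursors are written on commit. A small sketch, assuming a mapped connector object with a JSON config column.

from sqlalchemy.orm.attributes import flag_modified

def store_folder_cursor(connector, folder_path: str, cursor: str) -> None:
    cfg = dict(connector.config or {})      # copy instead of mutating in place
    cursors = dict(cfg.get("folder_cursors", {}))
    cursors[folder_path] = cursor
    cfg["folder_cursors"] = cursors
    connector.config = cfg                  # reassign so the ORM sees a new value
    flag_modified(connector, "config")      # belt and braces for JSON columns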
@ -550,12 +734,18 @@ async def index_dropbox_files(
|
||||||
await task_logger.log_task_success(
|
await task_logger.log_task_success(
|
||||||
log_entry,
|
log_entry,
|
||||||
f"Successfully completed Dropbox indexing for connector {connector_id}",
|
f"Successfully completed Dropbox indexing for connector {connector_id}",
|
||||||
{"files_processed": total_indexed, "files_skipped": total_skipped},
|
{
|
||||||
|
"files_processed": total_indexed,
|
||||||
|
"files_skipped": total_skipped,
|
||||||
|
"files_unsupported": total_unsupported,
|
||||||
|
},
|
||||||
)
|
)
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Dropbox indexing completed: {total_indexed} indexed, {total_skipped} skipped"
|
f"Dropbox indexing completed: {total_indexed} indexed, "
|
||||||
|
f"{total_skipped} skipped, {total_unsupported} unsupported"
|
||||||
)
|
)
|
||||||
return total_indexed, total_skipped, None
|
|
||||||
|
return total_indexed, total_skipped, None, total_unsupported
|
||||||
|
|
||||||
except SQLAlchemyError as db_error:
|
except SQLAlchemyError as db_error:
|
||||||
await session.rollback()
|
await session.rollback()
|
||||||
|
|
@ -566,7 +756,7 @@ async def index_dropbox_files(
|
||||||
{"error_type": "SQLAlchemyError"},
|
{"error_type": "SQLAlchemyError"},
|
||||||
)
|
)
|
||||||
logger.error(f"Database error: {db_error!s}", exc_info=True)
|
logger.error(f"Database error: {db_error!s}", exc_info=True)
|
||||||
return 0, 0, f"Database error: {db_error!s}"
|
return 0, 0, f"Database error: {db_error!s}", 0
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
await session.rollback()
|
await session.rollback()
|
||||||
await task_logger.log_task_failure(
|
await task_logger.log_task_failure(
|
||||||
|
|
@ -576,4 +766,4 @@ async def index_dropbox_files(
|
||||||
{"error_type": type(e).__name__},
|
{"error_type": type(e).__name__},
|
||||||
)
|
)
|
||||||
logger.error(f"Failed to index Dropbox files: {e!s}", exc_info=True)
|
logger.error(f"Failed to index Dropbox files: {e!s}", exc_info=True)
|
||||||
return 0, 0, f"Failed to index Dropbox files: {e!s}"
|
return 0, 0, f"Failed to index Dropbox files: {e!s}", 0
|
||||||
|
|
|
||||||
|
|
@@ -25,7 +25,11 @@ from app.connectors.google_drive import (
     get_files_in_folder,
     get_start_page_token,
 )
-from app.connectors.google_drive.file_types import should_skip_file as skip_mime
+from app.connectors.google_drive.file_types import (
+    is_google_workspace_file,
+    should_skip_by_extension,
+    should_skip_file as skip_mime,
+)
 from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
 from app.indexing_pipeline.connector_document import ConnectorDocument
 from app.indexing_pipeline.document_hashing import compute_identifier_hash

@@ -78,6 +82,10 @@ async def _should_skip_file(

     if skip_mime(mime_type):
         return True, "folder/shortcut"
+    if not is_google_workspace_file(mime_type):
+        ext_skip, unsup_ext = should_skip_by_extension(file_name)
+        if ext_skip:
+            return True, f"unsupported:{unsup_ext}"
     if not file_id:
         return True, "missing file_id"
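For non-Workspace files the skip decision above falls back to the file extension, since Google Workspace files are exported rather than downloaded and carry no meaningful extension. One plausible shape of that helper, with an assumed extension set; the real should_skip_by_extension lives in app.connectors.google_drive.file_types and may differ.

from pathlib import Path

INDEXABLE_EXTENSIONS = {".pdf", ".docx", ".pptx", ".xlsx", ".txt", ".md", ".csv"}  # assumed

def should_skip_by_extension_sketch(file_name: str) -> tuple[bool, str | None]:
    """Return (skip, extension) so callers can count unsupported files separately."""
    ext = Path(file_name).suffix.lower()
    if not ext or ext in INDEXABLE_EXTENSIONS:
        return False, None
    return True, ext

assert should_skip_by_extension_sketch("photo.heic") == (True, ".heic")
assert should_skip_by_extension_sketch("notes.md") == (False, None)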
@ -468,13 +476,13 @@ async def _index_selected_files(
|
||||||
user_id: str,
|
user_id: str,
|
||||||
enable_summary: bool,
|
enable_summary: bool,
|
||||||
on_heartbeat: HeartbeatCallbackType | None = None,
|
on_heartbeat: HeartbeatCallbackType | None = None,
|
||||||
) -> tuple[int, int, list[str]]:
|
) -> tuple[int, int, int, list[str]]:
|
||||||
"""Index user-selected files using the parallel pipeline.
|
"""Index user-selected files using the parallel pipeline.
|
||||||
|
|
||||||
Phase 1 (serial): fetch metadata + skip checks.
|
Phase 1 (serial): fetch metadata + skip checks.
|
||||||
Phase 2+3 (parallel): download, ETL, index via _download_and_index.
|
Phase 2+3 (parallel): download, ETL, index via _download_and_index.
|
||||||
|
|
||||||
Returns (indexed_count, skipped_count, errors).
|
Returns (indexed_count, skipped_count, unsupported_count, errors).
|
||||||
"""
|
"""
|
||||||
page_limit_service = PageLimitService(session)
|
page_limit_service = PageLimitService(session)
|
||||||
pages_used, pages_limit = await page_limit_service.get_page_usage(user_id)
|
pages_used, pages_limit = await page_limit_service.get_page_usage(user_id)
|
||||||
|
|
@ -485,6 +493,7 @@ async def _index_selected_files(
|
||||||
errors: list[str] = []
|
errors: list[str] = []
|
||||||
renamed_count = 0
|
renamed_count = 0
|
||||||
skipped = 0
|
skipped = 0
|
||||||
|
unsupported_count = 0
|
||||||
|
|
||||||
for file_id, file_name in file_ids:
|
for file_id, file_name in file_ids:
|
||||||
file, error = await get_file_by_id(drive_client, file_id)
|
file, error = await get_file_by_id(drive_client, file_id)
|
||||||
|
|
@ -495,7 +504,9 @@ async def _index_selected_files(
|
||||||
|
|
||||||
skip, msg = await _should_skip_file(session, file, search_space_id)
|
skip, msg = await _should_skip_file(session, file, search_space_id)
|
||||||
if skip:
|
if skip:
|
||||||
if msg and "renamed" in msg.lower():
|
if msg and msg.startswith("unsupported:"):
|
||||||
|
unsupported_count += 1
|
||||||
|
elif msg and "renamed" in msg.lower():
|
||||||
renamed_count += 1
|
renamed_count += 1
|
||||||
else:
|
else:
|
||||||
skipped += 1
|
skipped += 1
|
||||||
|
|
@ -539,7 +550,7 @@ async def _index_selected_files(
|
||||||
user_id, pages_to_deduct, allow_exceed=True
|
user_id, pages_to_deduct, allow_exceed=True
|
||||||
)
|
)
|
||||||
|
|
||||||
return renamed_count + batch_indexed, skipped, errors
|
return renamed_count + batch_indexed, skipped, unsupported_count, errors
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
@ -562,8 +573,11 @@ async def _index_full_scan(
|
||||||
include_subfolders: bool = False,
|
include_subfolders: bool = False,
|
||||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||||
enable_summary: bool = True,
|
enable_summary: bool = True,
|
||||||
) -> tuple[int, int]:
|
) -> tuple[int, int, int]:
|
||||||
"""Full scan indexing of a folder."""
|
"""Full scan indexing of a folder.
|
||||||
|
|
||||||
|
Returns (indexed, skipped, unsupported_count).
|
||||||
|
"""
|
||||||
await task_logger.log_task_progress(
|
await task_logger.log_task_progress(
|
||||||
log_entry,
|
log_entry,
|
||||||
f"Starting full scan of folder: {folder_name} (include_subfolders={include_subfolders})",
|
f"Starting full scan of folder: {folder_name} (include_subfolders={include_subfolders})",
|
||||||
|
|
@ -585,6 +599,7 @@ async def _index_full_scan(
|
||||||
|
|
||||||
renamed_count = 0
|
renamed_count = 0
|
||||||
skipped = 0
|
skipped = 0
|
||||||
|
unsupported_count = 0
|
||||||
files_processed = 0
|
files_processed = 0
|
||||||
files_to_download: list[dict] = []
|
files_to_download: list[dict] = []
|
||||||
folders_to_process = [(folder_id, folder_name)]
|
folders_to_process = [(folder_id, folder_name)]
|
||||||
|
|
@ -625,7 +640,9 @@ async def _index_full_scan(
|
||||||
|
|
||||||
skip, msg = await _should_skip_file(session, file, search_space_id)
|
skip, msg = await _should_skip_file(session, file, search_space_id)
|
||||||
if skip:
|
if skip:
|
||||||
if msg and "renamed" in msg.lower():
|
if msg and msg.startswith("unsupported:"):
|
||||||
|
unsupported_count += 1
|
||||||
|
elif msg and "renamed" in msg.lower():
|
||||||
renamed_count += 1
|
renamed_count += 1
|
||||||
else:
|
else:
|
||||||
skipped += 1
|
skipped += 1
|
||||||
|
|
@ -698,9 +715,10 @@ async def _index_full_scan(
|
||||||
|
|
||||||
indexed = renamed_count + batch_indexed
|
indexed = renamed_count + batch_indexed
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Full scan complete: {indexed} indexed, {skipped} skipped, {failed} failed"
|
f"Full scan complete: {indexed} indexed, {skipped} skipped, "
|
||||||
|
f"{unsupported_count} unsupported, {failed} failed"
|
||||||
)
|
)
|
||||||
return indexed, skipped
|
return indexed, skipped, unsupported_count
|
||||||
|
|
||||||
|
|
||||||
async def _index_with_delta_sync(
|
async def _index_with_delta_sync(
|
||||||
|
|
@ -718,8 +736,11 @@ async def _index_with_delta_sync(
|
||||||
include_subfolders: bool = False,
|
include_subfolders: bool = False,
|
||||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||||
enable_summary: bool = True,
|
enable_summary: bool = True,
|
||||||
) -> tuple[int, int]:
|
) -> tuple[int, int, int]:
|
||||||
"""Delta sync using change tracking."""
|
"""Delta sync using change tracking.
|
||||||
|
|
||||||
|
Returns (indexed, skipped, unsupported_count).
|
||||||
|
"""
|
||||||
await task_logger.log_task_progress(
|
await task_logger.log_task_progress(
|
||||||
log_entry,
|
log_entry,
|
||||||
f"Starting delta sync from token: {start_page_token[:20]}...",
|
f"Starting delta sync from token: {start_page_token[:20]}...",
|
||||||
|
|
@ -739,7 +760,7 @@ async def _index_with_delta_sync(
|
||||||
|
|
||||||
if not changes:
|
if not changes:
|
||||||
logger.info("No changes detected since last sync")
|
logger.info("No changes detected since last sync")
|
||||||
return 0, 0
|
return 0, 0, 0
|
||||||
|
|
||||||
logger.info(f"Processing {len(changes)} changes")
|
logger.info(f"Processing {len(changes)} changes")
|
||||||
|
|
||||||
|
|
@ -754,6 +775,7 @@ async def _index_with_delta_sync(
|
||||||
|
|
||||||
renamed_count = 0
|
renamed_count = 0
|
||||||
skipped = 0
|
skipped = 0
|
||||||
|
unsupported_count = 0
|
||||||
files_to_download: list[dict] = []
|
files_to_download: list[dict] = []
|
||||||
files_processed = 0
|
files_processed = 0
|
||||||
|
|
||||||
|
|
@ -775,7 +797,9 @@ async def _index_with_delta_sync(
|
||||||
|
|
||||||
skip, msg = await _should_skip_file(session, file, search_space_id)
|
skip, msg = await _should_skip_file(session, file, search_space_id)
|
||||||
if skip:
|
if skip:
|
||||||
if msg and "renamed" in msg.lower():
|
if msg and msg.startswith("unsupported:"):
|
||||||
|
unsupported_count += 1
|
||||||
|
elif msg and "renamed" in msg.lower():
|
||||||
renamed_count += 1
|
renamed_count += 1
|
||||||
else:
|
else:
|
||||||
skipped += 1
|
skipped += 1
|
||||||
|
|
@ -832,9 +856,10 @@ async def _index_with_delta_sync(
|
||||||
|
|
||||||
indexed = renamed_count + batch_indexed
|
indexed = renamed_count + batch_indexed
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Delta sync complete: {indexed} indexed, {skipped} skipped, {failed} failed"
|
f"Delta sync complete: {indexed} indexed, {skipped} skipped, "
|
||||||
|
f"{unsupported_count} unsupported, {failed} failed"
|
||||||
)
|
)
|
||||||
return indexed, skipped
|
return indexed, skipped, unsupported_count
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
@ -854,8 +879,11 @@ async def index_google_drive_files(
|
||||||
max_files: int = 500,
|
max_files: int = 500,
|
||||||
include_subfolders: bool = False,
|
include_subfolders: bool = False,
|
||||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||||
) -> tuple[int, int, str | None]:
|
) -> tuple[int, int, str | None, int]:
|
||||||
"""Index Google Drive files for a specific connector."""
|
"""Index Google Drive files for a specific connector.
|
||||||
|
|
||||||
|
Returns (indexed, skipped, error_or_none, unsupported_count).
|
||||||
|
"""
|
||||||
task_logger = TaskLoggingService(session, search_space_id)
|
task_logger = TaskLoggingService(session, search_space_id)
|
||||||
log_entry = await task_logger.log_task_start(
|
log_entry = await task_logger.log_task_start(
|
||||||
task_name="google_drive_files_indexing",
|
task_name="google_drive_files_indexing",
|
||||||
|
|
@ -881,7 +909,7 @@ async def index_google_drive_files(
|
||||||
await task_logger.log_task_failure(
|
await task_logger.log_task_failure(
|
||||||
log_entry, error_msg, None, {"error_type": "ConnectorNotFound"}
|
log_entry, error_msg, None, {"error_type": "ConnectorNotFound"}
|
||||||
)
|
)
|
||||||
return 0, 0, error_msg
|
return 0, 0, error_msg, 0
|
||||||
|
|
||||||
await task_logger.log_task_progress(
|
await task_logger.log_task_progress(
|
||||||
log_entry,
|
log_entry,
|
||||||
|
|
@ -900,7 +928,7 @@ async def index_google_drive_files(
|
||||||
"Missing Composio account",
|
"Missing Composio account",
|
||||||
{"error_type": "MissingComposioAccount"},
|
{"error_type": "MissingComposioAccount"},
|
||||||
)
|
)
|
||||||
return 0, 0, error_msg
|
return 0, 0, error_msg, 0
|
||||||
pre_built_credentials = build_composio_credentials(connected_account_id)
|
pre_built_credentials = build_composio_credentials(connected_account_id)
|
||||||
else:
|
else:
|
||||||
token_encrypted = connector.config.get("_token_encrypted", False)
|
token_encrypted = connector.config.get("_token_encrypted", False)
|
||||||
|
|
@ -915,6 +943,7 @@ async def index_google_drive_files(
|
||||||
0,
|
0,
|
||||||
0,
|
0,
|
||||||
"SECRET_KEY not configured but credentials are marked as encrypted",
|
"SECRET_KEY not configured but credentials are marked as encrypted",
|
||||||
|
0,
|
||||||
)
|
)
|
||||||
|
|
||||||
connector_enable_summary = getattr(connector, "enable_summary", True)
|
connector_enable_summary = getattr(connector, "enable_summary", True)
|
||||||
|
|
@ -927,7 +956,7 @@ async def index_google_drive_files(
|
||||||
await task_logger.log_task_failure(
|
await task_logger.log_task_failure(
|
||||||
log_entry, error_msg, {"error_type": "MissingParameter"}
|
log_entry, error_msg, {"error_type": "MissingParameter"}
|
||||||
)
|
)
|
||||||
return 0, 0, error_msg
|
return 0, 0, error_msg, 0
|
||||||
|
|
||||||
target_folder_id = folder_id
|
target_folder_id = folder_id
|
||||||
target_folder_name = folder_name or "Selected Folder"
|
target_folder_name = folder_name or "Selected Folder"
|
||||||
|
|
@ -938,9 +967,11 @@ async def index_google_drive_files(
|
||||||
use_delta_sync and start_page_token and connector.last_indexed_at
|
use_delta_sync and start_page_token and connector.last_indexed_at
|
||||||
)
|
)
|
||||||
|
|
||||||
|
documents_unsupported = 0
|
||||||
|
|
||||||
if can_use_delta:
|
if can_use_delta:
|
||||||
logger.info(f"Using delta sync for connector {connector_id}")
|
logger.info(f"Using delta sync for connector {connector_id}")
|
||||||
documents_indexed, documents_skipped = await _index_with_delta_sync(
|
documents_indexed, documents_skipped, du = await _index_with_delta_sync(
|
||||||
drive_client,
|
drive_client,
|
||||||
session,
|
session,
|
||||||
connector,
|
connector,
|
||||||
|
|
@ -956,8 +987,9 @@ async def index_google_drive_files(
|
||||||
on_heartbeat_callback,
|
on_heartbeat_callback,
|
||||||
connector_enable_summary,
|
connector_enable_summary,
|
||||||
)
|
)
|
||||||
|
documents_unsupported += du
|
||||||
logger.info("Running reconciliation scan after delta sync")
|
logger.info("Running reconciliation scan after delta sync")
|
||||||
ri, rs = await _index_full_scan(
|
ri, rs, ru = await _index_full_scan(
|
||||||
drive_client,
|
drive_client,
|
||||||
session,
|
session,
|
||||||
connector,
|
connector,
|
||||||
|
|
@ -975,9 +1007,14 @@ async def index_google_drive_files(
|
||||||
)
|
)
|
||||||
documents_indexed += ri
|
documents_indexed += ri
|
||||||
documents_skipped += rs
|
documents_skipped += rs
|
||||||
|
documents_unsupported += ru
|
||||||
else:
|
else:
|
||||||
logger.info(f"Using full scan for connector {connector_id}")
|
logger.info(f"Using full scan for connector {connector_id}")
|
||||||
documents_indexed, documents_skipped = await _index_full_scan(
|
(
|
||||||
|
documents_indexed,
|
||||||
|
documents_skipped,
|
||||||
|
documents_unsupported,
|
||||||
|
) = await _index_full_scan(
|
||||||
drive_client,
|
drive_client,
|
||||||
session,
|
session,
|
||||||
connector,
|
connector,
|
||||||
|
|
@ -1012,14 +1049,17 @@ async def index_google_drive_files(
|
||||||
{
|
{
|
||||||
"files_processed": documents_indexed,
|
"files_processed": documents_indexed,
|
||||||
"files_skipped": documents_skipped,
|
"files_skipped": documents_skipped,
|
||||||
|
"files_unsupported": documents_unsupported,
|
||||||
"sync_type": "delta" if can_use_delta else "full",
|
"sync_type": "delta" if can_use_delta else "full",
|
||||||
"folder": target_folder_name,
|
"folder": target_folder_name,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Google Drive indexing completed: {documents_indexed} indexed, {documents_skipped} skipped"
|
f"Google Drive indexing completed: {documents_indexed} indexed, "
|
||||||
|
f"{documents_skipped} skipped, {documents_unsupported} unsupported"
|
||||||
)
|
)
|
||||||
return documents_indexed, documents_skipped, None
|
|
||||||
|
return documents_indexed, documents_skipped, None, documents_unsupported
|
||||||
|
|
||||||
except SQLAlchemyError as db_error:
|
except SQLAlchemyError as db_error:
|
||||||
await session.rollback()
|
await session.rollback()
|
||||||
|
|
@ -1030,7 +1070,7 @@ async def index_google_drive_files(
|
||||||
{"error_type": "SQLAlchemyError"},
|
{"error_type": "SQLAlchemyError"},
|
||||||
)
|
)
|
||||||
logger.error(f"Database error: {db_error!s}", exc_info=True)
|
logger.error(f"Database error: {db_error!s}", exc_info=True)
|
||||||
return 0, 0, f"Database error: {db_error!s}"
|
return 0, 0, f"Database error: {db_error!s}", 0
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
await session.rollback()
|
await session.rollback()
|
||||||
await task_logger.log_task_failure(
|
await task_logger.log_task_failure(
|
||||||
|
|
@ -1040,7 +1080,7 @@ async def index_google_drive_files(
|
||||||
{"error_type": type(e).__name__},
|
{"error_type": type(e).__name__},
|
||||||
)
|
)
|
||||||
logger.error(f"Failed to index Google Drive files: {e!s}", exc_info=True)
|
logger.error(f"Failed to index Google Drive files: {e!s}", exc_info=True)
|
||||||
return 0, 0, f"Failed to index Google Drive files: {e!s}"
|
return 0, 0, f"Failed to index Google Drive files: {e!s}", 0
|
||||||
|
|
||||||
|
|
||||||
async def index_google_drive_single_file(
|
async def index_google_drive_single_file(
|
||||||
|
|
@ -1242,7 +1282,7 @@ async def index_google_drive_selected_files(
|
||||||
session, connector_id, credentials=pre_built_credentials
|
session, connector_id, credentials=pre_built_credentials
|
||||||
)
|
)
|
||||||
|
|
||||||
indexed, skipped, errors = await _index_selected_files(
|
indexed, skipped, unsupported, errors = await _index_selected_files(
|
||||||
drive_client,
|
drive_client,
|
||||||
session,
|
session,
|
||||||
files,
|
files,
|
||||||
|
|
@ -1253,6 +1293,11 @@ async def index_google_drive_selected_files(
|
||||||
on_heartbeat=on_heartbeat_callback,
|
on_heartbeat=on_heartbeat_callback,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if unsupported > 0:
|
||||||
|
file_text = "file was" if unsupported == 1 else "files were"
|
||||||
|
unsup_msg = f"{unsupported} {file_text} not supported"
|
||||||
|
errors.append(unsup_msg)
|
||||||
|
|
||||||
await session.commit()
|
await session.commit()
|
||||||
|
|
||||||
if errors:
|
if errors:
|
||||||
|
|
@ -1260,7 +1305,12 @@ async def index_google_drive_selected_files(
|
||||||
log_entry,
|
log_entry,
|
||||||
f"Batch file indexing completed with {len(errors)} error(s)",
|
f"Batch file indexing completed with {len(errors)} error(s)",
|
||||||
"; ".join(errors),
|
"; ".join(errors),
|
||||||
{"indexed": indexed, "skipped": skipped, "error_count": len(errors)},
|
{
|
||||||
|
"indexed": indexed,
|
||||||
|
"skipped": skipped,
|
||||||
|
"unsupported": unsupported,
|
||||||
|
"error_count": len(errors),
|
||||||
|
},
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
await task_logger.log_task_success(
|
await task_logger.log_task_success(
|
||||||
|
|
|
||||||
|
|
@@ -23,7 +23,6 @@ from sqlalchemy import select
 from sqlalchemy.exc import IntegrityError, SQLAlchemyError
 from sqlalchemy.ext.asyncio import AsyncSession

-from app.config import config
 from app.db import (
     Document,
     DocumentStatus,
@ -44,132 +43,6 @@ from .base import (
|
||||||
logger,
|
logger,
|
||||||
)
|
)
|
||||||
|
|
||||||
PLAINTEXT_EXTENSIONS = frozenset(
|
|
||||||
{
|
|
||||||
".md",
|
|
||||||
".markdown",
|
|
||||||
".txt",
|
|
||||||
".text",
|
|
||||||
".json",
|
|
||||||
".jsonl",
|
|
||||||
".yaml",
|
|
||||||
".yml",
|
|
||||||
".toml",
|
|
||||||
".ini",
|
|
||||||
".cfg",
|
|
||||||
".conf",
|
|
||||||
".xml",
|
|
||||||
".css",
|
|
||||||
".scss",
|
|
||||||
".less",
|
|
||||||
".sass",
|
|
||||||
".py",
|
|
||||||
".pyw",
|
|
||||||
".pyi",
|
|
||||||
".pyx",
|
|
||||||
".js",
|
|
||||||
".jsx",
|
|
||||||
".ts",
|
|
||||||
".tsx",
|
|
||||||
".mjs",
|
|
||||||
".cjs",
|
|
||||||
".java",
|
|
||||||
".kt",
|
|
||||||
".kts",
|
|
||||||
".scala",
|
|
||||||
".groovy",
|
|
||||||
".c",
|
|
||||||
".h",
|
|
||||||
".cpp",
|
|
||||||
".cxx",
|
|
||||||
".cc",
|
|
||||||
".hpp",
|
|
||||||
".hxx",
|
|
||||||
".cs",
|
|
||||||
".fs",
|
|
||||||
".fsx",
|
|
||||||
".go",
|
|
||||||
".rs",
|
|
||||||
".rb",
|
|
||||||
".php",
|
|
||||||
".pl",
|
|
||||||
".pm",
|
|
||||||
".lua",
|
|
||||||
".swift",
|
|
||||||
".m",
|
|
||||||
".mm",
|
|
||||||
".r",
|
|
||||||
".R",
|
|
||||||
".jl",
|
|
||||||
".sh",
|
|
||||||
".bash",
|
|
||||||
".zsh",
|
|
||||||
".fish",
|
|
||||||
".bat",
|
|
||||||
".cmd",
|
|
||||||
".ps1",
|
|
||||||
".sql",
|
|
||||||
".graphql",
|
|
||||||
".gql",
|
|
||||||
".env",
|
|
||||||
".gitignore",
|
|
||||||
".dockerignore",
|
|
||||||
".editorconfig",
|
|
||||||
".makefile",
|
|
||||||
".cmake",
|
|
||||||
".log",
|
|
||||||
".rst",
|
|
||||||
".tex",
|
|
||||||
".bib",
|
|
||||||
".org",
|
|
||||||
".adoc",
|
|
||||||
".asciidoc",
|
|
||||||
".vue",
|
|
||||||
".svelte",
|
|
||||||
".astro",
|
|
||||||
".tf",
|
|
||||||
".hcl",
|
|
||||||
".proto",
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
AUDIO_EXTENSIONS = frozenset(
|
|
||||||
{
|
|
||||||
".mp3",
|
|
||||||
".mp4",
|
|
||||||
".mpeg",
|
|
||||||
".mpga",
|
|
||||||
".m4a",
|
|
||||||
".wav",
|
|
||||||
".webm",
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
DIRECT_CONVERT_EXTENSIONS = frozenset({".csv", ".tsv", ".html", ".htm"})
|
|
||||||
|
|
||||||
|
|
||||||
def _is_plaintext_file(filename: str) -> bool:
|
|
||||||
return Path(filename).suffix.lower() in PLAINTEXT_EXTENSIONS
|
|
||||||
|
|
||||||
|
|
||||||
def _is_audio_file(filename: str) -> bool:
|
|
||||||
return Path(filename).suffix.lower() in AUDIO_EXTENSIONS
|
|
||||||
|
|
||||||
|
|
||||||
def _is_direct_convert_file(filename: str) -> bool:
|
|
||||||
return Path(filename).suffix.lower() in DIRECT_CONVERT_EXTENSIONS
|
|
||||||
|
|
||||||
|
|
||||||
def _needs_etl(filename: str) -> bool:
|
|
||||||
"""File is not plaintext, not audio, and not direct-convert — requires ETL."""
|
|
||||||
return (
|
|
||||||
not _is_plaintext_file(filename)
|
|
||||||
and not _is_audio_file(filename)
|
|
||||||
and not _is_direct_convert_file(filename)
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
HeartbeatCallbackType = Callable[[int], Awaitable[None]]
|
HeartbeatCallbackType = Callable[[int], Awaitable[None]]
|
||||||
|
|
||||||
|
|
||||||
|
|
@@ -279,57 +152,19 @@ def scan_folder(
     return files


-def _read_plaintext_file(file_path: str) -> str:
-    """Read a plaintext/text-based file as UTF-8."""
-    with open(file_path, encoding="utf-8", errors="replace") as f:
-        content = f.read()
-    if "\x00" in content:
-        raise ValueError(
-            f"File contains null bytes — likely a binary file opened as text: {file_path}"
-        )
-    return content
-
-
 async def _read_file_content(file_path: str, filename: str) -> str:
-    """Read file content, using ETL for binary formats.
+    """Read file content via the unified ETL pipeline.

-    Plaintext files are read directly. Audio and document files (PDF, DOCX, etc.)
-    are routed through the configured ETL service (same as Google Drive / OneDrive).
-
-    Raises ValueError if the file cannot be parsed (e.g. no ETL service configured
-    for a binary file).
+    All file types (plaintext, audio, direct-convert, document) are handled
+    by ``EtlPipelineService``.
     """
-    if _is_plaintext_file(filename):
-        return _read_plaintext_file(file_path)
-
-    if _is_direct_convert_file(filename):
-        from app.tasks.document_processors._direct_converters import (
-            convert_file_directly,
-        )
-
-        return convert_file_directly(file_path, filename)
-
-    if _is_audio_file(filename):
-        etl_service = config.ETL_SERVICE if hasattr(config, "ETL_SERVICE") else None
-        stt_service_val = config.STT_SERVICE if hasattr(config, "STT_SERVICE") else None
-        if not stt_service_val and not etl_service:
-            raise ValueError(
-                f"No STT_SERVICE configured — cannot transcribe audio file: {filename}"
-            )
-
-    if _needs_etl(filename):
-        etl_service = getattr(config, "ETL_SERVICE", None)
-        if not etl_service:
-            raise ValueError(
-                f"No ETL_SERVICE configured — cannot parse binary file: {filename}. "
-                f"Set ETL_SERVICE to UNSTRUCTURED, LLAMACLOUD, or DOCLING in your .env"
-            )
-
-        from app.connectors.onedrive.content_extractor import (
-            _parse_file_to_markdown,
-        )
-
-        return await _parse_file_to_markdown(file_path, filename)
+    from app.etl_pipeline.etl_document import EtlRequest
+    from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
+
+    result = await EtlPipelineService().extract(
+        EtlRequest(file_path=file_path, filename=filename)
+    )
+    return result.markdown_content


 def _content_hash(content: str, search_space_id: int) -> str:
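A hedged usage sketch of the unified extraction call: only the shape visible in this diff is assumed (an EtlRequest built from file_path and filename, and a result exposing markdown_content); no other parameters of the service are assumed.

from app.etl_pipeline.etl_document import EtlRequest
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService

async def extract_markdown(path: str, name: str) -> str:
    # Every file type (plaintext, audio, PDF, ...) goes through the same service,
    # replacing the per-extension branching removed in this commit.
    result = await EtlPipelineService().extract(EtlRequest(file_path=path, filename=name))
    return result.markdown_content

# Awaited from the indexing pipeline, e.g.:
#     content = await extract_markdown("/tmp/report.pdf", "report.pdf")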
@ -56,7 +56,10 @@ async def _should_skip_file(
|
||||||
file_id = file.get("id")
|
file_id = file.get("id")
|
||||||
file_name = file.get("name", "Unknown")
|
file_name = file.get("name", "Unknown")
|
||||||
|
|
||||||
if skip_item(file):
|
skip, unsup_ext = skip_item(file)
|
||||||
|
if skip:
|
||||||
|
if unsup_ext:
|
||||||
|
return True, f"unsupported:{unsup_ext}"
|
||||||
return True, "folder/onenote/remote"
|
return True, "folder/onenote/remote"
|
||||||
if not file_id:
|
if not file_id:
|
||||||
return True, "missing file_id"
|
return True, "missing file_id"
|
||||||
|
|
@ -290,7 +293,7 @@ async def _index_selected_files(
|
||||||
user_id: str,
|
user_id: str,
|
||||||
enable_summary: bool,
|
enable_summary: bool,
|
||||||
on_heartbeat: HeartbeatCallbackType | None = None,
|
on_heartbeat: HeartbeatCallbackType | None = None,
|
||||||
) -> tuple[int, int, list[str]]:
|
) -> tuple[int, int, int, list[str]]:
|
||||||
"""Index user-selected files using the parallel pipeline."""
|
"""Index user-selected files using the parallel pipeline."""
|
||||||
page_limit_service = PageLimitService(session)
|
page_limit_service = PageLimitService(session)
|
||||||
pages_used, pages_limit = await page_limit_service.get_page_usage(user_id)
|
pages_used, pages_limit = await page_limit_service.get_page_usage(user_id)
|
||||||
|
|
@ -301,6 +304,7 @@ async def _index_selected_files(
|
||||||
errors: list[str] = []
|
errors: list[str] = []
|
||||||
renamed_count = 0
|
renamed_count = 0
|
||||||
skipped = 0
|
skipped = 0
|
||||||
|
unsupported_count = 0
|
||||||
|
|
||||||
for file_id, file_name in file_ids:
|
for file_id, file_name in file_ids:
|
||||||
file, error = await get_file_by_id(onedrive_client, file_id)
|
file, error = await get_file_by_id(onedrive_client, file_id)
|
||||||
|
|
@ -311,7 +315,9 @@ async def _index_selected_files(
|
||||||
|
|
||||||
skip, msg = await _should_skip_file(session, file, search_space_id)
|
skip, msg = await _should_skip_file(session, file, search_space_id)
|
||||||
if skip:
|
if skip:
|
||||||
if msg and "renamed" in msg.lower():
|
if msg and msg.startswith("unsupported:"):
|
||||||
|
unsupported_count += 1
|
||||||
|
elif msg and "renamed" in msg.lower():
|
||||||
renamed_count += 1
|
renamed_count += 1
|
||||||
else:
|
else:
|
||||||
skipped += 1
|
skipped += 1
|
||||||
|
|
@ -347,7 +353,7 @@ async def _index_selected_files(
|
||||||
user_id, pages_to_deduct, allow_exceed=True
|
user_id, pages_to_deduct, allow_exceed=True
|
||||||
)
|
)
|
||||||
|
|
||||||
return renamed_count + batch_indexed, skipped, errors
|
return renamed_count + batch_indexed, skipped, unsupported_count, errors
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
@ -369,8 +375,11 @@ async def _index_full_scan(
|
||||||
include_subfolders: bool = True,
|
include_subfolders: bool = True,
|
||||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||||
enable_summary: bool = True,
|
enable_summary: bool = True,
|
||||||
) -> tuple[int, int]:
|
) -> tuple[int, int, int]:
|
||||||
"""Full scan indexing of a folder."""
|
"""Full scan indexing of a folder.
|
||||||
|
|
||||||
|
Returns (indexed, skipped, unsupported_count).
|
||||||
|
"""
|
||||||
await task_logger.log_task_progress(
|
await task_logger.log_task_progress(
|
||||||
log_entry,
|
log_entry,
|
||||||
f"Starting full scan of folder: {folder_name}",
|
f"Starting full scan of folder: {folder_name}",
|
||||||
|
|
@ -389,6 +398,7 @@ async def _index_full_scan(
|
||||||
|
|
||||||
renamed_count = 0
|
renamed_count = 0
|
||||||
skipped = 0
|
skipped = 0
|
||||||
|
unsupported_count = 0
|
||||||
files_to_download: list[dict] = []
|
files_to_download: list[dict] = []
|
||||||
|
|
||||||
all_files, error = await get_files_in_folder(
|
all_files, error = await get_files_in_folder(
|
||||||
|
|
@ -407,7 +417,9 @@ async def _index_full_scan(
|
||||||
for file in all_files[:max_files]:
|
for file in all_files[:max_files]:
|
||||||
skip, msg = await _should_skip_file(session, file, search_space_id)
|
skip, msg = await _should_skip_file(session, file, search_space_id)
|
||||||
if skip:
|
if skip:
|
||||||
if msg and "renamed" in msg.lower():
|
if msg and msg.startswith("unsupported:"):
|
||||||
|
unsupported_count += 1
|
||||||
|
elif msg and "renamed" in msg.lower():
|
||||||
renamed_count += 1
|
renamed_count += 1
|
||||||
else:
|
else:
|
||||||
skipped += 1
|
skipped += 1
|
||||||
|
|
@ -450,9 +462,10 @@ async def _index_full_scan(
|
||||||
|
|
||||||
indexed = renamed_count + batch_indexed
|
indexed = renamed_count + batch_indexed
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Full scan complete: {indexed} indexed, {skipped} skipped, {failed} failed"
|
f"Full scan complete: {indexed} indexed, {skipped} skipped, "
|
||||||
|
f"{unsupported_count} unsupported, {failed} failed"
|
||||||
)
|
)
|
||||||
return indexed, skipped
|
return indexed, skipped, unsupported_count
|
||||||
|
|
||||||
|
|
||||||
async def _index_with_delta_sync(
|
async def _index_with_delta_sync(
|
||||||
|
|
@ -468,8 +481,11 @@ async def _index_with_delta_sync(
|
||||||
max_files: int,
|
max_files: int,
|
||||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||||
enable_summary: bool = True,
|
enable_summary: bool = True,
|
||||||
) -> tuple[int, int, str | None]:
|
) -> tuple[int, int, int, str | None]:
|
||||||
"""Delta sync using OneDrive change tracking. Returns (indexed, skipped, new_delta_link)."""
|
"""Delta sync using OneDrive change tracking.
|
||||||
|
|
||||||
|
Returns (indexed, skipped, unsupported_count, new_delta_link).
|
||||||
|
"""
|
||||||
await task_logger.log_task_progress(
|
await task_logger.log_task_progress(
|
||||||
log_entry,
|
log_entry,
|
||||||
"Starting delta sync",
|
"Starting delta sync",
|
||||||
|
|
@ -489,7 +505,7 @@ async def _index_with_delta_sync(
|
||||||
|
|
||||||
if not changes:
|
if not changes:
|
||||||
logger.info("No changes detected since last sync")
|
logger.info("No changes detected since last sync")
|
||||||
return 0, 0, new_delta_link
|
return 0, 0, 0, new_delta_link
|
||||||
|
|
||||||
logger.info(f"Processing {len(changes)} delta changes")
|
logger.info(f"Processing {len(changes)} delta changes")
|
||||||
|
|
||||||
|
|
@ -501,6 +517,7 @@ async def _index_with_delta_sync(
|
||||||
|
|
||||||
renamed_count = 0
|
renamed_count = 0
|
||||||
skipped = 0
|
skipped = 0
|
||||||
|
unsupported_count = 0
|
||||||
files_to_download: list[dict] = []
|
files_to_download: list[dict] = []
|
||||||
files_processed = 0
|
files_processed = 0
|
||||||
|
|
||||||
|
|
@ -523,7 +540,9 @@ async def _index_with_delta_sync(
|
||||||
|
|
||||||
skip, msg = await _should_skip_file(session, change, search_space_id)
|
skip, msg = await _should_skip_file(session, change, search_space_id)
|
||||||
if skip:
|
if skip:
|
||||||
if msg and "renamed" in msg.lower():
|
if msg and msg.startswith("unsupported:"):
|
||||||
|
unsupported_count += 1
|
||||||
|
elif msg and "renamed" in msg.lower():
|
||||||
renamed_count += 1
|
renamed_count += 1
|
||||||
else:
|
else:
|
||||||
skipped += 1
|
skipped += 1
|
||||||
|
|
@ -566,9 +585,10 @@ async def _index_with_delta_sync(
|
||||||
|
|
||||||
indexed = renamed_count + batch_indexed
|
indexed = renamed_count + batch_indexed
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Delta sync complete: {indexed} indexed, {skipped} skipped, {failed} failed"
|
f"Delta sync complete: {indexed} indexed, {skipped} skipped, "
|
||||||
|
f"{unsupported_count} unsupported, {failed} failed"
|
||||||
)
|
)
|
||||||
return indexed, skipped, new_delta_link
|
return indexed, skipped, unsupported_count, new_delta_link
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
@ -582,7 +602,7 @@ async def index_onedrive_files(
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
items_dict: dict,
|
items_dict: dict,
|
||||||
) -> tuple[int, int, str | None]:
|
) -> tuple[int, int, str | None, int]:
|
||||||
"""Index OneDrive files for a specific connector.
|
"""Index OneDrive files for a specific connector.
|
||||||
|
|
||||||
items_dict format:
|
items_dict format:
|
||||||
|
|
@ -609,7 +629,7 @@ async def index_onedrive_files(
|
||||||
await task_logger.log_task_failure(
|
await task_logger.log_task_failure(
|
||||||
log_entry, error_msg, None, {"error_type": "ConnectorNotFound"}
|
log_entry, error_msg, None, {"error_type": "ConnectorNotFound"}
|
||||||
)
|
)
|
||||||
return 0, 0, error_msg
|
return 0, 0, error_msg, 0
|
||||||
|
|
||||||
token_encrypted = connector.config.get("_token_encrypted", False)
|
token_encrypted = connector.config.get("_token_encrypted", False)
|
||||||
if token_encrypted and not config.SECRET_KEY:
|
if token_encrypted and not config.SECRET_KEY:
|
||||||
|
|
@ -620,7 +640,7 @@ async def index_onedrive_files(
|
||||||
"Missing SECRET_KEY",
|
"Missing SECRET_KEY",
|
||||||
{"error_type": "MissingSecretKey"},
|
{"error_type": "MissingSecretKey"},
|
||||||
)
|
)
|
||||||
return 0, 0, error_msg
|
return 0, 0, error_msg, 0
|
||||||
|
|
||||||
connector_enable_summary = getattr(connector, "enable_summary", True)
|
connector_enable_summary = getattr(connector, "enable_summary", True)
|
||||||
onedrive_client = OneDriveClient(session, connector_id)
|
onedrive_client = OneDriveClient(session, connector_id)
|
||||||
|
|
@ -632,12 +652,13 @@ async def index_onedrive_files(
|
||||||
|
|
||||||
total_indexed = 0
|
total_indexed = 0
|
||||||
total_skipped = 0
|
total_skipped = 0
|
||||||
|
total_unsupported = 0
|
||||||
|
|
||||||
        # Index selected individual files
        selected_files = items_dict.get("files", [])
        if selected_files:
            file_tuples = [(f["id"], f.get("name")) for f in selected_files]
-           indexed, skipped, _errors = await _index_selected_files(
+           indexed, skipped, unsupported, _errors = await _index_selected_files(
                onedrive_client,
                session,
                file_tuples,
@ -648,6 +669,7 @@ async def index_onedrive_files(
            )
            total_indexed += indexed
            total_skipped += skipped
+           total_unsupported += unsupported

        # Index selected folders
        folders = items_dict.get("folders", [])
@ -661,7 +683,7 @@ async def index_onedrive_files(

            if can_use_delta:
                logger.info(f"Using delta sync for folder {folder_name}")
-               indexed, skipped, new_delta_link = await _index_with_delta_sync(
+               indexed, skipped, unsup, new_delta_link = await _index_with_delta_sync(
                    onedrive_client,
                    session,
                    connector_id,
@ -676,6 +698,7 @@ async def index_onedrive_files(
                )
                total_indexed += indexed
                total_skipped += skipped
+               total_unsupported += unsup

                if new_delta_link:
                    await session.refresh(connector)
@ -685,7 +708,7 @@ async def index_onedrive_files(
                    flag_modified(connector, "config")

                # Reconciliation full scan
-               ri, rs = await _index_full_scan(
+               ri, rs, ru = await _index_full_scan(
                    onedrive_client,
                    session,
                    connector_id,
@ -701,9 +724,10 @@ async def index_onedrive_files(
                )
                total_indexed += ri
                total_skipped += rs
+               total_unsupported += ru
            else:
                logger.info(f"Using full scan for folder {folder_name}")
-               indexed, skipped = await _index_full_scan(
+               indexed, skipped, unsup = await _index_full_scan(
                    onedrive_client,
                    session,
                    connector_id,
@ -719,6 +743,7 @@ async def index_onedrive_files(
                )
                total_indexed += indexed
                total_skipped += skipped
+               total_unsupported += unsup

                # Store new delta link for this folder
                _, new_delta_link, _ = await onedrive_client.get_delta(folder_id=folder_id)
@ -737,12 +762,18 @@ async def index_onedrive_files(
        await task_logger.log_task_success(
            log_entry,
            f"Successfully completed OneDrive indexing for connector {connector_id}",
-           {"files_processed": total_indexed, "files_skipped": total_skipped},
+           {
+               "files_processed": total_indexed,
+               "files_skipped": total_skipped,
+               "files_unsupported": total_unsupported,
+           },
        )
        logger.info(
-           f"OneDrive indexing completed: {total_indexed} indexed, {total_skipped} skipped"
+           f"OneDrive indexing completed: {total_indexed} indexed, "
+           f"{total_skipped} skipped, {total_unsupported} unsupported"
        )
-       return total_indexed, total_skipped, None
+       return total_indexed, total_skipped, None, total_unsupported

    except SQLAlchemyError as db_error:
        await session.rollback()
@ -753,7 +784,7 @@ async def index_onedrive_files(
            {"error_type": "SQLAlchemyError"},
        )
        logger.error(f"Database error: {db_error!s}", exc_info=True)
-       return 0, 0, f"Database error: {db_error!s}"
+       return 0, 0, f"Database error: {db_error!s}", 0
    except Exception as e:
        await session.rollback()
        await task_logger.log_task_failure(
@ -763,4 +794,4 @@ async def index_onedrive_files(
            {"error_type": type(e).__name__},
        )
        logger.error(f"Failed to index OneDrive files: {e!s}", exc_info=True)
-       return 0, 0, f"Failed to index OneDrive files: {e!s}"
+       return 0, 0, f"Failed to index OneDrive files: {e!s}", 0

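# A minimal caller-side sketch (not part of this commit): the hunks above change
# index_onedrive_files to return a 4-tuple (indexed, skipped, error, unsupported),
# so an existing caller that unpacked three values would need an update along these
# lines. The argument list and the logger name are illustrative, not taken from the diff.
indexed, skipped, error, unsupported = await index_onedrive_files(
    connector_id, search_space_id, user_id, session
)
if error:
    logger.error("OneDrive indexing failed: %s", error)
else:
    logger.info(
        "OneDrive indexing finished: %d indexed, %d skipped, %d unsupported",
        indexed, skipped, unsupported,
    )
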
@ -1,41 +1,17 @@
 """
 Document processors module for background tasks.

-This module provides a collection of document processors for different content types
-and sources. Each processor is responsible for handling a specific type of document
-processing task in the background.
+Content extraction is handled by ``app.etl_pipeline.EtlPipelineService``.
+This package keeps orchestration (save, notify, page-limit) and
+non-ETL processors (extension, markdown, youtube).

-Available processors:
-- Extension processor: Handle documents from browser extension
-- Markdown processor: Process markdown files
-- File processors: Handle files using different ETL services (Unstructured, LlamaCloud, Docling)
-- YouTube processor: Process YouTube videos and extract transcripts
 """

-# Extension processor
-# File processors (backward-compatible re-exports from _save)
-from ._save import (
-    add_received_file_document_using_docling,
-    add_received_file_document_using_llamacloud,
-    add_received_file_document_using_unstructured,
-)
 from .extension_processor import add_extension_received_document

-# Markdown processor
 from .markdown_processor import add_received_markdown_file_document

-# YouTube processor
 from .youtube_processor import add_youtube_video_document

 __all__ = [
-    # Extension processing
     "add_extension_received_document",
-    # File processing with different ETL services
-    "add_received_file_document_using_docling",
-    "add_received_file_document_using_llamacloud",
-    "add_received_file_document_using_unstructured",
-    # Markdown file processing
     "add_received_markdown_file_document",
-    # YouTube video processing
     "add_youtube_video_document",
 ]

@ -1,74 +0,0 @@
-"""
-Constants for file document processing.
-
-Centralizes file type classification, LlamaCloud retry configuration,
-and timeout calculation parameters.
-"""
-
-import ssl
-from enum import Enum
-
-import httpx
-
-# ---------------------------------------------------------------------------
-# File type classification
-# ---------------------------------------------------------------------------
-
-MARKDOWN_EXTENSIONS = (".md", ".markdown", ".txt")
-AUDIO_EXTENSIONS = (".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm")
-DIRECT_CONVERT_EXTENSIONS = (".csv", ".tsv", ".html", ".htm")
-
-
-class FileCategory(Enum):
-    MARKDOWN = "markdown"
-    AUDIO = "audio"
-    DIRECT_CONVERT = "direct_convert"
-    DOCUMENT = "document"
-
-
-def classify_file(filename: str) -> FileCategory:
-    """Classify a file by its extension into a processing category."""
-    lower = filename.lower()
-    if lower.endswith(MARKDOWN_EXTENSIONS):
-        return FileCategory.MARKDOWN
-    if lower.endswith(AUDIO_EXTENSIONS):
-        return FileCategory.AUDIO
-    if lower.endswith(DIRECT_CONVERT_EXTENSIONS):
-        return FileCategory.DIRECT_CONVERT
-    return FileCategory.DOCUMENT
-
-
-# ---------------------------------------------------------------------------
-# LlamaCloud retry configuration
-# ---------------------------------------------------------------------------
-
-LLAMACLOUD_MAX_RETRIES = 5
-LLAMACLOUD_BASE_DELAY = 10  # seconds (exponential backoff base)
-LLAMACLOUD_MAX_DELAY = 120  # max delay between retries (2 minutes)
-LLAMACLOUD_RETRYABLE_EXCEPTIONS = (
-    ssl.SSLError,
-    httpx.ConnectError,
-    httpx.ConnectTimeout,
-    httpx.ReadError,
-    httpx.ReadTimeout,
-    httpx.WriteError,
-    httpx.WriteTimeout,
-    httpx.RemoteProtocolError,
-    httpx.LocalProtocolError,
-    ConnectionError,
-    ConnectionResetError,
-    TimeoutError,
-    OSError,
-)
-
-# ---------------------------------------------------------------------------
-# Timeout calculation constants
-# ---------------------------------------------------------------------------
-
-UPLOAD_BYTES_PER_SECOND_SLOW = (
-    100 * 1024
-)  # 100 KB/s (conservative for slow connections)
-MIN_UPLOAD_TIMEOUT = 120  # Minimum 2 minutes for any file
-MAX_UPLOAD_TIMEOUT = 1800  # Maximum 30 minutes for very large files
-BASE_JOB_TIMEOUT = 600  # 10 minutes base for job processing
-PER_PAGE_JOB_TIMEOUT = 60  # 1 minute per page for processing

@ -5,7 +5,7 @@ These converters handle file types that can be faithfully represented as
 markdown without any external ETL/OCR service:

 - CSV / TSV → markdown table (stdlib ``csv``)
-- HTML / HTM → markdown (``markdownify``)
+- HTML / HTM / XHTML → markdown (``markdownify``)
 """

 from __future__ import annotations
@ -73,6 +73,7 @@ _CONVERTER_MAP: dict[str, Callable[..., str]] = {
    ".tsv": tsv_to_markdown,
    ".html": html_to_markdown,
    ".htm": html_to_markdown,
+   ".xhtml": html_to_markdown,
 }

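# The module docstring above describes a lossless CSV/TSV-to-markdown-table path
# built on the stdlib ``csv`` module. A minimal standalone sketch of that idea
# (the project's own converter is not shown in this hunk; the function name
# csv_to_markdown_table and its exact formatting are illustrative assumptions):
import csv
import io

def csv_to_markdown_table(raw: str, delimiter: str = ",") -> str:
    rows = list(csv.reader(io.StringIO(raw), delimiter=delimiter))
    if not rows:
        return ""
    header, *body = rows
    lines = [
        "| " + " | ".join(header) + " |",
        "| " + " | ".join("---" for _ in header) + " |",
    ]
    for row in body:
        lines.append("| " + " | ".join(cell.replace("\n", " ") for cell in row) + " |")
    return "\n".join(lines)
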
@ -1,209 +0,0 @@
-"""
-ETL parsing strategies for different document processing services.
-
-Provides parse functions for Unstructured, LlamaCloud, and Docling, along with
-LlamaCloud retry logic and dynamic timeout calculations.
-"""
-
-import asyncio
-import logging
-import os
-import random
-import warnings
-from logging import ERROR, getLogger
-
-import httpx
-
-from app.config import config as app_config
-from app.db import Log
-from app.services.task_logging_service import TaskLoggingService
-
-from ._constants import (
-    LLAMACLOUD_BASE_DELAY,
-    LLAMACLOUD_MAX_DELAY,
-    LLAMACLOUD_MAX_RETRIES,
-    LLAMACLOUD_RETRYABLE_EXCEPTIONS,
-    PER_PAGE_JOB_TIMEOUT,
-)
-from ._helpers import calculate_job_timeout, calculate_upload_timeout
-
-# ---------------------------------------------------------------------------
-# LlamaCloud parsing with retry
-# ---------------------------------------------------------------------------
-
-
-async def parse_with_llamacloud_retry(
-    file_path: str,
-    estimated_pages: int,
-    task_logger: TaskLoggingService | None = None,
-    log_entry: Log | None = None,
-):
-    """
-    Parse a file with LlamaCloud with retry logic for transient SSL/connection errors.
-
-    Uses dynamic timeout calculations based on file size and page count to handle
-    very large files reliably.
-
-    Returns:
-        LlamaParse result object
-
-    Raises:
-        Exception: If all retries fail
-    """
-    from llama_cloud_services import LlamaParse
-    from llama_cloud_services.parse.utils import ResultType
-
-    file_size_bytes = os.path.getsize(file_path)
-    file_size_mb = file_size_bytes / (1024 * 1024)
-
-    upload_timeout = calculate_upload_timeout(file_size_bytes)
-    job_timeout = calculate_job_timeout(estimated_pages, file_size_bytes)
-
-    custom_timeout = httpx.Timeout(
-        connect=120.0,
-        read=upload_timeout,
-        write=upload_timeout,
-        pool=120.0,
-    )
-
-    logging.info(
-        f"LlamaCloud upload configured: file_size={file_size_mb:.1f}MB, "
-        f"pages={estimated_pages}, upload_timeout={upload_timeout:.0f}s, "
-        f"job_timeout={job_timeout:.0f}s"
-    )
-
-    last_exception = None
-    attempt_errors: list[str] = []
-
-    for attempt in range(1, LLAMACLOUD_MAX_RETRIES + 1):
-        try:
-            async with httpx.AsyncClient(timeout=custom_timeout) as custom_client:
-                parser = LlamaParse(
-                    api_key=app_config.LLAMA_CLOUD_API_KEY,
-                    num_workers=1,
-                    verbose=True,
-                    language="en",
-                    result_type=ResultType.MD,
-                    max_timeout=int(max(2000, job_timeout + upload_timeout)),
-                    job_timeout_in_seconds=job_timeout,
-                    job_timeout_extra_time_per_page_in_seconds=PER_PAGE_JOB_TIMEOUT,
-                    custom_client=custom_client,
-                )
-                result = await parser.aparse(file_path)
-
-            if attempt > 1:
-                logging.info(
-                    f"LlamaCloud upload succeeded on attempt {attempt} after "
-                    f"{len(attempt_errors)} failures"
-                )
-            return result
-
-        except LLAMACLOUD_RETRYABLE_EXCEPTIONS as e:
-            last_exception = e
-            error_type = type(e).__name__
-            error_msg = str(e)[:200]
-            attempt_errors.append(f"Attempt {attempt}: {error_type} - {error_msg}")
-
-            if attempt < LLAMACLOUD_MAX_RETRIES:
-                base_delay = min(
-                    LLAMACLOUD_BASE_DELAY * (2 ** (attempt - 1)),
-                    LLAMACLOUD_MAX_DELAY,
-                )
-                jitter = base_delay * 0.25 * (2 * random.random() - 1)
-                delay = base_delay + jitter
-
-                if task_logger and log_entry:
-                    await task_logger.log_task_progress(
-                        log_entry,
-                        f"LlamaCloud upload failed "
-                        f"(attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}), "
-                        f"retrying in {delay:.0f}s",
-                        {
-                            "error_type": error_type,
-                            "error_message": error_msg,
-                            "attempt": attempt,
-                            "retry_delay": delay,
-                            "file_size_mb": round(file_size_mb, 1),
-                            "upload_timeout": upload_timeout,
-                        },
-                    )
-                else:
-                    logging.warning(
-                        f"LlamaCloud upload failed "
-                        f"(attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}): "
-                        f"{error_type}. File: {file_size_mb:.1f}MB. "
-                        f"Retrying in {delay:.0f}s..."
-                    )
-
-                await asyncio.sleep(delay)
-            else:
-                logging.error(
-                    f"LlamaCloud upload failed after {LLAMACLOUD_MAX_RETRIES} "
-                    f"attempts. File size: {file_size_mb:.1f}MB, "
-                    f"Pages: {estimated_pages}. "
-                    f"Errors: {'; '.join(attempt_errors)}"
-                )
-
-        except Exception:
-            raise
-
-    raise last_exception or RuntimeError(
-        f"LlamaCloud parsing failed after {LLAMACLOUD_MAX_RETRIES} retries. "
-        f"File size: {file_size_mb:.1f}MB"
-    )
-
-
-# ---------------------------------------------------------------------------
-# Per-service parse functions
-# ---------------------------------------------------------------------------
-
-
-async def parse_with_unstructured(file_path: str):
-    """
-    Parse a file using the Unstructured ETL service.
-
-    Returns:
-        List of LangChain Document elements.
-    """
-    from langchain_unstructured import UnstructuredLoader
-
-    loader = UnstructuredLoader(
-        file_path,
-        mode="elements",
-        post_processors=[],
-        languages=["eng"],
-        include_orig_elements=False,
-        include_metadata=False,
-        strategy="auto",
-    )
-    return await loader.aload()
-
-
-async def parse_with_docling(file_path: str, filename: str) -> str:
-    """
-    Parse a file using the Docling ETL service (via the Docling service wrapper).
-
-    Returns:
-        Markdown content string.
-    """
-    from app.services.docling_service import create_docling_service
-
-    docling_service = create_docling_service()
-
-    pdfminer_logger = getLogger("pdfminer")
-    original_level = pdfminer_logger.level
-
-    with warnings.catch_warnings():
-        warnings.filterwarnings("ignore", category=UserWarning, module="pdfminer")
-        warnings.filterwarnings(
-            "ignore", message=".*Cannot set gray non-stroke color.*"
-        )
-        warnings.filterwarnings("ignore", message=".*invalid float value.*")
-        pdfminer_logger.setLevel(ERROR)
-
-        try:
-            result = await docling_service.process_document(file_path, filename)
-        finally:
-            pdfminer_logger.setLevel(original_level)
-
-    return result["content"]

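# The removed parse_with_llamacloud_retry above spaces retries with capped
# exponential backoff plus roughly +/-25% jitter. A standalone sketch of that
# delay schedule, using the constants from the removed _constants module
# (the helper name retry_delay is illustrative):
import random

LLAMACLOUD_BASE_DELAY = 10   # seconds
LLAMACLOUD_MAX_DELAY = 120   # seconds

def retry_delay(attempt: int) -> float:
    """Delay before retry `attempt` (1-based): 10s, 20s, 40s, 80s, then capped at 120s, jittered."""
    base = min(LLAMACLOUD_BASE_DELAY * (2 ** (attempt - 1)), LLAMACLOUD_MAX_DELAY)
    return base + base * 0.25 * (2 * random.random() - 1)
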
@ -11,13 +11,6 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from app.db import Document, DocumentStatus, DocumentType
 from app.utils.document_converters import generate_unique_identifier_hash

-from ._constants import (
-    BASE_JOB_TIMEOUT,
-    MAX_UPLOAD_TIMEOUT,
-    MIN_UPLOAD_TIMEOUT,
-    PER_PAGE_JOB_TIMEOUT,
-    UPLOAD_BYTES_PER_SECOND_SLOW,
-)
 from .base import (
    check_document_by_unique_identifier,
    check_duplicate_document,
@ -198,21 +191,3 @@ async def update_document_from_connector(
    if "connector_id" in connector:
        document.connector_id = connector["connector_id"]
    await session.commit()
-
-
-# ---------------------------------------------------------------------------
-# Timeout calculations
-# ---------------------------------------------------------------------------
-
-
-def calculate_upload_timeout(file_size_bytes: int) -> float:
-    """Calculate upload timeout based on file size (conservative for slow connections)."""
-    estimated_time = (file_size_bytes / UPLOAD_BYTES_PER_SECOND_SLOW) * 1.5
-    return max(MIN_UPLOAD_TIMEOUT, min(estimated_time, MAX_UPLOAD_TIMEOUT))
-
-
-def calculate_job_timeout(estimated_pages: int, file_size_bytes: int) -> float:
-    """Calculate job processing timeout based on page count and file size."""
-    page_based_timeout = BASE_JOB_TIMEOUT + (estimated_pages * PER_PAGE_JOB_TIMEOUT)
-    size_based_timeout = BASE_JOB_TIMEOUT + (file_size_bytes / (10 * 1024 * 1024)) * 60
-    return max(page_based_timeout, size_based_timeout)

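# A quick worked example of the removed timeout helpers above, computed from the
# removed constants rather than by running the project code: for a 50 MB,
# 100-page upload the numbers work out as follows.
upload_timeout = max(120, min((50 * 1024 * 1024 / (100 * 1024)) * 1.5, 1800))
# 512 s of transfer at 100 KB/s, times 1.5 = 768.0 s, inside the [120, 1800] clamp
job_timeout = max(600 + 100 * 60, 600 + (50 * 1024 * 1024 / (10 * 1024 * 1024)) * 60)
# page-based 6600 s dominates the size-based 900 s, so job_timeout = 6600 s
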
@ -1,14 +1,9 @@
 """
 Unified document save/update logic for file processors.

-Replaces the three nearly-identical ``add_received_file_document_using_*``
-functions with a single ``save_file_document`` function plus thin wrappers
-for backward compatibility.
 """

 import logging

-from langchain_core.documents import Document as LangChainDocument
 from sqlalchemy.exc import SQLAlchemyError
 from sqlalchemy.ext.asyncio import AsyncSession

@ -207,79 +202,3 @@ async def save_file_document(
        raise RuntimeError(
            f"Failed to process file document using {etl_service}: {e!s}"
        ) from e
-
-
-# ---------------------------------------------------------------------------
-# Backward-compatible wrapper functions
-# ---------------------------------------------------------------------------
-
-
-async def add_received_file_document_using_unstructured(
-    session: AsyncSession,
-    file_name: str,
-    unstructured_processed_elements: list[LangChainDocument],
-    search_space_id: int,
-    user_id: str,
-    connector: dict | None = None,
-    enable_summary: bool = True,
-) -> Document | None:
-    """Process and store a file document using the Unstructured service."""
-    from app.utils.document_converters import convert_document_to_markdown
-
-    markdown_content = await convert_document_to_markdown(
-        unstructured_processed_elements
-    )
-    return await save_file_document(
-        session,
-        file_name,
-        markdown_content,
-        search_space_id,
-        user_id,
-        "UNSTRUCTURED",
-        connector,
-        enable_summary,
-    )
-
-
-async def add_received_file_document_using_llamacloud(
-    session: AsyncSession,
-    file_name: str,
-    llamacloud_markdown_document: str,
-    search_space_id: int,
-    user_id: str,
-    connector: dict | None = None,
-    enable_summary: bool = True,
-) -> Document | None:
-    """Process and store document content parsed by LlamaCloud."""
-    return await save_file_document(
-        session,
-        file_name,
-        llamacloud_markdown_document,
-        search_space_id,
-        user_id,
-        "LLAMACLOUD",
-        connector,
-        enable_summary,
-    )
-
-
-async def add_received_file_document_using_docling(
-    session: AsyncSession,
-    file_name: str,
-    docling_markdown_document: str,
-    search_space_id: int,
-    user_id: str,
-    connector: dict | None = None,
-    enable_summary: bool = True,
-) -> Document | None:
-    """Process and store document content parsed by Docling."""
-    return await save_file_document(
-        session,
-        file_name,
-        docling_markdown_document,
-        search_space_id,
-        user_id,
-        "DOCLING",
-        connector,
-        enable_summary,
-    )

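# With the wrappers above removed, a call site that previously used
# add_received_file_document_using_docling would pass the ETL service name to
# save_file_document directly. A sketch with placeholder argument values,
# following the positional order the removed wrappers used:
document = await save_file_document(
    session,
    file_name,
    markdown_content,
    search_space_id,
    user_id,
    "DOCLING",  # etl_service, previously implied by the wrapper name
    connector,
    enable_summary,
)
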
@ -1,14 +1,8 @@
 """
 File document processors orchestrating content extraction and indexing.

-This module is the public entry point for file processing. It delegates to
-specialised sub-modules that each own a single concern:
-
-- ``_constants`` — file type classification and configuration constants
-- ``_helpers`` — document deduplication, migration, connector helpers
-- ``_direct_converters`` — lossless file-to-markdown for csv/tsv/html
-- ``_etl`` — ETL parsing strategies (Unstructured, LlamaCloud, Docling)
-- ``_save`` — unified document creation / update logic
+Delegates content extraction to ``app.etl_pipeline.EtlPipelineService`` and
+keeps only orchestration concerns (notifications, logging, page limits, saving).
 """

 from __future__ import annotations
@ -17,38 +11,19 @@ import contextlib
 import logging
 import os
 from dataclasses import dataclass, field
-from logging import ERROR, getLogger

 from fastapi import HTTPException
 from sqlalchemy.ext.asyncio import AsyncSession

-from app.config import config as app_config
 from app.db import Document, Log, Notification
 from app.services.notification_service import NotificationService
 from app.services.task_logging_service import TaskLoggingService

-from ._constants import FileCategory, classify_file
-from ._direct_converters import convert_file_directly
-from ._etl import (
-    parse_with_docling,
-    parse_with_llamacloud_retry,
-    parse_with_unstructured,
-)
 from ._helpers import update_document_from_connector
-from ._save import (
-    add_received_file_document_using_docling,
-    add_received_file_document_using_llamacloud,
-    add_received_file_document_using_unstructured,
-    save_file_document,
-)
+from ._save import save_file_document
 from .markdown_processor import add_received_markdown_file_document

-# Re-export public API so existing ``from file_processors import …`` keeps working.
 __all__ = [
-    "add_received_file_document_using_docling",
-    "add_received_file_document_using_llamacloud",
-    "add_received_file_document_using_unstructured",
-    "parse_with_llamacloud_retry",
     "process_file_in_background",
     "process_file_in_background_with_document",
     "save_file_document",

@ -142,35 +117,31 @@ async def _log_page_divergence(
 # ===================================================================


-async def _process_markdown_upload(ctx: _ProcessingContext) -> Document | None:
-    """Read a markdown / text file and create or update a document."""
-    await _notify(ctx, "parsing", "Reading file")
+async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | None:
+    """Extract content from a non-document file (plaintext/direct_convert/audio) via the unified ETL pipeline."""
+    from app.etl_pipeline.etl_document import EtlRequest
+    from app.etl_pipeline.etl_pipeline_service import EtlPipelineService

+    await _notify(ctx, "parsing", "Processing file")
    await ctx.task_logger.log_task_progress(
        ctx.log_entry,
-       f"Processing markdown/text file: {ctx.filename}",
-       {"file_type": "markdown", "processing_stage": "reading_file"},
+       f"Processing file: {ctx.filename}",
+       {"processing_stage": "extracting"},
    )

-   with open(ctx.file_path, encoding="utf-8") as f:
-       markdown_content = f.read()
+   etl_result = await EtlPipelineService().extract(
+       EtlRequest(file_path=ctx.file_path, filename=ctx.filename)
+   )

    with contextlib.suppress(Exception):
        os.unlink(ctx.file_path)

    await _notify(ctx, "chunking")
-   await ctx.task_logger.log_task_progress(
-       ctx.log_entry,
-       f"Creating document from markdown content: {ctx.filename}",
-       {
-           "processing_stage": "creating_document",
-           "content_length": len(markdown_content),
-       },
-   )
-
    result = await add_received_markdown_file_document(
        ctx.session,
        ctx.filename,
-       markdown_content,
+       etl_result.markdown_content,
        ctx.search_space_id,
        ctx.user_id,
        ctx.connector,

@ -181,179 +152,19 @@ async def _process_markdown_upload(ctx: _ProcessingContext) -> Document | None:
    if result:
        await ctx.task_logger.log_task_success(
            ctx.log_entry,
-           f"Successfully processed markdown file: {ctx.filename}",
+           f"Successfully processed file: {ctx.filename}",
            {
                "document_id": result.id,
                "content_hash": result.content_hash,
-               "file_type": "markdown",
+               "file_type": etl_result.content_type,
+               "etl_service": etl_result.etl_service,
            },
        )
    else:
        await ctx.task_logger.log_task_success(
            ctx.log_entry,
-           f"Markdown file already exists (duplicate): {ctx.filename}",
-           {"duplicate_detected": True, "file_type": "markdown"},
-       )
-   return result
-
-
-async def _process_direct_convert_upload(ctx: _ProcessingContext) -> Document | None:
-    """Convert a text-based file (csv/tsv/html) to markdown without ETL."""
-    await _notify(ctx, "parsing", "Converting file")
-    await ctx.task_logger.log_task_progress(
-        ctx.log_entry,
-        f"Direct-converting file to markdown: {ctx.filename}",
-        {"file_type": "direct_convert", "processing_stage": "converting"},
-    )
-
-    markdown_content = convert_file_directly(ctx.file_path, ctx.filename)
-
-    with contextlib.suppress(Exception):
-        os.unlink(ctx.file_path)
-
-    await _notify(ctx, "chunking")
-    await ctx.task_logger.log_task_progress(
-        ctx.log_entry,
-        f"Creating document from converted content: {ctx.filename}",
-        {
-            "processing_stage": "creating_document",
-            "content_length": len(markdown_content),
-        },
-    )
-
-    result = await add_received_markdown_file_document(
-        ctx.session,
-        ctx.filename,
-        markdown_content,
-        ctx.search_space_id,
-        ctx.user_id,
-        ctx.connector,
-    )
-    if ctx.connector:
-        await update_document_from_connector(result, ctx.connector, ctx.session)
-
-    if result:
-        await ctx.task_logger.log_task_success(
-            ctx.log_entry,
-            f"Successfully direct-converted file: {ctx.filename}",
-            {
-                "document_id": result.id,
-                "content_hash": result.content_hash,
-                "file_type": "direct_convert",
-            },
-        )
-    else:
-        await ctx.task_logger.log_task_success(
-            ctx.log_entry,
-            f"Direct-converted file already exists (duplicate): {ctx.filename}",
-            {"duplicate_detected": True, "file_type": "direct_convert"},
-        )
-    return result
-
-
-async def _process_audio_upload(ctx: _ProcessingContext) -> Document | None:
-    """Transcribe an audio file and create or update a document."""
-    await _notify(ctx, "parsing", "Transcribing audio")
-    await ctx.task_logger.log_task_progress(
-        ctx.log_entry,
-        f"Processing audio file for transcription: {ctx.filename}",
-        {"file_type": "audio", "processing_stage": "starting_transcription"},
-    )
-
-    stt_service_type = (
-        "local"
-        if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/")
-        else "external"
-    )
-
-    if stt_service_type == "local":
-        from app.services.stt_service import stt_service
-
-        try:
-            stt_result = stt_service.transcribe_file(ctx.file_path)
-            transcribed_text = stt_result.get("text", "")
-            if not transcribed_text:
-                raise ValueError("Transcription returned empty text")
-            transcribed_text = (
-                f"# Transcription of {ctx.filename}\n\n{transcribed_text}"
-            )
-        except Exception as e:
-            raise HTTPException(
-                status_code=422,
-                detail=f"Failed to transcribe audio file {ctx.filename}: {e!s}",
-            ) from e
-
-        await ctx.task_logger.log_task_progress(
-            ctx.log_entry,
-            f"Local STT transcription completed: {ctx.filename}",
-            {
-                "processing_stage": "local_transcription_complete",
-                "language": stt_result.get("language"),
-                "confidence": stt_result.get("language_probability"),
-                "duration": stt_result.get("duration"),
-            },
-        )
-    else:
-        from litellm import atranscription
-
-        with open(ctx.file_path, "rb") as audio_file:
-            transcription_kwargs: dict = {
-                "model": app_config.STT_SERVICE,
-                "file": audio_file,
-                "api_key": app_config.STT_SERVICE_API_KEY,
-            }
-            if app_config.STT_SERVICE_API_BASE:
-                transcription_kwargs["api_base"] = app_config.STT_SERVICE_API_BASE
-
-            transcription_response = await atranscription(**transcription_kwargs)
-            transcribed_text = transcription_response.get("text", "")
-            if not transcribed_text:
-                raise ValueError("Transcription returned empty text")
-
-        transcribed_text = f"# Transcription of {ctx.filename}\n\n{transcribed_text}"
-
-    await ctx.task_logger.log_task_progress(
-        ctx.log_entry,
-        f"Transcription completed, creating document: {ctx.filename}",
-        {
-            "processing_stage": "transcription_complete",
-            "transcript_length": len(transcribed_text),
-        },
-    )
-
-    await _notify(ctx, "chunking")
-
-    with contextlib.suppress(Exception):
-        os.unlink(ctx.file_path)
-
-    result = await add_received_markdown_file_document(
-        ctx.session,
-        ctx.filename,
-        transcribed_text,
-        ctx.search_space_id,
-        ctx.user_id,
-        ctx.connector,
-    )
-    if ctx.connector:
-        await update_document_from_connector(result, ctx.connector, ctx.session)
-
-    if result:
-        await ctx.task_logger.log_task_success(
-            ctx.log_entry,
-            f"Successfully transcribed and processed audio file: {ctx.filename}",
-            {
-                "document_id": result.id,
-                "content_hash": result.content_hash,
-                "file_type": "audio",
-                "transcript_length": len(transcribed_text),
-                "stt_service": stt_service_type,
-            },
-        )
-    else:
-        await ctx.task_logger.log_task_success(
-            ctx.log_entry,
-            f"Audio file transcript already exists (duplicate): {ctx.filename}",
-            {"duplicate_detected": True, "file_type": "audio"},
+           f"File already exists (duplicate): {ctx.filename}",
+           {"duplicate_detected": True, "file_type": etl_result.content_type},
        )
    return result

@ -363,279 +174,10 @@ async def _process_audio_upload(ctx: _ProcessingContext) -> Document | None:
 # ---------------------------------------------------------------------------


-async def _etl_unstructured(
-    ctx: _ProcessingContext,
-    page_limit_service,
-    estimated_pages: int,
-) -> Document | None:
-    """Parse and save via the Unstructured ETL service."""
-    await _notify(ctx, "parsing", "Extracting content")
-    await ctx.task_logger.log_task_progress(
-        ctx.log_entry,
-        f"Processing file with Unstructured ETL: {ctx.filename}",
-        {
-            "file_type": "document",
-            "etl_service": "UNSTRUCTURED",
-            "processing_stage": "loading",
-        },
-    )
-
-    docs = await parse_with_unstructured(ctx.file_path)
-
-    await _notify(ctx, "chunking", chunks_count=len(docs))
-    await ctx.task_logger.log_task_progress(
-        ctx.log_entry,
-        f"Unstructured ETL completed, creating document: {ctx.filename}",
-        {"processing_stage": "etl_complete", "elements_count": len(docs)},
-    )
-
-    actual_pages = page_limit_service.estimate_pages_from_elements(docs)
-    final_pages = max(estimated_pages, actual_pages)
-    await _log_page_divergence(
-        ctx.task_logger,
-        ctx.log_entry,
-        ctx.filename,
-        estimated_pages,
-        actual_pages,
-        final_pages,
-    )
-
-    with contextlib.suppress(Exception):
-        os.unlink(ctx.file_path)
-
-    result = await add_received_file_document_using_unstructured(
-        ctx.session,
-        ctx.filename,
-        docs,
-        ctx.search_space_id,
-        ctx.user_id,
-        ctx.connector,
-        enable_summary=ctx.enable_summary,
-    )
-    if ctx.connector:
-        await update_document_from_connector(result, ctx.connector, ctx.session)
-
-    if result:
-        await page_limit_service.update_page_usage(
-            ctx.user_id, final_pages, allow_exceed=True
-        )
-        await ctx.task_logger.log_task_success(
-            ctx.log_entry,
-            f"Successfully processed file with Unstructured: {ctx.filename}",
-            {
-                "document_id": result.id,
-                "content_hash": result.content_hash,
-                "file_type": "document",
-                "etl_service": "UNSTRUCTURED",
-                "pages_processed": final_pages,
-            },
-        )
-    else:
-        await ctx.task_logger.log_task_success(
-            ctx.log_entry,
-            f"Document already exists (duplicate): {ctx.filename}",
-            {
-                "duplicate_detected": True,
-                "file_type": "document",
-                "etl_service": "UNSTRUCTURED",
-            },
-        )
-    return result
-
-
-async def _etl_llamacloud(
-    ctx: _ProcessingContext,
-    page_limit_service,
-    estimated_pages: int,
-) -> Document | None:
-    """Parse and save via the LlamaCloud ETL service."""
-    await _notify(ctx, "parsing", "Extracting content")
-    await ctx.task_logger.log_task_progress(
-        ctx.log_entry,
-        f"Processing file with LlamaCloud ETL: {ctx.filename}",
-        {
-            "file_type": "document",
-            "etl_service": "LLAMACLOUD",
-            "processing_stage": "parsing",
-            "estimated_pages": estimated_pages,
-        },
-    )
-
-    raw_result = await parse_with_llamacloud_retry(
-        file_path=ctx.file_path,
-        estimated_pages=estimated_pages,
-        task_logger=ctx.task_logger,
-        log_entry=ctx.log_entry,
-    )
-
-    with contextlib.suppress(Exception):
-        os.unlink(ctx.file_path)
-
-    markdown_documents = await raw_result.aget_markdown_documents(split_by_page=False)
-
-    await _notify(ctx, "chunking", chunks_count=len(markdown_documents))
-    await ctx.task_logger.log_task_progress(
-        ctx.log_entry,
-        f"LlamaCloud parsing completed, creating documents: {ctx.filename}",
-        {
-            "processing_stage": "parsing_complete",
-            "documents_count": len(markdown_documents),
-        },
-    )
-
-    if not markdown_documents:
-        await ctx.task_logger.log_task_failure(
-            ctx.log_entry,
-            f"LlamaCloud parsing returned no documents: {ctx.filename}",
-            "ETL service returned empty document list",
-            {"error_type": "EmptyDocumentList", "etl_service": "LLAMACLOUD"},
-        )
-        raise ValueError(f"LlamaCloud parsing returned no documents for {ctx.filename}")
-
-    actual_pages = page_limit_service.estimate_pages_from_markdown(markdown_documents)
-    final_pages = max(estimated_pages, actual_pages)
-    await _log_page_divergence(
-        ctx.task_logger,
-        ctx.log_entry,
-        ctx.filename,
-        estimated_pages,
-        actual_pages,
-        final_pages,
-    )
-
-    any_created = False
-    last_doc: Document | None = None
-
-    for doc in markdown_documents:
-        doc_result = await add_received_file_document_using_llamacloud(
-            ctx.session,
-            ctx.filename,
-            llamacloud_markdown_document=doc.text,
-            search_space_id=ctx.search_space_id,
-            user_id=ctx.user_id,
-            connector=ctx.connector,
-            enable_summary=ctx.enable_summary,
-        )
-        if doc_result:
-            any_created = True
-            last_doc = doc_result
-
-    if any_created:
-        await page_limit_service.update_page_usage(
-            ctx.user_id, final_pages, allow_exceed=True
-        )
-        if ctx.connector:
-            await update_document_from_connector(last_doc, ctx.connector, ctx.session)
-        await ctx.task_logger.log_task_success(
-            ctx.log_entry,
-            f"Successfully processed file with LlamaCloud: {ctx.filename}",
-            {
-                "document_id": last_doc.id,
-                "content_hash": last_doc.content_hash,
-                "file_type": "document",
-                "etl_service": "LLAMACLOUD",
-                "pages_processed": final_pages,
-                "documents_count": len(markdown_documents),
-            },
-        )
-        return last_doc
-
-    await ctx.task_logger.log_task_success(
-        ctx.log_entry,
-        f"Document already exists (duplicate): {ctx.filename}",
-        {
-            "duplicate_detected": True,
-            "file_type": "document",
-            "etl_service": "LLAMACLOUD",
-            "documents_count": len(markdown_documents),
-        },
-    )
-    return None
-
-
-async def _etl_docling(
-    ctx: _ProcessingContext,
-    page_limit_service,
-    estimated_pages: int,
-) -> Document | None:
-    """Parse and save via the Docling ETL service."""
-    await _notify(ctx, "parsing", "Extracting content")
-    await ctx.task_logger.log_task_progress(
-        ctx.log_entry,
-        f"Processing file with Docling ETL: {ctx.filename}",
-        {
-            "file_type": "document",
-            "etl_service": "DOCLING",
-            "processing_stage": "parsing",
-        },
-    )
-
-    content = await parse_with_docling(ctx.file_path, ctx.filename)
-
-    with contextlib.suppress(Exception):
-        os.unlink(ctx.file_path)
-
-    await ctx.task_logger.log_task_progress(
-        ctx.log_entry,
-        f"Docling parsing completed, creating document: {ctx.filename}",
-        {"processing_stage": "parsing_complete", "content_length": len(content)},
-    )
-
-    actual_pages = page_limit_service.estimate_pages_from_content_length(len(content))
-    final_pages = max(estimated_pages, actual_pages)
-    await _log_page_divergence(
-        ctx.task_logger,
-        ctx.log_entry,
-        ctx.filename,
-        estimated_pages,
-        actual_pages,
-        final_pages,
-    )
-
-    await _notify(ctx, "chunking")
-
-    result = await add_received_file_document_using_docling(
-        ctx.session,
-        ctx.filename,
-        docling_markdown_document=content,
-        search_space_id=ctx.search_space_id,
-        user_id=ctx.user_id,
-        connector=ctx.connector,
-        enable_summary=ctx.enable_summary,
-    )
-
-    if result:
-        await page_limit_service.update_page_usage(
-            ctx.user_id, final_pages, allow_exceed=True
-        )
-        if ctx.connector:
-            await update_document_from_connector(result, ctx.connector, ctx.session)
-        await ctx.task_logger.log_task_success(
-            ctx.log_entry,
-            f"Successfully processed file with Docling: {ctx.filename}",
-            {
-                "document_id": result.id,
-                "content_hash": result.content_hash,
-                "file_type": "document",
-                "etl_service": "DOCLING",
-                "pages_processed": final_pages,
-            },
-        )
-    else:
-        await ctx.task_logger.log_task_success(
-            ctx.log_entry,
-            f"Document already exists (duplicate): {ctx.filename}",
-            {
-                "duplicate_detected": True,
-                "file_type": "document",
-                "etl_service": "DOCLING",
-            },
-        )
-    return result
-
-
 async def _process_document_upload(ctx: _ProcessingContext) -> Document | None:
-    """Route a document file to the configured ETL service."""
+    """Route a document file to the configured ETL service via the unified pipeline."""
+    from app.etl_pipeline.etl_document import EtlRequest
+    from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
    from app.services.page_limit_service import PageLimitExceededError, PageLimitService

    page_limit_service = PageLimitService(ctx.session)

@ -665,16 +207,60 @@ async def _process_document_upload(ctx: _ProcessingContext) -> Document | None:
            os.unlink(ctx.file_path)
        raise HTTPException(status_code=403, detail=str(e)) from e

-   etl_dispatch = {
-       "UNSTRUCTURED": _etl_unstructured,
-       "LLAMACLOUD": _etl_llamacloud,
-       "DOCLING": _etl_docling,
-   }
-   handler = etl_dispatch.get(app_config.ETL_SERVICE)
-   if handler is None:
-       raise RuntimeError(f"Unknown ETL_SERVICE: {app_config.ETL_SERVICE}")
-
-   return await handler(ctx, page_limit_service, estimated_pages)
+   await _notify(ctx, "parsing", "Extracting content")
+
+   etl_result = await EtlPipelineService().extract(
+       EtlRequest(
+           file_path=ctx.file_path,
+           filename=ctx.filename,
+           estimated_pages=estimated_pages,
+       )
+   )
+
+   with contextlib.suppress(Exception):
+       os.unlink(ctx.file_path)
+
+   await _notify(ctx, "chunking")
+
+   result = await save_file_document(
+       ctx.session,
+       ctx.filename,
+       etl_result.markdown_content,
+       ctx.search_space_id,
+       ctx.user_id,
+       etl_result.etl_service,
+       ctx.connector,
+       enable_summary=ctx.enable_summary,
+   )
+
+   if result:
+       await page_limit_service.update_page_usage(
+           ctx.user_id, estimated_pages, allow_exceed=True
+       )
+       if ctx.connector:
+           await update_document_from_connector(result, ctx.connector, ctx.session)
+       await ctx.task_logger.log_task_success(
+           ctx.log_entry,
+           f"Successfully processed file: {ctx.filename}",
+           {
+               "document_id": result.id,
+               "content_hash": result.content_hash,
+               "file_type": "document",
+               "etl_service": etl_result.etl_service,
+               "pages_processed": estimated_pages,
+           },
+       )
+   else:
+       await ctx.task_logger.log_task_success(
+           ctx.log_entry,
+           f"Document already exists (duplicate): {ctx.filename}",
+           {
+               "duplicate_detected": True,
+               "file_type": "document",
+               "etl_service": etl_result.etl_service,
+           },
+       )
+   return result


 # ===================================================================

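# The rewritten _process_document_upload above funnels every document through
# app.etl_pipeline.EtlPipelineService. A condensed sketch of that call pattern,
# using only the names visible in these hunks (EtlRequest fields file_path,
# filename, estimated_pages and result attributes markdown_content / etl_service);
# anything beyond those attributes is not shown in this diff, and the helper
# name extract_markdown is illustrative:
from app.etl_pipeline.etl_document import EtlRequest
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService

async def extract_markdown(file_path: str, filename: str, estimated_pages: int = 0) -> str:
    result = await EtlPipelineService().extract(
        EtlRequest(file_path=file_path, filename=filename, estimated_pages=estimated_pages)
    )
    return result.markdown_content
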
@ -706,15 +292,16 @@ async def process_file_in_background(
    )

    try:
-       category = classify_file(filename)
+       from app.etl_pipeline.file_classifier import (
+           FileCategory as EtlFileCategory,
+           classify_file as etl_classify,
+       )

-       if category == FileCategory.MARKDOWN:
-           return await _process_markdown_upload(ctx)
-       if category == FileCategory.DIRECT_CONVERT:
-           return await _process_direct_convert_upload(ctx)
-       if category == FileCategory.AUDIO:
-           return await _process_audio_upload(ctx)
-       return await _process_document_upload(ctx)
+       category = etl_classify(filename)
+       if category == EtlFileCategory.DOCUMENT:
+           return await _process_document_upload(ctx)
+       return await _process_non_document_upload(ctx)

    except Exception as e:
        await session.rollback()

@ -758,201 +345,64 @@ async def _extract_file_content(
    Returns:
        Tuple of (markdown_content, etl_service_name).
    """
-   category = classify_file(filename)
+   from app.etl_pipeline.etl_document import EtlRequest
+   from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
+   from app.etl_pipeline.file_classifier import (
+       FileCategory,
+       classify_file as etl_classify,
+   )
+
+   category = etl_classify(filename)
+   estimated_pages = 0

-   if category == FileCategory.MARKDOWN:
    if notification:
-       await NotificationService.document_processing.notify_processing_progress(
-           session,
-           notification,
-           stage="parsing",
-           stage_message="Reading file",
-       )
-       await task_logger.log_task_progress(
-           log_entry,
-           f"Processing markdown/text file: {filename}",
-           {"file_type": "markdown", "processing_stage": "reading_file"},
-       )
-       with open(file_path, encoding="utf-8") as f:
-           content = f.read()
-       with contextlib.suppress(Exception):
-           os.unlink(file_path)
-       return content, "MARKDOWN"
-
-   if category == FileCategory.DIRECT_CONVERT:
-       if notification:
-           await NotificationService.document_processing.notify_processing_progress(
-               session,
-               notification,
-               stage="parsing",
-               stage_message="Converting file",
-           )
-       await task_logger.log_task_progress(
-           log_entry,
-           f"Direct-converting file to markdown: {filename}",
-           {"file_type": "direct_convert", "processing_stage": "converting"},
-       )
-       content = convert_file_directly(file_path, filename)
-       with contextlib.suppress(Exception):
-           os.unlink(file_path)
-       return content, "DIRECT_CONVERT"
-
-   if category == FileCategory.AUDIO:
-       if notification:
-           await NotificationService.document_processing.notify_processing_progress(
-               session,
-               notification,
-               stage="parsing",
-               stage_message="Transcribing audio",
-           )
-       await task_logger.log_task_progress(
-           log_entry,
-           f"Processing audio file for transcription: {filename}",
-           {"file_type": "audio", "processing_stage": "starting_transcription"},
-       )
-       transcribed_text = await _transcribe_audio(file_path, filename)
-       with contextlib.suppress(Exception):
-           os.unlink(file_path)
-       return transcribed_text, "AUDIO_TRANSCRIPTION"
-
-   # Document file — use ETL service
-   return await _extract_document_content(
-       file_path,
-       filename,
-       session,
-       user_id,
-       task_logger,
-       log_entry,
-       notification,
-   )
-
-
-async def _transcribe_audio(file_path: str, filename: str) -> str:
-    """Transcribe an audio file and return formatted markdown text."""
-    stt_service_type = (
-        "local"
-        if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/")
-        else "external"
-    )
-
-    if stt_service_type == "local":
-        from app.services.stt_service import stt_service
-
-        result = stt_service.transcribe_file(file_path)
-        text = result.get("text", "")
-        if not text:
-            raise ValueError("Transcription returned empty text")
-    else:
-        from litellm import atranscription
-
-        with open(file_path, "rb") as audio_file:
-            kwargs: dict = {
-                "model": app_config.STT_SERVICE,
-                "file": audio_file,
-                "api_key": app_config.STT_SERVICE_API_KEY,
+       stage_messages = {
+           FileCategory.PLAINTEXT: "Reading file",
+           FileCategory.DIRECT_CONVERT: "Converting file",
+           FileCategory.AUDIO: "Transcribing audio",
+           FileCategory.UNSUPPORTED: "Unsupported file type",
+           FileCategory.DOCUMENT: "Extracting content",
        }
-           if app_config.STT_SERVICE_API_BASE:
-               kwargs["api_base"] = app_config.STT_SERVICE_API_BASE
-           response = await atranscription(**kwargs)
-           text = response.get("text", "")
-           if not text:
-               raise ValueError("Transcription returned empty text")
+       await NotificationService.document_processing.notify_processing_progress(
+           session,
+           notification,
+           stage="parsing",
+           stage_message=stage_messages.get(category, "Processing"),
+       )

-   return f"# Transcription of {filename}\n\n{text}"
+   await task_logger.log_task_progress(
+       log_entry,
+       f"Processing {category.value} file: {filename}",
+       {"file_type": category.value, "processing_stage": "extracting"},
+   )

+   if category == FileCategory.DOCUMENT:
-async def _extract_document_content(
-    file_path: str,
-    filename: str,
-    session: AsyncSession,
-    user_id: str,
-    task_logger: TaskLoggingService,
-    log_entry: Log,
-    notification: Notification | None,
-) -> tuple[str, str]:
-    """
-    Parse a document file via the configured ETL service.
-
-    Returns:
-        Tuple of (markdown_content, etl_service_name).
-    """
        from app.services.page_limit_service import PageLimitService

        page_limit_service = PageLimitService(session)
+       estimated_pages = _estimate_pages_safe(page_limit_service, file_path)
-   try:
-       estimated_pages = page_limit_service.estimate_pages_before_processing(file_path)
-   except Exception:
-       file_size = os.path.getsize(file_path)
-       estimated_pages = max(1, file_size // (80 * 1024))
-
        await page_limit_service.check_page_limit(user_id, estimated_pages)

-   etl_service = app_config.ETL_SERVICE
-   markdown_content: str | None = None
-
-   if notification:
-       await NotificationService.document_processing.notify_processing_progress(
-           session,
-           notification,
-           stage="parsing",
-           stage_message="Extracting content",
-       )
-
-   if etl_service == "UNSTRUCTURED":
-       from app.utils.document_converters import convert_document_to_markdown
-
-       docs = await parse_with_unstructured(file_path)
-       markdown_content = await convert_document_to_markdown(docs)
-       actual_pages = page_limit_service.estimate_pages_from_elements(docs)
-       final_pages = max(estimated_pages, actual_pages)
-       await page_limit_service.update_page_usage(
-           user_id, final_pages, allow_exceed=True
-       )
-
-   elif etl_service == "LLAMACLOUD":
-       raw_result = await parse_with_llamacloud_retry(
+   result = await EtlPipelineService().extract(
+       EtlRequest(
            file_path=file_path,
+           filename=filename,
            estimated_pages=estimated_pages,
-           task_logger=task_logger,
-           log_entry=log_entry,
        )
-       markdown_documents = await raw_result.aget_markdown_documents(
-           split_by_page=False
        )
-       if not markdown_documents:
-           raise RuntimeError(f"LlamaCloud parsing returned no documents: {filename}")
+   if category == FileCategory.DOCUMENT:
-       markdown_content = markdown_documents[0].text
        await page_limit_service.update_page_usage(
            user_id, estimated_pages, allow_exceed=True
        )

-   elif etl_service == "DOCLING":
-       getLogger("docling.pipeline.base_pipeline").setLevel(ERROR)
-       getLogger("docling.document_converter").setLevel(ERROR)
-       getLogger("docling_core.transforms.chunker.hierarchical_chunker").setLevel(
-           ERROR
-       )
-
-       from docling.document_converter import DocumentConverter
-
-       converter = DocumentConverter()
-       result = converter.convert(file_path)
-       markdown_content = result.document.export_to_markdown()
-       await page_limit_service.update_page_usage(
-           user_id, estimated_pages, allow_exceed=True
-       )
-
-   else:
-       raise RuntimeError(f"Unknown ETL_SERVICE: {etl_service}")
-
    with contextlib.suppress(Exception):
        os.unlink(file_path)

-   if not markdown_content:
+   if not result.markdown_content:
        raise RuntimeError(f"Failed to extract content from file: {filename}")

-   return markdown_content, etl_service
+   return result.markdown_content, result.etl_service


 async def process_file_in_background_with_document(

124
surfsense_backend/app/utils/file_extensions.py
Normal file
124
surfsense_backend/app/utils/file_extensions.py
Normal file
|
|
@ -0,0 +1,124 @@
|
||||||
|
"""Per-parser document extension sets for the ETL pipeline.
|
||||||
|
|
||||||
|
Every consumer (file_classifier, connector-level skip checks, ETL pipeline
|
||||||
|
validation) imports from here so there is a single source of truth.
|
||||||
|
|
||||||
|
Extensions already covered by PLAINTEXT_EXTENSIONS, AUDIO_EXTENSIONS, or
|
||||||
|
DIRECT_CONVERT_EXTENSIONS in file_classifier are NOT repeated here -- these
|
||||||
|
sets are exclusively for the "document" ETL path (Docling / LlamaParse /
|
||||||
|
Unstructured).
|
||||||
|
"""
|
||||||
|
|
||||||
|
from pathlib import PurePosixPath
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Per-parser document extension sets (from official documentation)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
DOCLING_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset(
|
||||||
|
{
|
||||||
|
".pdf",
|
||||||
|
".docx",
|
||||||
|
".xlsx",
|
||||||
|
".pptx",
|
||||||
|
".png",
|
||||||
|
".jpg",
|
||||||
|
".jpeg",
|
||||||
|
".tiff",
|
||||||
|
".tif",
|
||||||
|
".bmp",
|
||||||
|
".webp",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
LLAMAPARSE_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset(
|
||||||
|
{
|
||||||
|
".pdf",
|
||||||
|
".docx",
|
||||||
|
".doc",
|
||||||
|
".xlsx",
|
||||||
|
".xls",
|
||||||
|
".pptx",
|
||||||
|
".ppt",
|
||||||
|
".docm",
|
||||||
|
".dot",
|
||||||
|
".dotm",
|
||||||
|
".pptm",
|
||||||
|
".pot",
|
||||||
|
".potx",
|
||||||
|
".xlsm",
|
||||||
|
".xlsb",
|
||||||
|
".xlw",
|
||||||
|
".rtf",
|
||||||
|
".epub",
|
||||||
|
".png",
|
||||||
|
".jpg",
|
||||||
|
".jpeg",
|
||||||
|
".gif",
|
||||||
|
".bmp",
|
||||||
|
".tiff",
|
||||||
|
".tif",
|
||||||
|
".webp",
|
||||||
|
".svg",
|
||||||
|
".odt",
|
||||||
|
".ods",
|
||||||
|
".odp",
|
||||||
|
".hwp",
|
||||||
|
".hwpx",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
UNSTRUCTURED_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset(
|
||||||
|
{
|
||||||
|
".pdf",
|
||||||
|
".docx",
|
||||||
|
".doc",
|
||||||
|
".xlsx",
|
||||||
|
".xls",
|
||||||
|
".pptx",
|
||||||
|
".ppt",
|
||||||
|
".png",
|
||||||
|
".jpg",
|
||||||
|
".jpeg",
|
||||||
|
".bmp",
|
||||||
|
".tiff",
|
||||||
|
".tif",
|
||||||
|
".heic",
|
||||||
|
".rtf",
|
||||||
|
".epub",
|
||||||
|
".odt",
|
||||||
|
".eml",
|
||||||
|
".msg",
|
||||||
|
".p7s",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Union (used by classify_file for routing) + service lookup
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
DOCUMENT_EXTENSIONS: frozenset[str] = (
|
||||||
|
DOCLING_DOCUMENT_EXTENSIONS
|
||||||
|
| LLAMAPARSE_DOCUMENT_EXTENSIONS
|
||||||
|
| UNSTRUCTURED_DOCUMENT_EXTENSIONS
|
||||||
|
)
|
||||||
|
|
||||||
|
_SERVICE_MAP: dict[str, frozenset[str]] = {
|
||||||
|
"DOCLING": DOCLING_DOCUMENT_EXTENSIONS,
|
||||||
|
"LLAMACLOUD": LLAMAPARSE_DOCUMENT_EXTENSIONS,
|
||||||
|
"UNSTRUCTURED": UNSTRUCTURED_DOCUMENT_EXTENSIONS,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def get_document_extensions_for_service(etl_service: str | None) -> frozenset[str]:
|
||||||
|
"""Return the document extensions supported by *etl_service*.
|
||||||
|
|
||||||
|
Falls back to the full union when the service is ``None`` or unknown.
|
||||||
|
"""
|
||||||
|
return _SERVICE_MAP.get(etl_service or "", DOCUMENT_EXTENSIONS)
|
||||||
|
|
||||||
|
|
||||||
|
def is_supported_document_extension(filename: str) -> bool:
|
||||||
|
"""Return True if the file's extension is in the supported document set."""
|
||||||
|
suffix = PurePosixPath(filename).suffix.lower()
|
||||||
|
return suffix in DOCUMENT_EXTENSIONS
|
||||||
|
|
@ -319,31 +319,23 @@ def _mock_etl_parsing(monkeypatch):
|
||||||
|
|
||||||
# -- LlamaParse mock (external API) --------------------------------
|
# -- LlamaParse mock (external API) --------------------------------
|
||||||
|
|
||||||
class _FakeMarkdownDoc:
|
async def _fake_llamacloud_parse(file_path: str, estimated_pages: int) -> str:
|
||||||
def __init__(self, text: str):
|
_reject_empty(file_path)
|
||||||
self.text = text
|
return _MOCK_ETL_MARKDOWN
|
||||||
|
|
||||||
class _FakeLlamaParseResult:
|
|
||||||
async def aget_markdown_documents(self, *, split_by_page=False):
|
|
||||||
return [_FakeMarkdownDoc(_MOCK_ETL_MARKDOWN)]
|
|
||||||
|
|
||||||
async def _fake_llamacloud_parse(**kwargs):
|
|
||||||
_reject_empty(kwargs["file_path"])
|
|
||||||
return _FakeLlamaParseResult()
|
|
||||||
|
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
"app.tasks.document_processors.file_processors.parse_with_llamacloud_retry",
|
"app.etl_pipeline.parsers.llamacloud.parse_with_llamacloud",
|
||||||
_fake_llamacloud_parse,
|
_fake_llamacloud_parse,
|
||||||
)
|
)
|
||||||
|
|
||||||
# -- Docling mock (heavy library boundary) -------------------------
|
# -- Docling mock (heavy library boundary) -------------------------
|
||||||
|
|
||||||
async def _fake_docling_parse(file_path: str, filename: str):
|
async def _fake_docling_parse(file_path: str, filename: str) -> str:
|
||||||
_reject_empty(file_path)
|
_reject_empty(file_path)
|
||||||
return _MOCK_ETL_MARKDOWN
|
return _MOCK_ETL_MARKDOWN
|
||||||
|
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
"app.tasks.document_processors.file_processors.parse_with_docling",
|
"app.etl_pipeline.parsers.docling.parse_with_docling",
|
||||||
_fake_docling_parse,
|
_fake_docling_parse,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -124,7 +124,7 @@ async def test_composio_connector_without_account_id_returns_error(
|
||||||
|
|
||||||
maker = make_session_factory(async_engine)
|
maker = make_session_factory(async_engine)
|
||||||
async with maker() as session:
|
async with maker() as session:
|
||||||
count, _skipped, error = await index_google_drive_files(
|
count, _skipped, error, _unsupported = await index_google_drive_files(
|
||||||
session=session,
|
session=session,
|
||||||
connector_id=data["connector_id"],
|
connector_id=data["connector_id"],
|
||||||
search_space_id=data["search_space_id"],
|
search_space_id=data["search_space_id"],
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,244 @@
|
||||||
|
"""Tests that each cloud connector's download_and_extract_content correctly
|
||||||
|
produces markdown from a real file via the unified ETL pipeline.
|
||||||
|
|
||||||
|
Only the cloud client is mocked (system boundary). The ETL pipeline runs for
|
||||||
|
real so we know the full path from "cloud gives us bytes" to "we get markdown
|
||||||
|
back" actually works.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from unittest.mock import AsyncMock, MagicMock
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
pytestmark = pytest.mark.unit
|
||||||
|
|
||||||
|
_TXT_CONTENT = "Hello from the cloud connector test."
|
||||||
|
_CSV_CONTENT = "name,age\nAlice,30\nBob,25\n"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
async def _write_file(dest_path: str, content: str) -> None:
|
||||||
|
"""Simulate a cloud client writing downloaded bytes to disk."""
|
||||||
|
with open(dest_path, "w", encoding="utf-8") as f:
|
||||||
|
f.write(content)
|
||||||
|
|
||||||
|
|
||||||
|
def _make_download_side_effect(content: str):
|
||||||
|
"""Return an async side-effect that writes *content* to the dest path
|
||||||
|
and returns ``None`` (success)."""
|
||||||
|
|
||||||
|
async def _side_effect(*args):
|
||||||
|
dest_path = args[-1]
|
||||||
|
await _write_file(dest_path, content)
|
||||||
|
return None
|
||||||
|
|
||||||
|
return _side_effect
|
||||||
|
|
||||||
|
|
||||||
|
# ===================================================================
|
||||||
|
# Google Drive
|
||||||
|
# ===================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class TestGoogleDriveContentExtraction:
|
||||||
|
async def test_txt_file_returns_markdown(self):
|
||||||
|
from app.connectors.google_drive.content_extractor import (
|
||||||
|
download_and_extract_content,
|
||||||
|
)
|
||||||
|
|
||||||
|
client = MagicMock()
|
||||||
|
client.download_file_to_disk = AsyncMock(
|
||||||
|
side_effect=_make_download_side_effect(_TXT_CONTENT),
|
||||||
|
)
|
||||||
|
|
||||||
|
file = {"id": "f1", "name": "notes.txt", "mimeType": "text/plain"}
|
||||||
|
|
||||||
|
markdown, metadata, error = await download_and_extract_content(client, file)
|
||||||
|
|
||||||
|
assert error is None
|
||||||
|
assert _TXT_CONTENT in markdown
|
||||||
|
assert metadata["google_drive_file_id"] == "f1"
|
||||||
|
assert metadata["google_drive_file_name"] == "notes.txt"
|
||||||
|
|
||||||
|
async def test_csv_file_returns_markdown_table(self):
|
||||||
|
from app.connectors.google_drive.content_extractor import (
|
||||||
|
download_and_extract_content,
|
||||||
|
)
|
||||||
|
|
||||||
|
client = MagicMock()
|
||||||
|
client.download_file_to_disk = AsyncMock(
|
||||||
|
side_effect=_make_download_side_effect(_CSV_CONTENT),
|
||||||
|
)
|
||||||
|
|
||||||
|
file = {"id": "f2", "name": "data.csv", "mimeType": "text/csv"}
|
||||||
|
|
||||||
|
markdown, _metadata, error = await download_and_extract_content(client, file)
|
||||||
|
|
||||||
|
assert error is None
|
||||||
|
assert "Alice" in markdown
|
||||||
|
assert "Bob" in markdown
|
||||||
|
assert "|" in markdown
|
||||||
|
|
||||||
|
async def test_download_error_returns_error_message(self):
|
||||||
|
from app.connectors.google_drive.content_extractor import (
|
||||||
|
download_and_extract_content,
|
||||||
|
)
|
||||||
|
|
||||||
|
client = MagicMock()
|
||||||
|
client.download_file_to_disk = AsyncMock(return_value="Network timeout")
|
||||||
|
|
||||||
|
file = {"id": "f3", "name": "doc.txt", "mimeType": "text/plain"}
|
||||||
|
|
||||||
|
markdown, _metadata, error = await download_and_extract_content(client, file)
|
||||||
|
|
||||||
|
assert markdown is None
|
||||||
|
assert error == "Network timeout"
|
||||||
|
|
||||||
|
|
||||||
|
# ===================================================================
|
||||||
|
# OneDrive
|
||||||
|
# ===================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class TestOneDriveContentExtraction:
|
||||||
|
async def test_txt_file_returns_markdown(self):
|
||||||
|
from app.connectors.onedrive.content_extractor import (
|
||||||
|
download_and_extract_content,
|
||||||
|
)
|
||||||
|
|
||||||
|
client = MagicMock()
|
||||||
|
client.download_file_to_disk = AsyncMock(
|
||||||
|
side_effect=_make_download_side_effect(_TXT_CONTENT),
|
||||||
|
)
|
||||||
|
|
||||||
|
file = {
|
||||||
|
"id": "od-1",
|
||||||
|
"name": "report.txt",
|
||||||
|
"file": {"mimeType": "text/plain"},
|
||||||
|
}
|
||||||
|
|
||||||
|
markdown, metadata, error = await download_and_extract_content(client, file)
|
||||||
|
|
||||||
|
assert error is None
|
||||||
|
assert _TXT_CONTENT in markdown
|
||||||
|
assert metadata["onedrive_file_id"] == "od-1"
|
||||||
|
assert metadata["onedrive_file_name"] == "report.txt"
|
||||||
|
|
||||||
|
async def test_csv_file_returns_markdown_table(self):
|
||||||
|
from app.connectors.onedrive.content_extractor import (
|
||||||
|
download_and_extract_content,
|
||||||
|
)
|
||||||
|
|
||||||
|
client = MagicMock()
|
||||||
|
client.download_file_to_disk = AsyncMock(
|
||||||
|
side_effect=_make_download_side_effect(_CSV_CONTENT),
|
||||||
|
)
|
||||||
|
|
||||||
|
file = {
|
||||||
|
"id": "od-2",
|
||||||
|
"name": "data.csv",
|
||||||
|
"file": {"mimeType": "text/csv"},
|
||||||
|
}
|
||||||
|
|
||||||
|
markdown, _metadata, error = await download_and_extract_content(client, file)
|
||||||
|
|
||||||
|
assert error is None
|
||||||
|
assert "Alice" in markdown
|
||||||
|
assert "|" in markdown
|
||||||
|
|
||||||
|
async def test_download_error_returns_error_message(self):
|
||||||
|
from app.connectors.onedrive.content_extractor import (
|
||||||
|
download_and_extract_content,
|
||||||
|
)
|
||||||
|
|
||||||
|
client = MagicMock()
|
||||||
|
client.download_file_to_disk = AsyncMock(return_value="403 Forbidden")
|
||||||
|
|
||||||
|
file = {
|
||||||
|
"id": "od-3",
|
||||||
|
"name": "secret.txt",
|
||||||
|
"file": {"mimeType": "text/plain"},
|
||||||
|
}
|
||||||
|
|
||||||
|
markdown, _metadata, error = await download_and_extract_content(client, file)
|
||||||
|
|
||||||
|
assert markdown is None
|
||||||
|
assert error == "403 Forbidden"
|
||||||
|
|
||||||
|
|
||||||
|
# ===================================================================
|
||||||
|
# Dropbox
|
||||||
|
# ===================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class TestDropboxContentExtraction:
|
||||||
|
async def test_txt_file_returns_markdown(self):
|
||||||
|
from app.connectors.dropbox.content_extractor import (
|
||||||
|
download_and_extract_content,
|
||||||
|
)
|
||||||
|
|
||||||
|
client = MagicMock()
|
||||||
|
client.download_file_to_disk = AsyncMock(
|
||||||
|
side_effect=_make_download_side_effect(_TXT_CONTENT),
|
||||||
|
)
|
||||||
|
|
||||||
|
file = {
|
||||||
|
"id": "dbx-1",
|
||||||
|
"name": "memo.txt",
|
||||||
|
".tag": "file",
|
||||||
|
"path_lower": "/memo.txt",
|
||||||
|
}
|
||||||
|
|
||||||
|
markdown, metadata, error = await download_and_extract_content(client, file)
|
||||||
|
|
||||||
|
assert error is None
|
||||||
|
assert _TXT_CONTENT in markdown
|
||||||
|
assert metadata["dropbox_file_id"] == "dbx-1"
|
||||||
|
assert metadata["dropbox_file_name"] == "memo.txt"
|
||||||
|
|
||||||
|
async def test_csv_file_returns_markdown_table(self):
|
||||||
|
from app.connectors.dropbox.content_extractor import (
|
||||||
|
download_and_extract_content,
|
||||||
|
)
|
||||||
|
|
||||||
|
client = MagicMock()
|
||||||
|
client.download_file_to_disk = AsyncMock(
|
||||||
|
side_effect=_make_download_side_effect(_CSV_CONTENT),
|
||||||
|
)
|
||||||
|
|
||||||
|
file = {
|
||||||
|
"id": "dbx-2",
|
||||||
|
"name": "data.csv",
|
||||||
|
".tag": "file",
|
||||||
|
"path_lower": "/data.csv",
|
||||||
|
}
|
||||||
|
|
||||||
|
markdown, _metadata, error = await download_and_extract_content(client, file)
|
||||||
|
|
||||||
|
assert error is None
|
||||||
|
assert "Alice" in markdown
|
||||||
|
assert "|" in markdown
|
||||||
|
|
||||||
|
async def test_download_error_returns_error_message(self):
|
||||||
|
from app.connectors.dropbox.content_extractor import (
|
||||||
|
download_and_extract_content,
|
||||||
|
)
|
||||||
|
|
||||||
|
client = MagicMock()
|
||||||
|
client.download_file_to_disk = AsyncMock(return_value="Rate limited")
|
||||||
|
|
||||||
|
file = {
|
||||||
|
"id": "dbx-3",
|
||||||
|
"name": "big.txt",
|
||||||
|
".tag": "file",
|
||||||
|
"path_lower": "/big.txt",
|
||||||
|
}
|
||||||
|
|
||||||
|
markdown, _metadata, error = await download_and_extract_content(client, file)
|
||||||
|
|
||||||
|
assert markdown is None
|
||||||
|
assert error == "Rate limited"
|
||||||
|
|
@ -8,6 +8,10 @@ import pytest
|
||||||
from app.db import DocumentType
|
from app.db import DocumentType
|
||||||
from app.tasks.connector_indexers.dropbox_indexer import (
|
from app.tasks.connector_indexers.dropbox_indexer import (
|
||||||
_download_files_parallel,
|
_download_files_parallel,
|
||||||
|
_index_full_scan,
|
||||||
|
_index_selected_files,
|
||||||
|
_index_with_delta_sync,
|
||||||
|
index_dropbox_files,
|
||||||
)
|
)
|
||||||
|
|
||||||
pytestmark = pytest.mark.unit
|
pytestmark = pytest.mark.unit
|
||||||
|
|
@ -234,3 +238,610 @@ async def test_heartbeat_fires_during_parallel_downloads(
|
||||||
assert len(docs) == 3
|
assert len(docs) == 3
|
||||||
assert failed == 0
|
assert failed == 0
|
||||||
assert len(heartbeat_calls) >= 1, "Heartbeat should have fired at least once"
|
assert len(heartbeat_calls) >= 1, "Heartbeat should have fired at least once"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# D1-D2: _index_full_scan tests
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _folder_dict(name: str) -> dict:
|
||||||
|
return {".tag": "folder", "name": name}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def full_scan_mocks(mock_dropbox_client, monkeypatch):
|
||||||
|
"""Wire up mocks for _index_full_scan in isolation."""
|
||||||
|
import app.tasks.connector_indexers.dropbox_indexer as _mod
|
||||||
|
|
||||||
|
mock_session = AsyncMock()
|
||||||
|
mock_task_logger = MagicMock()
|
||||||
|
mock_task_logger.log_task_progress = AsyncMock()
|
||||||
|
mock_log_entry = MagicMock()
|
||||||
|
|
||||||
|
skip_results: dict[str, tuple[bool, str | None]] = {}
|
||||||
|
|
||||||
|
monkeypatch.setattr("app.config.config.ETL_SERVICE", "LLAMACLOUD")
|
||||||
|
|
||||||
|
async def _fake_skip(session, file, search_space_id):
|
||||||
|
from app.connectors.dropbox.file_types import should_skip_file as _skip
|
||||||
|
|
||||||
|
item_skip, unsup_ext = _skip(file)
|
||||||
|
if item_skip:
|
||||||
|
if unsup_ext:
|
||||||
|
return True, f"unsupported:{unsup_ext}"
|
||||||
|
return True, "folder/non-downloadable"
|
||||||
|
return skip_results.get(file.get("id", ""), (False, None))
|
||||||
|
|
||||||
|
monkeypatch.setattr(_mod, "_should_skip_file", _fake_skip)
|
||||||
|
|
||||||
|
download_and_index_mock = AsyncMock(return_value=(0, 0))
|
||||||
|
monkeypatch.setattr(_mod, "_download_and_index", download_and_index_mock)
|
||||||
|
|
||||||
|
from app.services.page_limit_service import PageLimitService as _RealPLS
|
||||||
|
|
||||||
|
mock_page_limit_instance = MagicMock()
|
||||||
|
mock_page_limit_instance.get_page_usage = AsyncMock(return_value=(0, 999_999))
|
||||||
|
mock_page_limit_instance.update_page_usage = AsyncMock()
|
||||||
|
|
||||||
|
class _MockPageLimitService:
|
||||||
|
estimate_pages_from_metadata = staticmethod(
|
||||||
|
_RealPLS.estimate_pages_from_metadata
|
||||||
|
)
|
||||||
|
|
||||||
|
def __init__(self, session):
|
||||||
|
self.get_page_usage = mock_page_limit_instance.get_page_usage
|
||||||
|
self.update_page_usage = mock_page_limit_instance.update_page_usage
|
||||||
|
|
||||||
|
monkeypatch.setattr(_mod, "PageLimitService", _MockPageLimitService)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"dropbox_client": mock_dropbox_client,
|
||||||
|
"session": mock_session,
|
||||||
|
"task_logger": mock_task_logger,
|
||||||
|
"log_entry": mock_log_entry,
|
||||||
|
"skip_results": skip_results,
|
||||||
|
"download_and_index_mock": download_and_index_mock,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
async def _run_full_scan(mocks, monkeypatch, page_files, *, max_files=500):
|
||||||
|
import app.tasks.connector_indexers.dropbox_indexer as _mod
|
||||||
|
|
||||||
|
monkeypatch.setattr(
|
||||||
|
_mod,
|
||||||
|
"get_files_in_folder",
|
||||||
|
AsyncMock(return_value=(page_files, None)),
|
||||||
|
)
|
||||||
|
return await _index_full_scan(
|
||||||
|
mocks["dropbox_client"],
|
||||||
|
mocks["session"],
|
||||||
|
_CONNECTOR_ID,
|
||||||
|
_SEARCH_SPACE_ID,
|
||||||
|
_USER_ID,
|
||||||
|
"",
|
||||||
|
"Root",
|
||||||
|
mocks["task_logger"],
|
||||||
|
mocks["log_entry"],
|
||||||
|
max_files,
|
||||||
|
enable_summary=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def test_full_scan_three_phase_counts(full_scan_mocks, monkeypatch):
|
||||||
|
"""Skipped files excluded, renames counted as indexed, new files downloaded."""
|
||||||
|
page_files = [
|
||||||
|
_folder_dict("SubFolder"),
|
||||||
|
_make_file_dict("skip1", "unchanged.txt"),
|
||||||
|
_make_file_dict("rename1", "renamed.txt"),
|
||||||
|
_make_file_dict("new1", "new1.txt"),
|
||||||
|
_make_file_dict("new2", "new2.txt"),
|
||||||
|
]
|
||||||
|
|
||||||
|
full_scan_mocks["skip_results"]["skip1"] = (True, "unchanged")
|
||||||
|
full_scan_mocks["skip_results"]["rename1"] = (
|
||||||
|
True,
|
||||||
|
"File renamed: 'old' -> 'renamed.txt'",
|
||||||
|
)
|
||||||
|
|
||||||
|
full_scan_mocks["download_and_index_mock"].return_value = (2, 0)
|
||||||
|
|
||||||
|
indexed, skipped, _unsupported = await _run_full_scan(
|
||||||
|
full_scan_mocks, monkeypatch, page_files
|
||||||
|
)
|
||||||
|
|
||||||
|
assert indexed == 3 # 1 renamed + 2 from batch
|
||||||
|
assert skipped == 2 # 1 folder + 1 unchanged
|
||||||
|
|
||||||
|
call_args = full_scan_mocks["download_and_index_mock"].call_args
|
||||||
|
call_files = call_args[0][2]
|
||||||
|
assert len(call_files) == 2
|
||||||
|
assert {f["id"] for f in call_files} == {"new1", "new2"}
|
||||||
|
|
||||||
|
|
||||||
|
async def test_full_scan_respects_max_files(full_scan_mocks, monkeypatch):
|
||||||
|
"""Only max_files non-folder items are considered."""
|
||||||
|
page_files = [_make_file_dict(f"f{i}", f"file{i}.txt") for i in range(10)]
|
||||||
|
|
||||||
|
full_scan_mocks["download_and_index_mock"].return_value = (3, 0)
|
||||||
|
|
||||||
|
await _run_full_scan(full_scan_mocks, monkeypatch, page_files, max_files=3)
|
||||||
|
|
||||||
|
call_files = full_scan_mocks["download_and_index_mock"].call_args[0][2]
|
||||||
|
assert len(call_files) == 3
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# D3-D5: _index_selected_files tests
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def selected_files_mocks(mock_dropbox_client, monkeypatch):
|
||||||
|
"""Wire up mocks for _index_selected_files tests."""
|
||||||
|
import app.tasks.connector_indexers.dropbox_indexer as _mod
|
||||||
|
|
||||||
|
mock_session = AsyncMock()
|
||||||
|
|
||||||
|
get_file_results: dict[str, tuple[dict | None, str | None]] = {}
|
||||||
|
|
||||||
|
async def _fake_get_file(client, path):
|
||||||
|
return get_file_results.get(path, (None, f"Not configured: {path}"))
|
||||||
|
|
||||||
|
monkeypatch.setattr(_mod, "get_file_by_path", _fake_get_file)
|
||||||
|
|
||||||
|
skip_results: dict[str, tuple[bool, str | None]] = {}
|
||||||
|
|
||||||
|
async def _fake_skip(session, file, search_space_id):
|
||||||
|
return skip_results.get(file["id"], (False, None))
|
||||||
|
|
||||||
|
monkeypatch.setattr(_mod, "_should_skip_file", _fake_skip)
|
||||||
|
|
||||||
|
download_and_index_mock = AsyncMock(return_value=(0, 0))
|
||||||
|
monkeypatch.setattr(_mod, "_download_and_index", download_and_index_mock)
|
||||||
|
|
||||||
|
from app.services.page_limit_service import PageLimitService as _RealPLS
|
||||||
|
|
||||||
|
mock_page_limit_instance = MagicMock()
|
||||||
|
mock_page_limit_instance.get_page_usage = AsyncMock(return_value=(0, 999_999))
|
||||||
|
mock_page_limit_instance.update_page_usage = AsyncMock()
|
||||||
|
|
||||||
|
class _MockPageLimitService:
|
||||||
|
estimate_pages_from_metadata = staticmethod(
|
||||||
|
_RealPLS.estimate_pages_from_metadata
|
||||||
|
)
|
||||||
|
|
||||||
|
def __init__(self, session):
|
||||||
|
self.get_page_usage = mock_page_limit_instance.get_page_usage
|
||||||
|
self.update_page_usage = mock_page_limit_instance.update_page_usage
|
||||||
|
|
||||||
|
monkeypatch.setattr(_mod, "PageLimitService", _MockPageLimitService)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"dropbox_client": mock_dropbox_client,
|
||||||
|
"session": mock_session,
|
||||||
|
"get_file_results": get_file_results,
|
||||||
|
"skip_results": skip_results,
|
||||||
|
"download_and_index_mock": download_and_index_mock,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
async def _run_selected(mocks, file_tuples):
|
||||||
|
return await _index_selected_files(
|
||||||
|
mocks["dropbox_client"],
|
||||||
|
mocks["session"],
|
||||||
|
file_tuples,
|
||||||
|
connector_id=_CONNECTOR_ID,
|
||||||
|
search_space_id=_SEARCH_SPACE_ID,
|
||||||
|
user_id=_USER_ID,
|
||||||
|
enable_summary=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def test_selected_files_single_file_indexed(selected_files_mocks):
|
||||||
|
selected_files_mocks["get_file_results"]["/report.pdf"] = (
|
||||||
|
_make_file_dict("f1", "report.pdf"),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
selected_files_mocks["download_and_index_mock"].return_value = (1, 0)
|
||||||
|
|
||||||
|
indexed, skipped, _unsupported, errors = await _run_selected(
|
||||||
|
selected_files_mocks,
|
||||||
|
[("/report.pdf", "report.pdf")],
|
||||||
|
)
|
||||||
|
|
||||||
|
assert indexed == 1
|
||||||
|
assert skipped == 0
|
||||||
|
assert errors == []
|
||||||
|
|
||||||
|
|
||||||
|
async def test_selected_files_fetch_failure_isolation(selected_files_mocks):
|
||||||
|
selected_files_mocks["get_file_results"]["/first.txt"] = (
|
||||||
|
_make_file_dict("f1", "first.txt"),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
selected_files_mocks["get_file_results"]["/mid.txt"] = (None, "HTTP 404")
|
||||||
|
selected_files_mocks["get_file_results"]["/third.txt"] = (
|
||||||
|
_make_file_dict("f3", "third.txt"),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
selected_files_mocks["download_and_index_mock"].return_value = (2, 0)
|
||||||
|
|
||||||
|
indexed, skipped, _unsupported, errors = await _run_selected(
|
||||||
|
selected_files_mocks,
|
||||||
|
[
|
||||||
|
("/first.txt", "first.txt"),
|
||||||
|
("/mid.txt", "mid.txt"),
|
||||||
|
("/third.txt", "third.txt"),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
assert indexed == 2
|
||||||
|
assert skipped == 0
|
||||||
|
assert len(errors) == 1
|
||||||
|
assert "mid.txt" in errors[0]
|
||||||
|
|
||||||
|
|
||||||
|
async def test_selected_files_skip_rename_counting(selected_files_mocks):
|
||||||
|
for path, fid, fname in [
|
||||||
|
("/unchanged.txt", "s1", "unchanged.txt"),
|
||||||
|
("/renamed.txt", "r1", "renamed.txt"),
|
||||||
|
("/new1.txt", "n1", "new1.txt"),
|
||||||
|
("/new2.txt", "n2", "new2.txt"),
|
||||||
|
]:
|
||||||
|
selected_files_mocks["get_file_results"][path] = (
|
||||||
|
_make_file_dict(fid, fname),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
|
||||||
|
selected_files_mocks["skip_results"]["s1"] = (True, "unchanged")
|
||||||
|
selected_files_mocks["skip_results"]["r1"] = (
|
||||||
|
True,
|
||||||
|
"File renamed: 'old' -> 'renamed.txt'",
|
||||||
|
)
|
||||||
|
selected_files_mocks["download_and_index_mock"].return_value = (2, 0)
|
||||||
|
|
||||||
|
indexed, skipped, _unsupported, errors = await _run_selected(
|
||||||
|
selected_files_mocks,
|
||||||
|
[
|
||||||
|
("/unchanged.txt", "unchanged.txt"),
|
||||||
|
("/renamed.txt", "renamed.txt"),
|
||||||
|
("/new1.txt", "new1.txt"),
|
||||||
|
("/new2.txt", "new2.txt"),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
assert indexed == 3 # 1 renamed + 2 batch
|
||||||
|
assert skipped == 1
|
||||||
|
assert errors == []
|
||||||
|
|
||||||
|
mock = selected_files_mocks["download_and_index_mock"]
|
||||||
|
call_files = mock.call_args[0][2]
|
||||||
|
assert len(call_files) == 2
|
||||||
|
assert {f["id"] for f in call_files} == {"n1", "n2"}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# E1-E4: _index_with_delta_sync tests
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
async def test_delta_sync_deletions_call_remove_document(monkeypatch):
|
||||||
|
"""E1: deleted entries are processed via _remove_document."""
|
||||||
|
import app.tasks.connector_indexers.dropbox_indexer as _mod
|
||||||
|
|
||||||
|
entries = [
|
||||||
|
{
|
||||||
|
".tag": "deleted",
|
||||||
|
"name": "gone.txt",
|
||||||
|
"path_lower": "/gone.txt",
|
||||||
|
"id": "id:del1",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
".tag": "deleted",
|
||||||
|
"name": "also_gone.pdf",
|
||||||
|
"path_lower": "/also_gone.pdf",
|
||||||
|
"id": "id:del2",
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
mock_client = MagicMock()
|
||||||
|
mock_client.get_changes = AsyncMock(return_value=(entries, "new-cursor", None))
|
||||||
|
|
||||||
|
remove_calls: list[str] = []
|
||||||
|
|
||||||
|
async def _fake_remove(session, file_id, search_space_id):
|
||||||
|
remove_calls.append(file_id)
|
||||||
|
|
||||||
|
monkeypatch.setattr(_mod, "_remove_document", _fake_remove)
|
||||||
|
monkeypatch.setattr(_mod, "_download_and_index", AsyncMock(return_value=(0, 0)))
|
||||||
|
|
||||||
|
mock_task_logger = MagicMock()
|
||||||
|
mock_task_logger.log_task_progress = AsyncMock()
|
||||||
|
|
||||||
|
_indexed, _skipped, _unsupported, cursor = await _index_with_delta_sync(
|
||||||
|
mock_client,
|
||||||
|
AsyncMock(),
|
||||||
|
_CONNECTOR_ID,
|
||||||
|
_SEARCH_SPACE_ID,
|
||||||
|
_USER_ID,
|
||||||
|
"old-cursor",
|
||||||
|
mock_task_logger,
|
||||||
|
MagicMock(),
|
||||||
|
max_files=500,
|
||||||
|
enable_summary=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert sorted(remove_calls) == ["id:del1", "id:del2"]
|
||||||
|
assert cursor == "new-cursor"
|
||||||
|
|
||||||
|
|
||||||
|
async def test_delta_sync_upserts_filtered_and_downloaded(monkeypatch):
|
||||||
|
"""E2: modified/new file entries go through skip filter then download+index."""
|
||||||
|
import app.tasks.connector_indexers.dropbox_indexer as _mod
|
||||||
|
|
||||||
|
entries = [
|
||||||
|
_make_file_dict("mod1", "modified1.txt"),
|
||||||
|
_make_file_dict("mod2", "modified2.txt"),
|
||||||
|
]
|
||||||
|
|
||||||
|
mock_client = MagicMock()
|
||||||
|
mock_client.get_changes = AsyncMock(return_value=(entries, "cursor-v2", None))
|
||||||
|
|
||||||
|
monkeypatch.setattr(
|
||||||
|
_mod, "_should_skip_file", AsyncMock(return_value=(False, None))
|
||||||
|
)
|
||||||
|
|
||||||
|
download_mock = AsyncMock(return_value=(2, 0))
|
||||||
|
monkeypatch.setattr(_mod, "_download_and_index", download_mock)
|
||||||
|
|
||||||
|
mock_task_logger = MagicMock()
|
||||||
|
mock_task_logger.log_task_progress = AsyncMock()
|
||||||
|
|
||||||
|
indexed, skipped, _unsupported, cursor = await _index_with_delta_sync(
|
||||||
|
mock_client,
|
||||||
|
AsyncMock(),
|
||||||
|
_CONNECTOR_ID,
|
||||||
|
_SEARCH_SPACE_ID,
|
||||||
|
_USER_ID,
|
||||||
|
"cursor-v1",
|
||||||
|
mock_task_logger,
|
||||||
|
MagicMock(),
|
||||||
|
max_files=500,
|
||||||
|
enable_summary=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert indexed == 2
|
||||||
|
assert skipped == 0
|
||||||
|
assert cursor == "cursor-v2"
|
||||||
|
|
||||||
|
downloaded_files = download_mock.call_args[0][2]
|
||||||
|
assert len(downloaded_files) == 2
|
||||||
|
assert {f["id"] for f in downloaded_files} == {"mod1", "mod2"}
|
||||||
|
|
||||||
|
|
||||||
|
async def test_delta_sync_mix_deletions_and_upserts(monkeypatch):
|
||||||
|
"""E3: deletions processed, then remaining upserts filtered and indexed."""
|
||||||
|
import app.tasks.connector_indexers.dropbox_indexer as _mod
|
||||||
|
|
||||||
|
entries = [
|
||||||
|
{
|
||||||
|
".tag": "deleted",
|
||||||
|
"name": "removed.txt",
|
||||||
|
"path_lower": "/removed.txt",
|
||||||
|
"id": "id:del1",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
".tag": "deleted",
|
||||||
|
"name": "trashed.pdf",
|
||||||
|
"path_lower": "/trashed.pdf",
|
||||||
|
"id": "id:del2",
|
||||||
|
},
|
||||||
|
_make_file_dict("mod1", "updated.txt"),
|
||||||
|
_make_file_dict("new1", "brandnew.docx"),
|
||||||
|
]
|
||||||
|
|
||||||
|
mock_client = MagicMock()
|
||||||
|
mock_client.get_changes = AsyncMock(return_value=(entries, "final-cursor", None))
|
||||||
|
|
||||||
|
remove_calls: list[str] = []
|
||||||
|
|
||||||
|
async def _fake_remove(session, file_id, search_space_id):
|
||||||
|
remove_calls.append(file_id)
|
||||||
|
|
||||||
|
monkeypatch.setattr(_mod, "_remove_document", _fake_remove)
|
||||||
|
monkeypatch.setattr(
|
||||||
|
_mod, "_should_skip_file", AsyncMock(return_value=(False, None))
|
||||||
|
)
|
||||||
|
|
||||||
|
download_mock = AsyncMock(return_value=(2, 0))
|
||||||
|
monkeypatch.setattr(_mod, "_download_and_index", download_mock)
|
||||||
|
|
||||||
|
mock_task_logger = MagicMock()
|
||||||
|
mock_task_logger.log_task_progress = AsyncMock()
|
||||||
|
|
||||||
|
indexed, skipped, _unsupported, cursor = await _index_with_delta_sync(
|
||||||
|
mock_client,
|
||||||
|
AsyncMock(),
|
||||||
|
_CONNECTOR_ID,
|
||||||
|
_SEARCH_SPACE_ID,
|
||||||
|
_USER_ID,
|
||||||
|
"old-cursor",
|
||||||
|
mock_task_logger,
|
||||||
|
MagicMock(),
|
||||||
|
max_files=500,
|
||||||
|
enable_summary=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert sorted(remove_calls) == ["id:del1", "id:del2"]
|
||||||
|
assert indexed == 2
|
||||||
|
assert skipped == 0
|
||||||
|
assert cursor == "final-cursor"
|
||||||
|
|
||||||
|
downloaded_files = download_mock.call_args[0][2]
|
||||||
|
assert {f["id"] for f in downloaded_files} == {"mod1", "new1"}
|
||||||
|
|
||||||
|
|
||||||
|
async def test_delta_sync_returns_new_cursor(monkeypatch):
|
||||||
|
"""E4: the new cursor from the API response is returned."""
|
||||||
|
import app.tasks.connector_indexers.dropbox_indexer as _mod
|
||||||
|
|
||||||
|
mock_client = MagicMock()
|
||||||
|
mock_client.get_changes = AsyncMock(return_value=([], "brand-new-cursor-xyz", None))
|
||||||
|
|
||||||
|
monkeypatch.setattr(_mod, "_download_and_index", AsyncMock(return_value=(0, 0)))
|
||||||
|
|
||||||
|
mock_task_logger = MagicMock()
|
||||||
|
mock_task_logger.log_task_progress = AsyncMock()
|
||||||
|
|
||||||
|
indexed, skipped, _unsupported, cursor = await _index_with_delta_sync(
|
||||||
|
mock_client,
|
||||||
|
AsyncMock(),
|
||||||
|
_CONNECTOR_ID,
|
||||||
|
_SEARCH_SPACE_ID,
|
||||||
|
_USER_ID,
|
||||||
|
"old-cursor",
|
||||||
|
mock_task_logger,
|
||||||
|
MagicMock(),
|
||||||
|
max_files=500,
|
||||||
|
enable_summary=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert cursor == "brand-new-cursor-xyz"
|
||||||
|
assert indexed == 0
|
||||||
|
assert skipped == 0
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# F1-F3: index_dropbox_files orchestrator tests
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def orchestrator_mocks(monkeypatch):
|
||||||
|
"""Wire up mocks for index_dropbox_files orchestrator tests."""
|
||||||
|
import app.tasks.connector_indexers.dropbox_indexer as _mod
|
||||||
|
|
||||||
|
mock_connector = MagicMock()
|
||||||
|
mock_connector.config = {"_token_encrypted": False}
|
||||||
|
mock_connector.last_indexed_at = None
|
||||||
|
mock_connector.enable_summary = True
|
||||||
|
|
||||||
|
monkeypatch.setattr(
|
||||||
|
_mod,
|
||||||
|
"get_connector_by_id",
|
||||||
|
AsyncMock(return_value=mock_connector),
|
||||||
|
)
|
||||||
|
|
||||||
|
mock_task_logger = MagicMock()
|
||||||
|
mock_task_logger.log_task_start = AsyncMock(return_value=MagicMock())
|
||||||
|
mock_task_logger.log_task_progress = AsyncMock()
|
||||||
|
mock_task_logger.log_task_success = AsyncMock()
|
||||||
|
mock_task_logger.log_task_failure = AsyncMock()
|
||||||
|
monkeypatch.setattr(
|
||||||
|
_mod, "TaskLoggingService", MagicMock(return_value=mock_task_logger)
|
||||||
|
)
|
||||||
|
|
||||||
|
monkeypatch.setattr(_mod, "update_connector_last_indexed", AsyncMock())
|
||||||
|
|
||||||
|
full_scan_mock = AsyncMock(return_value=(5, 2, 0))
|
||||||
|
monkeypatch.setattr(_mod, "_index_full_scan", full_scan_mock)
|
||||||
|
|
||||||
|
delta_sync_mock = AsyncMock(return_value=(3, 1, 0, "delta-cursor-new"))
|
||||||
|
monkeypatch.setattr(_mod, "_index_with_delta_sync", delta_sync_mock)
|
||||||
|
|
||||||
|
mock_client = MagicMock()
|
||||||
|
mock_client.get_latest_cursor = AsyncMock(return_value=("latest-cursor-abc", None))
|
||||||
|
monkeypatch.setattr(_mod, "DropboxClient", MagicMock(return_value=mock_client))
|
||||||
|
|
||||||
|
return {
|
||||||
|
"connector": mock_connector,
|
||||||
|
"full_scan_mock": full_scan_mock,
|
||||||
|
"delta_sync_mock": delta_sync_mock,
|
||||||
|
"mock_client": mock_client,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
async def test_orchestrator_uses_delta_sync_when_cursor_and_last_indexed(
|
||||||
|
orchestrator_mocks,
|
||||||
|
):
|
||||||
|
"""F1: with cursor + last_indexed_at + use_delta_sync, calls delta sync."""
|
||||||
|
from datetime import UTC, datetime
|
||||||
|
|
||||||
|
connector = orchestrator_mocks["connector"]
|
||||||
|
connector.config = {
|
||||||
|
"_token_encrypted": False,
|
||||||
|
"folder_cursors": {"/docs": "saved-cursor-123"},
|
||||||
|
}
|
||||||
|
connector.last_indexed_at = datetime(2026, 1, 1, tzinfo=UTC)
|
||||||
|
|
||||||
|
mock_session = AsyncMock()
|
||||||
|
mock_session.commit = AsyncMock()
|
||||||
|
|
||||||
|
_indexed, _skipped, error, _unsupported = await index_dropbox_files(
|
||||||
|
mock_session,
|
||||||
|
_CONNECTOR_ID,
|
||||||
|
_SEARCH_SPACE_ID,
|
||||||
|
_USER_ID,
|
||||||
|
{
|
||||||
|
"folders": [{"path": "/docs", "name": "Docs"}],
|
||||||
|
"files": [],
|
||||||
|
"indexing_options": {"use_delta_sync": True},
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
assert error is None
|
||||||
|
orchestrator_mocks["delta_sync_mock"].assert_called_once()
|
||||||
|
orchestrator_mocks["full_scan_mock"].assert_not_called()
|
||||||
|
|
||||||
|
|
||||||
|
async def test_orchestrator_falls_back_to_full_scan_without_cursor(
|
||||||
|
orchestrator_mocks,
|
||||||
|
):
|
||||||
|
"""F2: without cursor, falls back to full scan."""
|
||||||
|
connector = orchestrator_mocks["connector"]
|
||||||
|
connector.config = {"_token_encrypted": False}
|
||||||
|
connector.last_indexed_at = None
|
||||||
|
|
||||||
|
mock_session = AsyncMock()
|
||||||
|
mock_session.commit = AsyncMock()
|
||||||
|
|
||||||
|
_indexed, _skipped, error, _unsupported = await index_dropbox_files(
|
||||||
|
mock_session,
|
||||||
|
_CONNECTOR_ID,
|
||||||
|
_SEARCH_SPACE_ID,
|
||||||
|
_USER_ID,
|
||||||
|
{
|
||||||
|
"folders": [{"path": "/docs", "name": "Docs"}],
|
||||||
|
"files": [],
|
||||||
|
"indexing_options": {"use_delta_sync": True},
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
assert error is None
|
||||||
|
orchestrator_mocks["full_scan_mock"].assert_called_once()
|
||||||
|
orchestrator_mocks["delta_sync_mock"].assert_not_called()
|
||||||
|
|
||||||
|
|
||||||
|
async def test_orchestrator_persists_cursor_after_sync(orchestrator_mocks):
|
||||||
|
"""F3: after sync, persists new cursor to connector config."""
|
||||||
|
connector = orchestrator_mocks["connector"]
|
||||||
|
connector.config = {"_token_encrypted": False}
|
||||||
|
connector.last_indexed_at = None
|
||||||
|
|
||||||
|
mock_session = AsyncMock()
|
||||||
|
mock_session.commit = AsyncMock()
|
||||||
|
|
||||||
|
await index_dropbox_files(
|
||||||
|
mock_session,
|
||||||
|
_CONNECTOR_ID,
|
||||||
|
_SEARCH_SPACE_ID,
|
||||||
|
_USER_ID,
|
||||||
|
{
|
||||||
|
"folders": [{"path": "/docs", "name": "Docs"}],
|
||||||
|
"files": [],
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
assert "folder_cursors" in connector.config
|
||||||
|
assert connector.config["folder_cursors"]["/docs"] == "latest-cursor-abc"
|
||||||
|
|
|
||||||
|
|
@ -366,7 +366,7 @@ async def test_full_scan_three_phase_counts(full_scan_mocks, monkeypatch):
|
||||||
full_scan_mocks["download_mock"].return_value = (mock_docs, 0)
|
full_scan_mocks["download_mock"].return_value = (mock_docs, 0)
|
||||||
full_scan_mocks["batch_mock"].return_value = ([], 2, 0)
|
full_scan_mocks["batch_mock"].return_value = ([], 2, 0)
|
||||||
|
|
||||||
indexed, skipped = await _run_full_scan(full_scan_mocks)
|
indexed, skipped, _unsupported = await _run_full_scan(full_scan_mocks)
|
||||||
|
|
||||||
assert indexed == 3 # 1 renamed + 2 from batch
|
assert indexed == 3 # 1 renamed + 2 from batch
|
||||||
assert skipped == 1 # 1 unchanged
|
assert skipped == 1 # 1 unchanged
|
||||||
|
|
@ -497,7 +497,7 @@ async def test_delta_sync_removals_serial_rest_parallel(monkeypatch):
|
||||||
mock_task_logger = MagicMock()
|
mock_task_logger = MagicMock()
|
||||||
mock_task_logger.log_task_progress = AsyncMock()
|
mock_task_logger.log_task_progress = AsyncMock()
|
||||||
|
|
||||||
indexed, skipped = await _index_with_delta_sync(
|
indexed, skipped, _unsupported = await _index_with_delta_sync(
|
||||||
MagicMock(),
|
MagicMock(),
|
||||||
mock_session,
|
mock_session,
|
||||||
MagicMock(),
|
MagicMock(),
|
||||||
|
|
@ -589,7 +589,7 @@ async def test_selected_files_single_file_indexed(selected_files_mocks):
|
||||||
)
|
)
|
||||||
selected_files_mocks["download_and_index_mock"].return_value = (1, 0)
|
selected_files_mocks["download_and_index_mock"].return_value = (1, 0)
|
||||||
|
|
||||||
indexed, skipped, errors = await _run_selected(
|
indexed, skipped, _unsup, errors = await _run_selected(
|
||||||
selected_files_mocks,
|
selected_files_mocks,
|
||||||
[("f1", "report.pdf")],
|
[("f1", "report.pdf")],
|
||||||
)
|
)
|
||||||
|
|
@ -613,7 +613,7 @@ async def test_selected_files_fetch_failure_isolation(selected_files_mocks):
|
||||||
)
|
)
|
||||||
selected_files_mocks["download_and_index_mock"].return_value = (2, 0)
|
selected_files_mocks["download_and_index_mock"].return_value = (2, 0)
|
||||||
|
|
||||||
indexed, skipped, errors = await _run_selected(
|
indexed, skipped, _unsup, errors = await _run_selected(
|
||||||
selected_files_mocks,
|
selected_files_mocks,
|
||||||
[("f1", "first.txt"), ("f2", "mid.txt"), ("f3", "third.txt")],
|
[("f1", "first.txt"), ("f2", "mid.txt"), ("f3", "third.txt")],
|
||||||
)
|
)
|
||||||
|
|
@ -647,7 +647,7 @@ async def test_selected_files_skip_rename_counting(selected_files_mocks):
|
||||||
|
|
||||||
selected_files_mocks["download_and_index_mock"].return_value = (2, 0)
|
selected_files_mocks["download_and_index_mock"].return_value = (2, 0)
|
||||||
|
|
||||||
indexed, skipped, errors = await _run_selected(
|
indexed, skipped, _unsup, errors = await _run_selected(
|
||||||
selected_files_mocks,
|
selected_files_mocks,
|
||||||
[
|
[
|
||||||
("s1", "unchanged.txt"),
|
("s1", "unchanged.txt"),
|
||||||
|
|
|
||||||
|
|
@ -198,7 +198,7 @@ async def test_gdrive_files_within_quota_are_downloaded(gdrive_selected_mocks):
|
||||||
)
|
)
|
||||||
m["download_and_index_mock"].return_value = (3, 0)
|
m["download_and_index_mock"].return_value = (3, 0)
|
||||||
|
|
||||||
indexed, _skipped, errors = await _run_gdrive_selected(
|
indexed, _skipped, _unsup, errors = await _run_gdrive_selected(
|
||||||
m, [("f1", "f1.xyz"), ("f2", "f2.xyz"), ("f3", "f3.xyz")]
|
m, [("f1", "f1.xyz"), ("f2", "f2.xyz"), ("f3", "f3.xyz")]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -219,7 +219,9 @@ async def test_gdrive_files_exceeding_quota_rejected(gdrive_selected_mocks):
|
||||||
None,
|
None,
|
||||||
)
|
)
|
||||||
|
|
||||||
indexed, _skipped, errors = await _run_gdrive_selected(m, [("big", "huge.pdf")])
|
indexed, _skipped, _unsup, errors = await _run_gdrive_selected(
|
||||||
|
m, [("big", "huge.pdf")]
|
||||||
|
)
|
||||||
|
|
||||||
assert indexed == 0
|
assert indexed == 0
|
||||||
assert len(errors) == 1
|
assert len(errors) == 1
|
||||||
|
|
@ -239,7 +241,7 @@ async def test_gdrive_quota_mix_partial_indexing(gdrive_selected_mocks):
|
||||||
)
|
)
|
||||||
m["download_and_index_mock"].return_value = (2, 0)
|
m["download_and_index_mock"].return_value = (2, 0)
|
||||||
|
|
||||||
indexed, _skipped, errors = await _run_gdrive_selected(
|
indexed, _skipped, _unsup, errors = await _run_gdrive_selected(
|
||||||
m, [("f1", "f1.xyz"), ("f2", "f2.xyz"), ("f3", "f3.xyz")]
|
m, [("f1", "f1.xyz"), ("f2", "f2.xyz"), ("f3", "f3.xyz")]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -299,7 +301,7 @@ async def test_gdrive_zero_quota_rejects_all(gdrive_selected_mocks):
|
||||||
None,
|
None,
|
||||||
)
|
)
|
||||||
|
|
||||||
indexed, _skipped, errors = await _run_gdrive_selected(
|
indexed, _skipped, _unsup, errors = await _run_gdrive_selected(
|
||||||
m, [("f1", "f1.xyz"), ("f2", "f2.xyz")]
|
m, [("f1", "f1.xyz"), ("f2", "f2.xyz")]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -384,7 +386,7 @@ async def test_gdrive_full_scan_skips_over_quota(gdrive_full_scan_mocks, monkeyp
|
||||||
m["download_mock"].return_value = ([], 0)
|
m["download_mock"].return_value = ([], 0)
|
||||||
m["batch_mock"].return_value = ([], 2, 0)
|
m["batch_mock"].return_value = ([], 2, 0)
|
||||||
|
|
||||||
_indexed, skipped = await _run_gdrive_full_scan(m)
|
_indexed, skipped, _unsup = await _run_gdrive_full_scan(m)
|
||||||
|
|
||||||
call_files = m["download_mock"].call_args[0][1]
|
call_files = m["download_mock"].call_args[0][1]
|
||||||
assert len(call_files) == 2
|
assert len(call_files) == 2
|
||||||
|
|
@ -459,7 +461,7 @@ async def test_gdrive_delta_sync_skips_over_quota(monkeypatch):
|
||||||
mock_task_logger = MagicMock()
|
mock_task_logger = MagicMock()
|
||||||
mock_task_logger.log_task_progress = AsyncMock()
|
mock_task_logger.log_task_progress = AsyncMock()
|
||||||
|
|
||||||
_indexed, skipped = await _mod._index_with_delta_sync(
|
_indexed, skipped, _unsupported = await _mod._index_with_delta_sync(
|
||||||
MagicMock(),
|
MagicMock(),
|
||||||
session,
|
session,
|
||||||
MagicMock(),
|
MagicMock(),
|
||||||
|
|
@ -552,7 +554,9 @@ async def test_onedrive_over_quota_rejected(onedrive_selected_mocks):
|
||||||
None,
|
None,
|
||||||
)
|
)
|
||||||
|
|
||||||
indexed, _skipped, errors = await _run_onedrive_selected(m, [("big", "huge.pdf")])
|
indexed, _skipped, _unsup, errors = await _run_onedrive_selected(
|
||||||
|
m, [("big", "huge.pdf")]
|
||||||
|
)
|
||||||
|
|
||||||
assert indexed == 0
|
assert indexed == 0
|
||||||
assert len(errors) == 1
|
assert len(errors) == 1
|
||||||
|
|
@ -652,7 +656,7 @@ async def test_dropbox_over_quota_rejected(dropbox_selected_mocks):
|
||||||
None,
|
None,
|
||||||
)
|
)
|
||||||
|
|
||||||
indexed, _skipped, errors = await _run_dropbox_selected(
|
indexed, _skipped, _unsup, errors = await _run_dropbox_selected(
|
||||||
m, [("/huge.pdf", "huge.pdf")]
|
m, [("/huge.pdf", "huge.pdf")]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
0
surfsense_backend/tests/unit/connectors/__init__.py
Normal file
0
surfsense_backend/tests/unit/connectors/__init__.py
Normal file
123
surfsense_backend/tests/unit/connectors/test_dropbox_client.py
Normal file
123
surfsense_backend/tests/unit/connectors/test_dropbox_client.py
Normal file
|
|
@ -0,0 +1,123 @@
|
||||||
|
"""Tests for DropboxClient delta-sync methods (get_latest_cursor, get_changes)."""
|
||||||
|
|
||||||
|
from unittest.mock import AsyncMock, MagicMock
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from app.connectors.dropbox.client import DropboxClient
|
||||||
|
|
||||||
|
pytestmark = pytest.mark.unit
|
||||||
|
|
||||||
|
|
||||||
|
def _make_client() -> DropboxClient:
|
||||||
|
"""Create a DropboxClient with a mocked DB session so no real DB needed."""
|
||||||
|
client = DropboxClient.__new__(DropboxClient)
|
||||||
|
client._session = MagicMock()
|
||||||
|
client._connector_id = 1
|
||||||
|
return client
|
||||||
|
|
||||||
|
|
||||||
|
# ---------- C1: get_latest_cursor ----------
|
||||||
|
|
||||||
|
|
||||||
|
async def test_get_latest_cursor_returns_cursor_string(monkeypatch):
|
||||||
|
client = _make_client()
|
||||||
|
|
||||||
|
fake_resp = MagicMock()
|
||||||
|
fake_resp.status_code = 200
|
||||||
|
fake_resp.json.return_value = {"cursor": "AAHbKxRZ9enq…"}
|
||||||
|
|
||||||
|
monkeypatch.setattr(client, "_request", AsyncMock(return_value=fake_resp))
|
||||||
|
|
||||||
|
cursor, error = await client.get_latest_cursor("/my-folder")
|
||||||
|
|
||||||
|
assert cursor == "AAHbKxRZ9enq…"
|
||||||
|
assert error is None
|
||||||
|
client._request.assert_called_once_with(
|
||||||
|
"/2/files/list_folder/get_latest_cursor",
|
||||||
|
{
|
||||||
|
"path": "/my-folder",
|
||||||
|
"recursive": False,
|
||||||
|
"include_non_downloadable_files": True,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------- C2: get_changes returns entries and new cursor ----------
|
||||||
|
|
||||||
|
|
||||||
|
async def test_get_changes_returns_entries_and_cursor(monkeypatch):
|
||||||
|
client = _make_client()
|
||||||
|
|
||||||
|
fake_resp = MagicMock()
|
||||||
|
fake_resp.status_code = 200
|
||||||
|
fake_resp.json.return_value = {
|
||||||
|
"entries": [
|
||||||
|
{".tag": "file", "name": "new.txt", "id": "id:abc"},
|
||||||
|
{".tag": "deleted", "name": "old.txt"},
|
||||||
|
],
|
||||||
|
"cursor": "cursor-v2",
|
||||||
|
"has_more": False,
|
||||||
|
}
|
||||||
|
monkeypatch.setattr(client, "_request", AsyncMock(return_value=fake_resp))
|
||||||
|
|
||||||
|
entries, new_cursor, error = await client.get_changes("cursor-v1")
|
||||||
|
|
||||||
|
assert error is None
|
||||||
|
assert new_cursor == "cursor-v2"
|
||||||
|
assert len(entries) == 2
|
||||||
|
assert entries[0]["name"] == "new.txt"
|
||||||
|
assert entries[1][".tag"] == "deleted"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------- C3: get_changes handles pagination ----------
|
||||||
|
|
||||||
|
|
||||||
|
async def test_get_changes_handles_pagination(monkeypatch):
|
||||||
|
client = _make_client()
|
||||||
|
|
||||||
|
page1 = MagicMock()
|
||||||
|
page1.status_code = 200
|
||||||
|
page1.json.return_value = {
|
||||||
|
"entries": [{".tag": "file", "name": "a.txt", "id": "id:a"}],
|
||||||
|
"cursor": "cursor-page2",
|
||||||
|
"has_more": True,
|
||||||
|
}
|
||||||
|
page2 = MagicMock()
|
||||||
|
page2.status_code = 200
|
||||||
|
page2.json.return_value = {
|
||||||
|
"entries": [{".tag": "file", "name": "b.txt", "id": "id:b"}],
|
||||||
|
"cursor": "cursor-final",
|
||||||
|
"has_more": False,
|
||||||
|
}
|
||||||
|
|
||||||
|
request_mock = AsyncMock(side_effect=[page1, page2])
|
||||||
|
monkeypatch.setattr(client, "_request", request_mock)
|
||||||
|
|
||||||
|
entries, new_cursor, error = await client.get_changes("cursor-v1")
|
||||||
|
|
||||||
|
assert error is None
|
||||||
|
assert new_cursor == "cursor-final"
|
||||||
|
assert len(entries) == 2
|
||||||
|
assert {e["name"] for e in entries} == {"a.txt", "b.txt"}
|
||||||
|
assert request_mock.call_count == 2
|
||||||
|
|
||||||
|
|
||||||
|
# ---------- C4: get_changes raises on 401 ----------
|
||||||
|
|
||||||
|
|
||||||
|
async def test_get_changes_returns_error_on_401(monkeypatch):
|
||||||
|
client = _make_client()
|
||||||
|
|
||||||
|
fake_resp = MagicMock()
|
||||||
|
fake_resp.status_code = 401
|
||||||
|
fake_resp.text = "Unauthorized"
|
||||||
|
|
||||||
|
monkeypatch.setattr(client, "_request", AsyncMock(return_value=fake_resp))
|
||||||
|
|
||||||
|
entries, new_cursor, error = await client.get_changes("old-cursor")
|
||||||
|
|
||||||
|
assert error is not None
|
||||||
|
assert "401" in error
|
||||||
|
assert entries == []
|
||||||
|
assert new_cursor is None
|
||||||
|
|
@@ -0,0 +1,173 @@
"""Tests for Dropbox file type filtering (should_skip_file)."""

import pytest

from app.connectors.dropbox.file_types import should_skip_file

pytestmark = pytest.mark.unit


# ---------------------------------------------------------------------------
# Structural skips (independent of ETL service)
# ---------------------------------------------------------------------------


def test_folder_item_is_skipped():
    item = {".tag": "folder", "name": "My Folder"}
    skip, ext = should_skip_file(item)
    assert skip is True
    assert ext is None


def test_paper_file_is_not_skipped():
    item = {".tag": "file", "name": "notes.paper", "is_downloadable": False}
    skip, ext = should_skip_file(item)
    assert skip is False
    assert ext is None


def test_non_downloadable_item_is_skipped():
    item = {".tag": "file", "name": "locked.gdoc", "is_downloadable": False}
    skip, ext = should_skip_file(item)
    assert skip is True
    assert ext is None


# ---------------------------------------------------------------------------
# Extension-based skips (require ETL service context)
# ---------------------------------------------------------------------------


@pytest.mark.parametrize(
    "filename",
    [
        "archive.zip",
        "backup.tar",
        "data.gz",
        "stuff.rar",
        "pack.7z",
        "program.exe",
        "lib.dll",
        "module.so",
        "image.dmg",
        "disk.iso",
        "movie.mov",
        "clip.avi",
        "video.mkv",
        "film.wmv",
        "stream.flv",
        "favicon.ico",
        "raw.cr2",
        "photo.nef",
        "image.arw",
        "pic.dng",
        "design.psd",
        "vector.ai",
        "mockup.sketch",
        "proto.fig",
        "font.ttf",
        "font.otf",
        "font.woff",
        "font.woff2",
        "model.stl",
        "scene.fbx",
        "mesh.blend",
        "local.db",
        "data.sqlite",
        "access.mdb",
    ],
)
def test_non_parseable_extensions_are_skipped(filename, mocker):
    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
    item = {".tag": "file", "name": filename}
    skip, ext = should_skip_file(item)
    assert skip is True, f"{filename} should be skipped"
    assert ext is not None


@pytest.mark.parametrize(
    "filename",
    [
        "report.pdf",
        "document.docx",
        "sheet.xlsx",
        "slides.pptx",
        "readme.txt",
        "data.csv",
        "page.html",
        "notes.md",
        "config.json",
        "feed.xml",
    ],
)
def test_parseable_documents_are_not_skipped(filename, mocker):
    """Files in plaintext/direct_convert/universal document sets are never skipped."""
    for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
        mocker.patch("app.config.config.ETL_SERVICE", service)
        item = {".tag": "file", "name": filename}
        skip, ext = should_skip_file(item)
        assert skip is False, f"{filename} should NOT be skipped with {service}"
        assert ext is None


@pytest.mark.parametrize(
    "filename",
    ["photo.jpg", "image.jpeg", "screenshot.png", "scan.bmp", "page.tiff", "doc.tif"],
)
def test_universal_images_are_not_skipped(filename, mocker):
    """Images supported by all parsers are never skipped."""
    for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
        mocker.patch("app.config.config.ETL_SERVICE", service)
        item = {".tag": "file", "name": filename}
        skip, ext = should_skip_file(item)
        assert skip is False, f"{filename} should NOT be skipped with {service}"
        assert ext is None


@pytest.mark.parametrize(
    "filename,service,expected_skip",
    [
        ("old.doc", "DOCLING", True),
        ("old.doc", "LLAMACLOUD", False),
        ("old.doc", "UNSTRUCTURED", False),
        ("legacy.xls", "DOCLING", True),
        ("legacy.xls", "LLAMACLOUD", False),
        ("legacy.xls", "UNSTRUCTURED", False),
        ("deck.ppt", "DOCLING", True),
        ("deck.ppt", "LLAMACLOUD", False),
        ("deck.ppt", "UNSTRUCTURED", False),
        ("icon.svg", "DOCLING", True),
        ("icon.svg", "LLAMACLOUD", False),
        ("anim.gif", "DOCLING", True),
        ("anim.gif", "LLAMACLOUD", False),
        ("photo.webp", "DOCLING", False),
        ("photo.webp", "LLAMACLOUD", False),
        ("photo.webp", "UNSTRUCTURED", True),
        ("live.heic", "DOCLING", True),
        ("live.heic", "UNSTRUCTURED", False),
        ("macro.docm", "DOCLING", True),
        ("macro.docm", "LLAMACLOUD", False),
        ("mail.eml", "DOCLING", True),
        ("mail.eml", "UNSTRUCTURED", False),
    ],
)
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
    mocker.patch("app.config.config.ETL_SERVICE", service)
    item = {".tag": "file", "name": filename}
    skip, ext = should_skip_file(item)
    assert skip is expected_skip, (
        f"{filename} with {service}: expected skip={expected_skip}"
    )
    if expected_skip:
        assert ext is not None
    else:
        assert ext is None


def test_returns_unsupported_extension(mocker):
    """When a file is skipped due to unsupported extension, the ext string is returned."""
    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
    item = {".tag": "file", "name": "old.doc"}
    skip, ext = should_skip_file(item)
    assert skip is True
    assert ext == ".doc"
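The structural rules pinned down above can be summarised in a few lines. The following is a minimal sketch only, not the real `app.connectors.dropbox.file_types` module; it omits the extension allowlist step that the parametrized tests cover.

    from pathlib import Path


    def should_skip_file(item: dict) -> tuple[bool, str | None]:
        """Sketch of the structural checks: folders are skipped, Dropbox Paper
        docs stay indexable even though they are not directly downloadable, and
        other non-downloadable items are skipped."""
        if item.get(".tag") == "folder":
            return True, None
        name = item.get("name", "")
        if Path(name).suffix.lower() == ".paper":
            return False, None
        if item.get("is_downloadable") is False:
            return True, None
        # The real module then applies the per-service extension allowlist
        # (see the extension-check sketch after the Google Drive tests below).
        return False, None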
@@ -0,0 +1,43 @@
"""Test that Dropbox re-auth preserves folder_cursors in connector config."""

import pytest

pytestmark = pytest.mark.unit


def test_reauth_preserves_folder_cursors():
    """G1: re-authentication preserves folder_cursors alongside cursor."""
    old_config = {
        "access_token": "old-token-enc",
        "refresh_token": "old-refresh-enc",
        "cursor": "old-cursor-abc",
        "folder_cursors": {"/docs": "cursor-docs-123", "/photos": "cursor-photos-456"},
        "_token_encrypted": True,
        "auth_expired": True,
    }

    new_connector_config = {
        "access_token": "new-token-enc",
        "refresh_token": "new-refresh-enc",
        "token_type": "bearer",
        "expires_in": 14400,
        "expires_at": "2026-04-06T16:00:00+00:00",
        "_token_encrypted": True,
    }

    existing_cursor = old_config.get("cursor")
    existing_folder_cursors = old_config.get("folder_cursors")
    merged_config = {
        **new_connector_config,
        "cursor": existing_cursor,
        "folder_cursors": existing_folder_cursors,
        "auth_expired": False,
    }

    assert merged_config["access_token"] == "new-token-enc"
    assert merged_config["cursor"] == "old-cursor-abc"
    assert merged_config["folder_cursors"] == {
        "/docs": "cursor-docs-123",
        "/photos": "cursor-photos-456",
    }
    assert merged_config["auth_expired"] is False
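The merge rule this test exercises can be factored into a small helper. The sketch below is illustrative only; the function name and where it is called in the OAuth callback are assumptions, not the actual connector route.

    from typing import Any


    def merge_reauth_config(
        old_config: dict[str, Any], new_token_payload: dict[str, Any]
    ) -> dict[str, Any]:
        """Hypothetical helper: replace credentials, keep the incremental-sync
        cursors from the previous config, and clear the auth_expired flag."""
        return {
            **new_token_payload,
            "cursor": old_config.get("cursor"),
            "folder_cursors": old_config.get("folder_cursors"),
            "auth_expired": False,
        }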
@@ -0,0 +1,80 @@
"""Tests for Google Drive file type filtering."""

import pytest

from app.connectors.google_drive.file_types import should_skip_by_extension

pytestmark = pytest.mark.unit


@pytest.mark.parametrize(
    "filename",
    [
        "malware.exe",
        "archive.zip",
        "video.mov",
        "font.woff2",
        "model.blend",
    ],
)
def test_unsupported_extensions_are_skipped_regardless_of_service(filename, mocker):
    """Truly unsupported files are skipped no matter which ETL service is configured."""
    for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
        mocker.patch("app.config.config.ETL_SERVICE", service)
        skip, _ext = should_skip_by_extension(filename)
        assert skip is True


@pytest.mark.parametrize(
    "filename",
    [
        "report.pdf",
        "doc.docx",
        "sheet.xlsx",
        "slides.pptx",
        "readme.txt",
        "data.csv",
        "photo.png",
        "notes.md",
    ],
)
def test_universal_extensions_are_not_skipped(filename, mocker):
    """Files supported by all parsers (or handled by plaintext/direct_convert) are never skipped."""
    for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
        mocker.patch("app.config.config.ETL_SERVICE", service)
        skip, ext = should_skip_by_extension(filename)
        assert skip is False, f"{filename} should NOT be skipped with {service}"
        assert ext is None


@pytest.mark.parametrize(
    "filename,service,expected_skip",
    [
        ("macro.docm", "DOCLING", True),
        ("macro.docm", "LLAMACLOUD", False),
        ("mail.eml", "DOCLING", True),
        ("mail.eml", "UNSTRUCTURED", False),
        ("photo.gif", "DOCLING", True),
        ("photo.gif", "LLAMACLOUD", False),
        ("photo.heic", "UNSTRUCTURED", False),
        ("photo.heic", "DOCLING", True),
    ],
)
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
    mocker.patch("app.config.config.ETL_SERVICE", service)
    skip, ext = should_skip_by_extension(filename)
    assert skip is expected_skip, (
        f"{filename} with {service}: expected skip={expected_skip}"
    )
    if expected_skip:
        assert ext is not None, "unsupported extension should be returned"
    else:
        assert ext is None


def test_returns_unsupported_extension(mocker):
    """When a file is skipped, the unsupported extension string is returned."""
    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
    skip, ext = should_skip_by_extension("macro.docm")
    assert skip is True
    assert ext == ".docm"
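The contract these tests describe is essentially an allowlist lookup keyed by the configured ETL service. The sketch below is one plausible shape, assuming the plaintext and direct-convert sets shown here (the real module presumably keeps fuller lists); only `get_document_extensions_for_service` and `config.ETL_SERVICE` are taken from this commit.

    from pathlib import Path

    from app.config import config
    from app.utils.file_extensions import get_document_extensions_for_service

    # Assumed contents, for illustration only.
    PLAINTEXT_EXTENSIONS = {".txt", ".md", ".markdown", ".json", ".xml"}
    DIRECT_CONVERT_EXTENSIONS = {".csv", ".tsv", ".html"}


    def should_skip_by_extension(filename: str) -> tuple[bool, str | None]:
        """Skip a file only if its extension is outside every set the configured
        parser can handle; return the offending extension for logging."""
        ext = Path(filename).suffix.lower()
        allowed = (
            PLAINTEXT_EXTENSIONS
            | DIRECT_CONVERT_EXTENSIONS
            | get_document_extensions_for_service(config.ETL_SERVICE)
        )
        if ext in allowed:
            return False, None
        return True, ext or None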
@@ -0,0 +1,118 @@
"""Tests for OneDrive file type filtering."""

import pytest

from app.connectors.onedrive.file_types import should_skip_file

pytestmark = pytest.mark.unit


# ---------------------------------------------------------------------------
# Structural skips (independent of ETL service)
# ---------------------------------------------------------------------------


def test_folder_is_skipped():
    item = {"folder": {}, "name": "My Folder"}
    skip, ext = should_skip_file(item)
    assert skip is True
    assert ext is None


def test_remote_item_is_skipped():
    item = {"remoteItem": {}, "name": "shared.docx"}
    skip, ext = should_skip_file(item)
    assert skip is True
    assert ext is None


def test_package_is_skipped():
    item = {"package": {}, "name": "notebook"}
    skip, ext = should_skip_file(item)
    assert skip is True
    assert ext is None


def test_onenote_is_skipped():
    item = {"name": "notes", "file": {"mimeType": "application/msonenote"}}
    skip, ext = should_skip_file(item)
    assert skip is True
    assert ext is None


# ---------------------------------------------------------------------------
# Extension-based skips (require ETL service context)
# ---------------------------------------------------------------------------


@pytest.mark.parametrize(
    "filename",
    [
        "malware.exe",
        "archive.zip",
        "video.mov",
        "font.woff2",
        "model.blend",
    ],
)
def test_unsupported_extensions_are_skipped(filename, mocker):
    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
    item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
    skip, ext = should_skip_file(item)
    assert skip is True, f"{filename} should be skipped"
    assert ext is not None


@pytest.mark.parametrize(
    "filename",
    [
        "report.pdf",
        "doc.docx",
        "sheet.xlsx",
        "slides.pptx",
        "readme.txt",
        "data.csv",
        "photo.png",
        "notes.md",
    ],
)
def test_universal_files_are_not_skipped(filename, mocker):
    for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
        mocker.patch("app.config.config.ETL_SERVICE", service)
        item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
        skip, ext = should_skip_file(item)
        assert skip is False, f"{filename} should NOT be skipped with {service}"
        assert ext is None


@pytest.mark.parametrize(
    "filename,service,expected_skip",
    [
        ("macro.docm", "DOCLING", True),
        ("macro.docm", "LLAMACLOUD", False),
        ("mail.eml", "DOCLING", True),
        ("mail.eml", "UNSTRUCTURED", False),
        ("photo.heic", "UNSTRUCTURED", False),
        ("photo.heic", "DOCLING", True),
    ],
)
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
    mocker.patch("app.config.config.ETL_SERVICE", service)
    item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
    skip, ext = should_skip_file(item)
    assert skip is expected_skip, (
        f"{filename} with {service}: expected skip={expected_skip}"
    )
    if expected_skip:
        assert ext is not None
    else:
        assert ext is None


def test_returns_unsupported_extension(mocker):
    """When a file is skipped due to unsupported extension, the ext string is returned."""
    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
    item = {"name": "mail.eml", "file": {"mimeType": "application/octet-stream"}}
    skip, ext = should_skip_file(item)
    assert skip is True
    assert ext == ".eml"
surfsense_backend/tests/unit/etl_pipeline/conftest.py (new file, 27 lines)
@@ -0,0 +1,27 @@
"""Pre-register the etl_pipeline package to avoid circular imports during unit tests."""

import sys
import types
from pathlib import Path

_BACKEND = Path(__file__).resolve().parents[3]


def _stub_package(dotted: str, fs_dir: Path) -> None:
    if dotted not in sys.modules:
        mod = types.ModuleType(dotted)
        mod.__path__ = [str(fs_dir)]
        mod.__package__ = dotted
        sys.modules[dotted] = mod

    parts = dotted.split(".")
    if len(parts) > 1:
        parent_dotted = ".".join(parts[:-1])
        parent = sys.modules.get(parent_dotted)
        if parent is not None:
            setattr(parent, parts[-1], sys.modules[dotted])


_stub_package("app", _BACKEND / "app")
_stub_package("app.etl_pipeline", _BACKEND / "app" / "etl_pipeline")
_stub_package("app.etl_pipeline.parsers", _BACKEND / "app" / "etl_pipeline" / "parsers")
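The point of the stubbing is that a test can import a submodule directly without triggering the full `app` package initialisation. Roughly, assuming the file layout implied by `_BACKEND`:

    # With the stub packages registered by conftest.py, this resolves the module
    # file under app/etl_pipeline/ without importing the real app/__init__.py
    # (illustrative; the exact side effects avoided depend on the repo).
    from app.etl_pipeline.etl_pipeline_service import EtlPipelineService

    service = EtlPipelineService()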
@@ -0,0 +1,461 @@
"""Tests for EtlPipelineService -- the unified ETL pipeline public interface."""

import pytest

from app.etl_pipeline.etl_document import EtlRequest
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService

pytestmark = pytest.mark.unit


async def test_extract_txt_file_returns_markdown(tmp_path):
    """Tracer bullet: a .txt file is read and returned as-is in an EtlResult."""
    txt_file = tmp_path / "hello.txt"
    txt_file.write_text("Hello, world!", encoding="utf-8")

    service = EtlPipelineService()
    result = await service.extract(
        EtlRequest(file_path=str(txt_file), filename="hello.txt")
    )

    assert result.markdown_content == "Hello, world!"
    assert result.etl_service == "PLAINTEXT"
    assert result.content_type == "plaintext"


async def test_extract_md_file(tmp_path):
    """A .md file is classified as PLAINTEXT and extracted."""
    md_file = tmp_path / "readme.md"
    md_file.write_text("# Title\n\nBody text.", encoding="utf-8")

    result = await EtlPipelineService().extract(
        EtlRequest(file_path=str(md_file), filename="readme.md")
    )

    assert result.markdown_content == "# Title\n\nBody text."
    assert result.etl_service == "PLAINTEXT"
    assert result.content_type == "plaintext"


async def test_extract_markdown_file(tmp_path):
    """A .markdown file is classified as PLAINTEXT and extracted."""
    md_file = tmp_path / "notes.markdown"
    md_file.write_text("Some notes.", encoding="utf-8")

    result = await EtlPipelineService().extract(
        EtlRequest(file_path=str(md_file), filename="notes.markdown")
    )

    assert result.markdown_content == "Some notes."
    assert result.etl_service == "PLAINTEXT"


async def test_extract_python_file(tmp_path):
    """A .py source code file is classified as PLAINTEXT."""
    py_file = tmp_path / "script.py"
    py_file.write_text("print('hello')", encoding="utf-8")

    result = await EtlPipelineService().extract(
        EtlRequest(file_path=str(py_file), filename="script.py")
    )

    assert result.markdown_content == "print('hello')"
    assert result.etl_service == "PLAINTEXT"
    assert result.content_type == "plaintext"


async def test_extract_js_file(tmp_path):
    """A .js source code file is classified as PLAINTEXT."""
    js_file = tmp_path / "app.js"
    js_file.write_text("console.log('hi');", encoding="utf-8")

    result = await EtlPipelineService().extract(
        EtlRequest(file_path=str(js_file), filename="app.js")
    )

    assert result.markdown_content == "console.log('hi');"
    assert result.etl_service == "PLAINTEXT"


async def test_extract_csv_returns_markdown_table(tmp_path):
    """A .csv file is converted to a markdown table."""
    csv_file = tmp_path / "data.csv"
    csv_file.write_text("name,age\nAlice,30\nBob,25\n", encoding="utf-8")

    result = await EtlPipelineService().extract(
        EtlRequest(file_path=str(csv_file), filename="data.csv")
    )

    assert "| name | age |" in result.markdown_content
    assert "| Alice | 30 |" in result.markdown_content
    assert result.etl_service == "DIRECT_CONVERT"
    assert result.content_type == "direct_convert"


async def test_extract_tsv_returns_markdown_table(tmp_path):
    """A .tsv file is converted to a markdown table."""
    tsv_file = tmp_path / "data.tsv"
    tsv_file.write_text("x\ty\n1\t2\n", encoding="utf-8")

    result = await EtlPipelineService().extract(
        EtlRequest(file_path=str(tsv_file), filename="data.tsv")
    )

    assert "| x | y |" in result.markdown_content
    assert result.etl_service == "DIRECT_CONVERT"


async def test_extract_html_returns_markdown(tmp_path):
    """An .html file is converted to markdown."""
    html_file = tmp_path / "page.html"
    html_file.write_text("<h1>Title</h1><p>Body</p>", encoding="utf-8")

    result = await EtlPipelineService().extract(
        EtlRequest(file_path=str(html_file), filename="page.html")
    )

    assert "Title" in result.markdown_content
    assert "Body" in result.markdown_content
    assert result.etl_service == "DIRECT_CONVERT"


async def test_extract_mp3_returns_transcription(tmp_path, mocker):
    """An .mp3 audio file is transcribed via litellm.atranscription."""
    audio_file = tmp_path / "recording.mp3"
    audio_file.write_bytes(b"\x00" * 100)

    mocker.patch("app.config.config.STT_SERVICE", "openai/whisper-1")
    mocker.patch("app.config.config.STT_SERVICE_API_KEY", "fake-key")
    mocker.patch("app.config.config.STT_SERVICE_API_BASE", None)

    mock_transcription = mocker.patch(
        "app.etl_pipeline.parsers.audio.atranscription",
        return_value={"text": "Hello from audio"},
    )

    result = await EtlPipelineService().extract(
        EtlRequest(file_path=str(audio_file), filename="recording.mp3")
    )

    assert "Hello from audio" in result.markdown_content
    assert result.etl_service == "AUDIO"
    assert result.content_type == "audio"
    mock_transcription.assert_called_once()


# ---------------------------------------------------------------------------
# Slice 7 - DOCLING document parsing
# ---------------------------------------------------------------------------


async def test_extract_pdf_with_docling(tmp_path, mocker):
    """A .pdf file with ETL_SERVICE=DOCLING returns parsed markdown."""
    pdf_file = tmp_path / "report.pdf"
    pdf_file.write_bytes(b"%PDF-1.4 fake")

    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")

    fake_docling = mocker.AsyncMock()
    fake_docling.process_document.return_value = {"content": "# Parsed PDF"}
    mocker.patch(
        "app.services.docling_service.create_docling_service",
        return_value=fake_docling,
    )

    result = await EtlPipelineService().extract(
        EtlRequest(file_path=str(pdf_file), filename="report.pdf")
    )

    assert result.markdown_content == "# Parsed PDF"
    assert result.etl_service == "DOCLING"
    assert result.content_type == "document"


# ---------------------------------------------------------------------------
# Slice 8 - UNSTRUCTURED document parsing
# ---------------------------------------------------------------------------


async def test_extract_pdf_with_unstructured(tmp_path, mocker):
    """A .pdf file with ETL_SERVICE=UNSTRUCTURED returns parsed markdown."""
    pdf_file = tmp_path / "report.pdf"
    pdf_file.write_bytes(b"%PDF-1.4 fake")

    mocker.patch("app.config.config.ETL_SERVICE", "UNSTRUCTURED")

    class FakeDoc:
        def __init__(self, text):
            self.page_content = text

    fake_loader_instance = mocker.AsyncMock()
    fake_loader_instance.aload.return_value = [
        FakeDoc("Page 1 content"),
        FakeDoc("Page 2 content"),
    ]
    mocker.patch(
        "langchain_unstructured.UnstructuredLoader",
        return_value=fake_loader_instance,
    )

    result = await EtlPipelineService().extract(
        EtlRequest(file_path=str(pdf_file), filename="report.pdf")
    )

    assert "Page 1 content" in result.markdown_content
    assert "Page 2 content" in result.markdown_content
    assert result.etl_service == "UNSTRUCTURED"
    assert result.content_type == "document"


# ---------------------------------------------------------------------------
# Slice 9 - LLAMACLOUD document parsing
# ---------------------------------------------------------------------------


async def test_extract_pdf_with_llamacloud(tmp_path, mocker):
    """A .pdf file with ETL_SERVICE=LLAMACLOUD returns parsed markdown."""
    pdf_file = tmp_path / "report.pdf"
    pdf_file.write_bytes(b"%PDF-1.4 fake content " * 10)

    mocker.patch("app.config.config.ETL_SERVICE", "LLAMACLOUD")
    mocker.patch("app.config.config.LLAMA_CLOUD_API_KEY", "fake-key", create=True)

    class FakeDoc:
        text = "# LlamaCloud parsed"

    class FakeJobResult:
        pages = []

        def get_markdown_documents(self, split_by_page=True):
            return [FakeDoc()]

    fake_parser = mocker.AsyncMock()
    fake_parser.aparse.return_value = FakeJobResult()
    mocker.patch(
        "llama_cloud_services.LlamaParse",
        return_value=fake_parser,
    )
    mocker.patch(
        "llama_cloud_services.parse.utils.ResultType",
        mocker.MagicMock(MD="md"),
    )

    result = await EtlPipelineService().extract(
        EtlRequest(file_path=str(pdf_file), filename="report.pdf", estimated_pages=5)
    )

    assert result.markdown_content == "# LlamaCloud parsed"
    assert result.etl_service == "LLAMACLOUD"
    assert result.content_type == "document"


# ---------------------------------------------------------------------------
# Slice 10 - unknown extension falls through to document ETL
# ---------------------------------------------------------------------------


async def test_unknown_extension_uses_document_etl(tmp_path, mocker):
    """An allowlisted document extension (.docx) routes to the document ETL path."""
    docx_file = tmp_path / "doc.docx"
    docx_file.write_bytes(b"PK fake docx")

    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")

    fake_docling = mocker.AsyncMock()
    fake_docling.process_document.return_value = {"content": "Docx content"}
    mocker.patch(
        "app.services.docling_service.create_docling_service",
        return_value=fake_docling,
    )

    result = await EtlPipelineService().extract(
        EtlRequest(file_path=str(docx_file), filename="doc.docx")
    )

    assert result.markdown_content == "Docx content"
    assert result.content_type == "document"


# ---------------------------------------------------------------------------
# Slice 11 - EtlRequest validation
# ---------------------------------------------------------------------------


def test_etl_request_requires_filename():
    """EtlRequest rejects missing filename."""
    with pytest.raises(ValueError, match="filename must not be empty"):
        EtlRequest(file_path="/tmp/some.txt", filename="")


# ---------------------------------------------------------------------------
# Slice 12 - unknown ETL_SERVICE raises EtlServiceUnavailableError
# ---------------------------------------------------------------------------


async def test_unknown_etl_service_raises(tmp_path, mocker):
    """An unknown ETL_SERVICE raises EtlServiceUnavailableError."""
    from app.etl_pipeline.exceptions import EtlServiceUnavailableError

    pdf_file = tmp_path / "report.pdf"
    pdf_file.write_bytes(b"%PDF fake")

    mocker.patch("app.config.config.ETL_SERVICE", "NONEXISTENT")

    with pytest.raises(EtlServiceUnavailableError, match="Unknown ETL_SERVICE"):
        await EtlPipelineService().extract(
            EtlRequest(file_path=str(pdf_file), filename="report.pdf")
        )


# ---------------------------------------------------------------------------
# Slice 13 - unsupported file types are rejected before reaching any parser
# ---------------------------------------------------------------------------


def test_unknown_extension_classified_as_unsupported():
    """An unknown extension defaults to UNSUPPORTED (allowlist behaviour)."""
    from app.etl_pipeline.file_classifier import FileCategory, classify_file

    assert classify_file("random.xyz") == FileCategory.UNSUPPORTED


@pytest.mark.parametrize(
    "filename",
    [
        "malware.exe",
        "archive.zip",
        "video.mov",
        "font.woff2",
        "model.blend",
        "data.parquet",
        "package.deb",
        "firmware.bin",
    ],
)
def test_unsupported_extensions_classified_correctly(filename):
    """Extensions not in any allowlist are classified as UNSUPPORTED."""
    from app.etl_pipeline.file_classifier import FileCategory, classify_file

    assert classify_file(filename) == FileCategory.UNSUPPORTED


@pytest.mark.parametrize(
    "filename,expected",
    [
        ("report.pdf", "document"),
        ("doc.docx", "document"),
        ("slides.pptx", "document"),
        ("sheet.xlsx", "document"),
        ("photo.png", "document"),
        ("photo.jpg", "document"),
        ("book.epub", "document"),
        ("letter.odt", "document"),
        ("readme.md", "plaintext"),
        ("data.csv", "direct_convert"),
    ],
)
def test_parseable_extensions_classified_correctly(filename, expected):
    """Parseable files are classified into their correct category."""
    from app.etl_pipeline.file_classifier import FileCategory, classify_file

    result = classify_file(filename)
    assert result != FileCategory.UNSUPPORTED
    assert result.value == expected


async def test_extract_unsupported_file_raises_error(tmp_path):
    """EtlPipelineService.extract() raises EtlUnsupportedFileError for .exe files."""
    from app.etl_pipeline.exceptions import EtlUnsupportedFileError

    exe_file = tmp_path / "program.exe"
    exe_file.write_bytes(b"\x00" * 10)

    with pytest.raises(EtlUnsupportedFileError, match="not supported"):
        await EtlPipelineService().extract(
            EtlRequest(file_path=str(exe_file), filename="program.exe")
        )


async def test_extract_zip_raises_unsupported_error(tmp_path):
    """EtlPipelineService.extract() raises EtlUnsupportedFileError for .zip archives."""
    from app.etl_pipeline.exceptions import EtlUnsupportedFileError

    zip_file = tmp_path / "archive.zip"
    zip_file.write_bytes(b"PK\x03\x04")

    with pytest.raises(EtlUnsupportedFileError, match="not supported"):
        await EtlPipelineService().extract(
            EtlRequest(file_path=str(zip_file), filename="archive.zip")
        )


# ---------------------------------------------------------------------------
# Slice 14 - should_skip_for_service (per-parser document filtering)
# ---------------------------------------------------------------------------


@pytest.mark.parametrize(
    "filename,etl_service,expected_skip",
    [
        ("file.eml", "DOCLING", True),
        ("file.eml", "UNSTRUCTURED", False),
        ("file.docm", "LLAMACLOUD", False),
        ("file.docm", "DOCLING", True),
        ("file.txt", "DOCLING", False),
        ("file.csv", "LLAMACLOUD", False),
        ("file.mp3", "UNSTRUCTURED", False),
        ("file.exe", "LLAMACLOUD", True),
        ("file.pdf", "DOCLING", False),
        ("file.webp", "DOCLING", False),
        ("file.webp", "UNSTRUCTURED", True),
        ("file.gif", "LLAMACLOUD", False),
        ("file.gif", "DOCLING", True),
        ("file.heic", "UNSTRUCTURED", False),
        ("file.heic", "DOCLING", True),
        ("file.svg", "LLAMACLOUD", False),
        ("file.svg", "DOCLING", True),
        ("file.p7s", "UNSTRUCTURED", False),
        ("file.p7s", "LLAMACLOUD", True),
    ],
)
def test_should_skip_for_service(filename, etl_service, expected_skip):
    from app.etl_pipeline.file_classifier import should_skip_for_service

    assert should_skip_for_service(filename, etl_service) is expected_skip, (
        f"{filename} with {etl_service}: expected skip={expected_skip}"
    )


# ---------------------------------------------------------------------------
# Slice 14b - ETL pipeline rejects per-parser incompatible documents
# ---------------------------------------------------------------------------


async def test_extract_docm_with_docling_raises_unsupported(tmp_path, mocker):
    """Docling cannot parse .docm -- pipeline should reject before dispatching."""
    from app.etl_pipeline.exceptions import EtlUnsupportedFileError

    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")

    docm_file = tmp_path / "macro.docm"
    docm_file.write_bytes(b"\x00" * 10)

    with pytest.raises(EtlUnsupportedFileError, match="not supported by DOCLING"):
        await EtlPipelineService().extract(
            EtlRequest(file_path=str(docm_file), filename="macro.docm")
        )


async def test_extract_eml_with_docling_raises_unsupported(tmp_path, mocker):
    """Docling cannot parse .eml -- pipeline should reject before dispatching."""
    from app.etl_pipeline.exceptions import EtlUnsupportedFileError

    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")

    eml_file = tmp_path / "mail.eml"
    eml_file.write_bytes(b"From: test@example.com")

    with pytest.raises(EtlUnsupportedFileError, match="not supported by DOCLING"):
        await EtlPipelineService().extract(
            EtlRequest(file_path=str(eml_file), filename="mail.eml")
        )
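Taken together, the happy path this suite describes can be exercised end to end roughly as follows. This is a usage sketch only; the surrounding FastAPI wiring and the example file path are assumptions, while `EtlRequest`, `EtlPipelineService.extract`, and `markdown_content` come from the code under test.

    import asyncio

    from app.etl_pipeline.etl_document import EtlRequest
    from app.etl_pipeline.etl_pipeline_service import EtlPipelineService


    async def index_upload(path: str, filename: str) -> str:
        # Classification happens inside extract(): plaintext and CSV/HTML files
        # are converted directly, audio goes through STT, and documents are
        # routed to the configured ETL_SERVICE (DOCLING / UNSTRUCTURED / LLAMACLOUD).
        result = await EtlPipelineService().extract(
            EtlRequest(file_path=path, filename=filename)
        )
        return result.markdown_content


    if __name__ == "__main__":
        print(asyncio.run(index_upload("/tmp/hello.txt", "hello.txt")))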
surfsense_backend/tests/unit/services/__init__.py (new empty file)
@@ -0,0 +1,70 @@
"""Test that DoclingService does NOT restrict allowed_formats, letting Docling
accept all its supported formats (PDF, DOCX, PPTX, XLSX, IMAGE, etc.)."""

from enum import Enum
from unittest.mock import MagicMock, patch

import pytest

pytestmark = pytest.mark.unit


class _FakeInputFormat(Enum):
    PDF = "pdf"
    IMAGE = "image"
    DOCX = "docx"
    PPTX = "pptx"
    XLSX = "xlsx"


def test_docling_service_does_not_restrict_allowed_formats():
    """DoclingService should NOT pass allowed_formats to DocumentConverter,
    so Docling defaults to accepting every InputFormat it supports."""

    mock_converter_cls = MagicMock()
    mock_backend = MagicMock()

    fake_pipeline_options_cls = MagicMock()
    fake_pipeline_options = MagicMock()
    fake_pipeline_options_cls.return_value = fake_pipeline_options

    fake_pdf_format_option_cls = MagicMock()

    with patch.dict(
        "sys.modules",
        {
            "docling": MagicMock(),
            "docling.backend": MagicMock(),
            "docling.backend.pypdfium2_backend": MagicMock(
                PyPdfiumDocumentBackend=mock_backend
            ),
            "docling.datamodel": MagicMock(),
            "docling.datamodel.base_models": MagicMock(InputFormat=_FakeInputFormat),
            "docling.datamodel.pipeline_options": MagicMock(
                PdfPipelineOptions=fake_pipeline_options_cls
            ),
            "docling.document_converter": MagicMock(
                DocumentConverter=mock_converter_cls,
                PdfFormatOption=fake_pdf_format_option_cls,
            ),
        },
    ):
        from importlib import reload

        import app.services.docling_service as mod

        reload(mod)

        mod.DoclingService()

        call_kwargs = mock_converter_cls.call_args
        assert call_kwargs is not None, "DocumentConverter was never called"

        _, kwargs = call_kwargs
        assert "allowed_formats" not in kwargs, (
            f"allowed_formats should not be passed — let Docling accept all formats. "
            f"Got: {kwargs.get('allowed_formats')}"
        )
        assert _FakeInputFormat.PDF in kwargs.get("format_options", {}), (
            "format_options should still configure PDF pipeline options"
        )
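For reference, the converter construction this test asserts against would look roughly like the sketch below inside DoclingService.__init__. It is consistent with the module paths the test mocks, but it is not the full service; pipeline option values and error handling are omitted.

    from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import PdfPipelineOptions
    from docling.document_converter import DocumentConverter, PdfFormatOption

    # No allowed_formats kwarg: Docling then accepts every InputFormat it knows,
    # while format_options still customises how PDFs are parsed.
    pipeline_options = PdfPipelineOptions()
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options,
                backend=PyPdfiumDocumentBackend,
            ),
        },
    )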
surfsense_backend/tests/unit/utils/__init__.py (new empty file)
surfsense_backend/tests/unit/utils/test_file_extensions.py (new file, 154 lines)
@@ -0,0 +1,154 @@
"""Tests for the DOCUMENT_EXTENSIONS allowlist module."""

import pytest

pytestmark = pytest.mark.unit


def test_pdf_is_supported_document():
    from app.utils.file_extensions import is_supported_document_extension

    assert is_supported_document_extension("report.pdf") is True


def test_exe_is_not_supported_document():
    from app.utils.file_extensions import is_supported_document_extension

    assert is_supported_document_extension("malware.exe") is False


@pytest.mark.parametrize(
    "filename",
    [
        "report.pdf",
        "doc.docx",
        "old.doc",
        "sheet.xlsx",
        "legacy.xls",
        "slides.pptx",
        "deck.ppt",
        "macro.docm",
        "macro.xlsm",
        "macro.pptm",
        "photo.png",
        "photo.jpg",
        "photo.jpeg",
        "scan.bmp",
        "scan.tiff",
        "scan.tif",
        "photo.webp",
        "anim.gif",
        "iphone.heic",
        "manual.rtf",
        "book.epub",
        "letter.odt",
        "data.ods",
        "presentation.odp",
        "inbox.eml",
        "outlook.msg",
        "korean.hwpx",
        "korean.hwp",
        "template.dot",
        "template.dotm",
        "template.pot",
        "template.potx",
        "binary.xlsb",
        "workspace.xlw",
        "vector.svg",
        "signature.p7s",
    ],
)
def test_document_extensions_are_supported(filename):
    from app.utils.file_extensions import is_supported_document_extension

    assert is_supported_document_extension(filename) is True, (
        f"{filename} should be supported"
    )


@pytest.mark.parametrize(
    "filename",
    [
        "malware.exe",
        "archive.zip",
        "video.mov",
        "font.woff2",
        "model.blend",
        "random.xyz",
        "data.parquet",
        "package.deb",
    ],
)
def test_non_document_extensions_are_not_supported(filename):
    from app.utils.file_extensions import is_supported_document_extension

    assert is_supported_document_extension(filename) is False, (
        f"{filename} should NOT be supported"
    )


# ---------------------------------------------------------------------------
# Per-parser extension sets
# ---------------------------------------------------------------------------


def test_union_equals_all_three_sets():
    from app.utils.file_extensions import (
        DOCLING_DOCUMENT_EXTENSIONS,
        DOCUMENT_EXTENSIONS,
        LLAMAPARSE_DOCUMENT_EXTENSIONS,
        UNSTRUCTURED_DOCUMENT_EXTENSIONS,
    )

    expected = (
        DOCLING_DOCUMENT_EXTENSIONS
        | LLAMAPARSE_DOCUMENT_EXTENSIONS
        | UNSTRUCTURED_DOCUMENT_EXTENSIONS
    )
    assert expected == DOCUMENT_EXTENSIONS


def test_get_extensions_for_docling():
    from app.utils.file_extensions import get_document_extensions_for_service

    exts = get_document_extensions_for_service("DOCLING")
    assert ".pdf" in exts
    assert ".webp" in exts
    assert ".docx" in exts
    assert ".eml" not in exts
    assert ".docm" not in exts
    assert ".gif" not in exts
    assert ".heic" not in exts


def test_get_extensions_for_llamacloud():
    from app.utils.file_extensions import get_document_extensions_for_service

    exts = get_document_extensions_for_service("LLAMACLOUD")
    assert ".docm" in exts
    assert ".gif" in exts
    assert ".svg" in exts
    assert ".hwp" in exts
    assert ".eml" not in exts
    assert ".heic" not in exts


def test_get_extensions_for_unstructured():
    from app.utils.file_extensions import get_document_extensions_for_service

    exts = get_document_extensions_for_service("UNSTRUCTURED")
    assert ".eml" in exts
    assert ".heic" in exts
    assert ".p7s" in exts
    assert ".docm" not in exts
    assert ".gif" not in exts
    assert ".svg" not in exts


def test_get_extensions_for_none_returns_union():
    from app.utils.file_extensions import (
        DOCUMENT_EXTENSIONS,
        get_document_extensions_for_service,
    )

    assert get_document_extensions_for_service(None) == DOCUMENT_EXTENSIONS
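A minimal module satisfying these assertions could be structured as below. The literal set contents are placeholders; the real allowlists in app.utils.file_extensions are larger, and only the names and the union/fallback behaviour are taken from the tests above.

    # Illustrative subsets; the real per-parser allowlists are much longer.
    DOCLING_DOCUMENT_EXTENSIONS = {".pdf", ".docx", ".xlsx", ".pptx", ".png", ".jpg", ".webp"}
    LLAMAPARSE_DOCUMENT_EXTENSIONS = {".pdf", ".docx", ".docm", ".gif", ".svg", ".hwp", ".png", ".jpg"}
    UNSTRUCTURED_DOCUMENT_EXTENSIONS = {".pdf", ".docx", ".eml", ".heic", ".p7s", ".png", ".jpg"}

    # The union is the service-agnostic allowlist behind is_supported_document_extension.
    DOCUMENT_EXTENSIONS = (
        DOCLING_DOCUMENT_EXTENSIONS
        | LLAMAPARSE_DOCUMENT_EXTENSIONS
        | UNSTRUCTURED_DOCUMENT_EXTENSIONS
    )

    _PER_SERVICE = {
        "DOCLING": DOCLING_DOCUMENT_EXTENSIONS,
        "LLAMACLOUD": LLAMAPARSE_DOCUMENT_EXTENSIONS,
        "UNSTRUCTURED": UNSTRUCTURED_DOCUMENT_EXTENSIONS,
    }


    def get_document_extensions_for_service(service: str | None) -> set[str]:
        # An unset or unknown service falls back to the union, matching the tests.
        return _PER_SERVICE.get(service or "", DOCUMENT_EXTENSIONS)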
@@ -8,6 +8,7 @@ import { Button } from "@/components/ui/button";
 import { Checkbox } from "@/components/ui/checkbox";
 import { Input } from "@/components/ui/input";
 import { Popover, PopoverContent, PopoverTrigger } from "@/components/ui/popover";
+import { ToggleGroup, ToggleGroupItem } from "@/components/ui/toggle-group";
 import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
 import type { DocumentTypeEnum } from "@/contracts/types/document.types";
 import { getDocumentTypeIcon, getDocumentTypeLabel } from "./DocumentTypeIcon";
@@ -63,25 +64,47 @@ export function DocumentsFilters({
 	return (
 		<div className="flex select-none">
 			<div className="flex items-center gap-2 w-full">
-				{/* Type Filter */}
+				{/* Filter + New Folder Toggle Group */}
+				<ToggleGroup type="multiple" variant="outline" value={[]} className="overflow-visible">
+					{onCreateFolder && (
+						<Tooltip>
+							<TooltipTrigger asChild>
+								<ToggleGroupItem
+									value="folder"
+									className="h-9 w-9 shrink-0 border-sidebar-border text-sidebar-foreground/60 hover:text-sidebar-foreground hover:border-sidebar-border bg-sidebar"
+									onClick={(e) => {
+										e.preventDefault();
+										onCreateFolder();
+									}}
+								>
+									<FolderPlus size={14} />
+								</ToggleGroupItem>
+							</TooltipTrigger>
+							<TooltipContent>New folder</TooltipContent>
+						</Tooltip>
+					)}
+
 				<Popover>
+					<Tooltip>
+						<TooltipTrigger asChild>
 							<PopoverTrigger asChild>
-								<Button
-									variant="outline"
-									size="icon"
-									className="h-9 w-9 shrink-0 border-dashed border-sidebar-border text-sidebar-foreground/60 hover:text-sidebar-foreground hover:border-sidebar-border bg-sidebar"
+								<ToggleGroupItem
+									value="filter"
+									className="relative h-9 w-9 shrink-0 border-sidebar-border text-sidebar-foreground/60 hover:text-sidebar-foreground hover:border-sidebar-border bg-sidebar overflow-visible"
 								>
 									<ListFilter size={14} />
 									{activeTypes.length > 0 && (
-										<span className="absolute -top-1 -right-1 flex h-4 w-4 items-center justify-center rounded-full bg-primary text-[9px] font-medium text-primary-foreground">
+										<span className="absolute -top-1 -right-1 flex h-4 w-4 items-center justify-center rounded-full bg-sidebar-border text-[9px] font-medium text-sidebar-foreground">
 											{activeTypes.length}
 										</span>
 									)}
-								</Button>
+								</ToggleGroupItem>
 							</PopoverTrigger>
-					<PopoverContent className="w-56 md:w-52 !p-0 overflow-hidden" align="end">
+						</TooltipTrigger>
+						<TooltipContent>Filter by type</TooltipContent>
+					</Tooltip>
+					<PopoverContent className="w-56 md:w-52 !p-0 overflow-hidden" align="start">
 						<div>
-							{/* Search input */}
 							<div className="p-2">
 								<div className="relative">
 									<Search className="absolute left-0.5 top-1/2 -translate-y-1/2 h-4 w-4 text-muted-foreground" />
@@ -122,11 +145,9 @@ export function DocumentsFilters({
 										}
 									}}
 								>
-									{/* Icon */}
 									<div className="flex h-7 w-7 shrink-0 items-center justify-center rounded-md bg-muted/50 text-foreground/80">
 										{getDocumentTypeIcon(value, "h-4 w-4")}
 									</div>
-									{/* Text content */}
 									<div className="flex flex-col min-w-0 flex-1 gap-0.5">
 										<span className="text-[13px] font-medium text-foreground truncate leading-tight">
 											{getDocumentTypeLabel(value)}
@@ -136,7 +157,6 @@ export function DocumentsFilters({
 											{(typeCounts.get(value) ?? 0) !== 1 ? "s" : ""}
 										</span>
 									</div>
-									{/* Checkbox */}
 									<Checkbox
 										id={`${id}-${i}`}
 										checked={activeTypes.includes(value)}
@@ -147,25 +167,10 @@ export function DocumentsFilters({
 								))
 							)}
 						</div>
-						{activeTypes.length > 0 && (
-							<div className="px-3 pt-1.5 pb-1.5 border-t border-border dark:border-neutral-700">
-								<Button
-									variant="ghost"
-									size="sm"
-									className="w-full h-7 text-[11px] text-muted-foreground hover:text-foreground hover:bg-neutral-200 dark:hover:bg-neutral-700"
-									onClick={() => {
-										activeTypes.forEach((t) => {
-											onToggleType(t, false);
-										});
-									}}
-								>
-									Clear filters
-								</Button>
-							</div>
-						)}
 					</div>
 				</PopoverContent>
 			</Popover>
+			</ToggleGroup>

 			{/* Search Input */}
 			<div className="relative flex-1 min-w-0">
@@ -197,23 +202,6 @@ export function DocumentsFilters({
 					)}
 				</div>

-				{/* New Folder Button */}
-				{onCreateFolder && (
-					<Tooltip>
-						<TooltipTrigger asChild>
-							<Button
-								variant="outline"
-								size="icon"
-								className="h-9 w-9 shrink-0 border-dashed border-sidebar-border text-sidebar-foreground/60 hover:text-sidebar-foreground hover:border-sidebar-border bg-sidebar"
-								onClick={onCreateFolder}
-							>
-								<FolderPlus size={14} />
-							</Button>
-						</TooltipTrigger>
-						<TooltipContent>New folder</TooltipContent>
-					</Tooltip>
-				)}
-
 				{/* Upload Button */}
 				<Button
 					data-joyride="upload-button"
@@ -9,7 +9,6 @@ import {
 } from "@/components/desktop/shortcut-recorder";
 import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card";
 import { Label } from "@/components/ui/label";
-import { Switch } from "@/components/ui/switch";
 import { Spinner } from "@/components/ui/spinner";
 import { useElectronAPI } from "@/hooks/use-platform";
@@ -147,11 +146,7 @@ export function DesktopContent() {
 							Show suggestions while typing in other applications.
 						</p>
 					</div>
-					<Switch
-						id="autocomplete-toggle"
-						checked={enabled}
-						onCheckedChange={handleToggle}
-					/>
+					<Switch id="autocomplete-toggle" checked={enabled} onCheckedChange={handleToggle} />
 				</div>
 			</CardContent>
 		</Card>
@@ -1,7 +1,7 @@
 "use client";

-import { useEffect, useState } from "react";
 import { useRouter } from "next/navigation";
+import { useEffect, useState } from "react";
 import { Logo } from "@/components/Logo";
 import { Button } from "@/components/ui/button";
 import { Spinner } from "@/components/ui/spinner";
@@ -18,7 +18,8 @@ const STEPS = [
 	{
 		id: "screen-recording",
 		title: "Screen Recording",
-		description: "Lets SurfSense capture your screen to understand context and provide smart writing suggestions.",
+		description:
+			"Lets SurfSense capture your screen to understand context and provide smart writing suggestions.",
 		action: "requestScreenRecording",
 		field: "screenRecording" as const,
 	},
@@ -98,7 +99,8 @@ export default function DesktopPermissionsPage() {
 		);
 	}

-	const allGranted = permissions.accessibility === "authorized" && permissions.screenRecording === "authorized";
+	const allGranted =
+		permissions.accessibility === "authorized" && permissions.screenRecording === "authorized";

 	const handleRequest = async (action: string) => {
 		if (action === "requestScreenRecording") {
@@ -175,7 +177,8 @@ export default function DesktopPermissionsPage() {
 							</p>
 						)}
 						<p className="text-xs text-muted-foreground">
-							If SurfSense doesn't appear in the list, click <strong>+</strong> and select it from Applications.
+							If SurfSense doesn't appear in the list, click <strong>+</strong> and
+							select it from Applications.
 						</p>
 					</div>
 				)}
@@ -4,10 +4,6 @@ export const metadata = {
 	title: "SurfSense Suggestion",
 };

-export default function SuggestionLayout({
-	children,
-}: {
-	children: React.ReactNode;
-}) {
+export default function SuggestionLayout({ children }: { children: React.ReactNode }) {
 	return <div className="suggestion-body">{children}</div>;
 }
@@ -103,13 +103,10 @@ export default function SuggestionPage() {
 				return;
 			}

-			const backendUrl =
-				process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL || "http://localhost:8000";
+			const backendUrl = process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL || "http://localhost:8000";

 			try {
-				const response = await fetch(
-					`${backendUrl}/api/v1/autocomplete/vision/stream`,
-					{
+				const response = await fetch(`${backendUrl}/api/v1/autocomplete/vision/stream`, {
 					method: "POST",
 					headers: {
 						Authorization: `Bearer ${token}`,
@@ -122,8 +119,7 @@ export default function SuggestionPage() {
 						window_title: windowTitle || "",
 					}),
 					signal: controller.signal,
-					},
-				);
+				});

 				if (!response.ok) {
 					setError(friendlyError(response.status));
@@ -174,9 +170,7 @@ export default function SuggestionPage() {
 								return [...prev, { id, title, status, items }];
 							});
 						}
-					} catch {
-						continue;
-					}
+					} catch {}
 				}
 			}
 		}
@@ -187,7 +181,7 @@ export default function SuggestionPage() {
 				setIsLoading(false);
 			}
 		},
-		[],
+		[]
 	);

 	useEffect(() => {
@@ -269,10 +263,18 @@ export default function SuggestionPage() {
 		<div className="suggestion-tooltip">
 			<p className="suggestion-text">{suggestion}</p>
 			<div className="suggestion-actions">
-				<button className="suggestion-btn suggestion-btn-accept" onClick={handleAccept}>
+				<button
+					type="button"
+					className="suggestion-btn suggestion-btn-accept"
+					onClick={handleAccept}
+				>
 					Accept
 				</button>
-				<button className="suggestion-btn suggestion-btn-dismiss" onClick={handleDismiss}>
+				<button
+					type="button"
+					className="suggestion-btn suggestion-btn-dismiss"
+					onClick={handleDismiss}
+				>
 					Dismiss
 				</button>
 			</div>
 		</div>
@@ -83,7 +83,9 @@ body:has(.suggestion-body) {
 	font-weight: 500;
 	cursor: pointer;
 	line-height: 16px;
-	transition: background 0.15s, border-color 0.15s;
+	transition:
+		background 0.15s,
+		border-color 0.15s;
 }

 .suggestion-btn-accept {
@@ -216,7 +216,7 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
 				onPointerDownOutside={(e) => {
 					if (pickerOpen) e.preventDefault();
 				}}
-				className="max-w-3xl w-[95vw] sm:w-full h-[75vh] sm:h-[85vh] flex flex-col p-0 gap-0 overflow-hidden border border-border ring-0 dark:ring-0 bg-muted dark:bg-muted text-foreground [&>button]:right-4 sm:[&>button]:right-12 [&>button]:top-6 sm:[&>button]:top-10 [&>button]:opacity-80 hover:[&>button]:opacity-100 [&>button_svg]:size-5 select-none"
+				className="max-w-3xl w-[95vw] sm:w-full h-[75vh] sm:h-[85vh] flex flex-col p-0 gap-0 overflow-hidden border border-border ring-0 dark:ring-0 bg-muted dark:bg-muted text-foreground [&>button]:right-4 sm:[&>button]:right-12 [&>button]:top-6 sm:[&>button]:top-10 [&>button]:opacity-80 [&>button]:hover:opacity-100 [&>button]:hover:bg-foreground/10 [&>button>svg]:size-5 select-none"
 			>
 				<DialogTitle className="sr-only">Manage Connectors</DialogTitle>
 				{/* YouTube Crawler View - shown when adding YouTube videos */}
@ -144,18 +144,14 @@ export const ConnectorConnectView: FC<ConnectorConnectViewProps> = ({
|
||||||
type="button"
|
type="button"
|
||||||
onClick={handleFormSubmit}
|
onClick={handleFormSubmit}
|
||||||
disabled={isSubmitting}
|
disabled={isSubmitting}
|
||||||
className="text-xs sm:text-sm min-w-[140px] disabled:opacity-50 disabled:cursor-not-allowed disabled:pointer-events-none"
|
className="relative text-xs sm:text-sm min-w-[140px] disabled:opacity-50 disabled:cursor-not-allowed disabled:pointer-events-none"
|
||||||
>
|
>
|
||||||
{isSubmitting ? (
|
<span className={isSubmitting ? "opacity-0" : ""}>
|
||||||
<>
|
{connectorType === "MCP_CONNECTOR"
|
||||||
<Spinner size="sm" className="mr-2" />
|
? "Connect"
|
||||||
Connecting
|
: `Connect ${getConnectorTypeDisplay(connectorType)}`}
|
||||||
</>
|
</span>
|
||||||
) : connectorType === "MCP_CONNECTOR" ? (
|
{isSubmitting && <Spinner size="sm" className="absolute" />}
|
||||||
"Connect"
|
|
||||||
) : (
|
|
||||||
`Connect ${getConnectorTypeDisplay(connectorType)}`
|
|
||||||
)}
|
|
||||||
</Button>
|
</Button>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
|
||||||
|
|
@ -369,16 +369,10 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
|
||||||
size="sm"
|
size="sm"
|
||||||
onClick={handleDisconnectConfirm}
|
onClick={handleDisconnectConfirm}
|
||||||
disabled={isDisconnecting}
|
disabled={isDisconnecting}
|
||||||
className="text-xs sm:text-sm flex-1 sm:flex-initial h-10 sm:h-auto py-2 sm:py-2"
|
className="relative text-xs sm:text-sm flex-1 sm:flex-initial h-10 sm:h-auto py-2 sm:py-2"
|
||||||
>
|
>
|
||||||
{isDisconnecting ? (
|
<span className={isDisconnecting ? "opacity-0" : ""}>Confirm Disconnect</span>
|
||||||
<>
|
{isDisconnecting && <Spinner size="sm" className="absolute" />}
|
||||||
<Spinner size="sm" className="mr-2" />
|
|
||||||
Disconnecting
|
|
||||||
</>
|
|
||||||
) : (
|
|
||||||
"Confirm Disconnect"
|
|
||||||
)}
|
|
||||||
</Button>
|
</Button>
|
||||||
<Button
|
<Button
|
||||||
variant="ghost"
|
variant="ghost"
|
||||||
|
|
@ -415,16 +409,10 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
|
||||||
<Button
|
<Button
|
||||||
onClick={onSave}
|
onClick={onSave}
|
||||||
disabled={isSaving || isDisconnecting}
|
disabled={isSaving || isDisconnecting}
|
||||||
className="text-xs sm:text-sm flex-1 sm:flex-initial h-12 sm:h-auto py-3 sm:py-2"
|
className="relative text-xs sm:text-sm flex-1 sm:flex-initial h-12 sm:h-auto py-3 sm:py-2"
|
||||||
>
|
>
|
||||||
{isSaving ? (
|
<span className={isSaving ? "opacity-0" : ""}>Save Changes</span>
|
||||||
<>
|
{isSaving && <Spinner size="sm" className="absolute" />}
|
||||||
<Spinner size="sm" className="mr-2" />
|
|
||||||
Saving
|
|
||||||
</>
|
|
||||||
) : (
|
|
||||||
"Save Changes"
|
|
||||||
)}
|
|
||||||
</Button>
|
</Button>
|
||||||
)}
|
)}
|
||||||
</div>
|
</div>
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
"use client";
|
"use client";
|
||||||
|
|
||||||
import { Cable } from "lucide-react";
|
import { Search, Unplug } from "lucide-react";
|
||||||
import type { FC } from "react";
|
import type { FC } from "react";
|
||||||
import { getDocumentTypeLabel } from "@/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentTypeIcon";
|
import { getDocumentTypeLabel } from "@/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentTypeIcon";
|
||||||
import { Button } from "@/components/ui/button";
|
import { Button } from "@/components/ui/button";
|
||||||
|
|
@ -134,9 +134,17 @@ export const ActiveConnectorsTab: FC<ActiveConnectorsTabProps> = ({
|
||||||
const hasActiveConnectors =
|
const hasActiveConnectors =
|
||||||
filteredOAuthConnectorTypes.length > 0 || filteredNonOAuthConnectors.length > 0;
|
filteredOAuthConnectorTypes.length > 0 || filteredNonOAuthConnectors.length > 0;
|
||||||
|
|
||||||
|
const hasFilteredResults = hasActiveConnectors || standaloneDocuments.length > 0;
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<TabsContent value="active" className="m-0">
|
<TabsContent value="active" className="m-0">
|
||||||
{hasSources ? (
|
{hasSources && !hasFilteredResults && searchQuery ? (
|
||||||
|
<div className="flex flex-col items-center justify-center py-20 text-center">
|
||||||
|
<Search className="size-8 text-muted-foreground mb-3" />
|
||||||
|
<p className="text-sm text-muted-foreground">No connectors found</p>
|
||||||
|
<p className="text-xs text-muted-foreground/60 mt-1">Try a different search term</p>
|
||||||
|
</div>
|
||||||
|
) : hasSources ? (
|
||||||
<div className="space-y-6">
|
<div className="space-y-6">
|
||||||
{/* Active Connectors Section */}
|
{/* Active Connectors Section */}
|
||||||
{hasActiveConnectors && (
|
{hasActiveConnectors && (
|
||||||
|
|
@ -302,7 +310,7 @@ export const ActiveConnectorsTab: FC<ActiveConnectorsTabProps> = ({
|
||||||
) : (
|
) : (
|
||||||
<div className="flex flex-col items-center justify-center py-20 text-center">
|
<div className="flex flex-col items-center justify-center py-20 text-center">
|
||||||
<div className="flex h-16 w-16 items-center justify-center rounded-full bg-muted mb-4">
|
<div className="flex h-16 w-16 items-center justify-center rounded-full bg-muted mb-4">
|
||||||
<Cable className="size-8 text-muted-foreground" />
|
<Unplug className="size-8 text-muted-foreground" />
|
||||||
</div>
|
</div>
|
||||||
<h4 className="text-lg font-semibold">No active sources</h4>
|
<h4 className="text-lg font-semibold">No active sources</h4>
|
||||||
<p className="text-sm text-muted-foreground mt-1 max-w-[280px]">
|
<p className="text-sm text-muted-foreground mt-1 max-w-[280px]">
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
"use client";
|
"use client";
|
||||||
|
|
||||||
|
import { Search } from "lucide-react";
|
||||||
import type { FC } from "react";
|
import type { FC } from "react";
|
||||||
import { EnumConnectorName } from "@/contracts/enums/connector";
|
import { EnumConnectorName } from "@/contracts/enums/connector";
|
||||||
import type { SearchSourceConnector } from "@/contracts/types/connector.types";
|
import type { SearchSourceConnector } from "@/contracts/types/connector.types";
|
||||||
|
|
@ -287,6 +288,18 @@ export const AllConnectorsTab: FC<AllConnectorsTabProps> = ({
|
||||||
moreIntegrationsOther.length > 0 ||
|
moreIntegrationsOther.length > 0 ||
|
||||||
moreIntegrationsCrawlers.length > 0;
|
moreIntegrationsCrawlers.length > 0;
|
||||||
|
|
||||||
|
const hasAnyResults = hasDocumentFileConnectors || hasMoreIntegrations;
|
||||||
|
|
||||||
|
if (!hasAnyResults && searchQuery) {
|
||||||
|
return (
|
||||||
|
<div className="flex flex-col items-center justify-center py-20 text-center">
|
||||||
|
<Search className="size-8 text-muted-foreground mb-3" />
|
||||||
|
<p className="text-sm text-muted-foreground">No connectors found</p>
|
||||||
|
<p className="text-xs text-muted-foreground/60 mt-1">Try a different search term</p>
|
||||||
|
</div>
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<div className="space-y-8">
|
<div className="space-y-8">
|
||||||
{/* Document/Files Connectors */}
|
{/* Document/Files Connectors */}
|
||||||
|
|
|
||||||
|
|
@ -173,9 +173,7 @@ export const ConnectorAccountsListView: FC<ConnectorAccountsListViewProps> = ({
|
||||||
<Plus className="size-3 text-primary" />
|
<Plus className="size-3 text-primary" />
|
||||||
)}
|
)}
|
||||||
</div>
|
</div>
|
||||||
<span className="text-xs sm:text-sm font-medium">
|
<span className="text-xs sm:text-sm font-medium">{buttonText}</span>
|
||||||
{isConnecting ? "Connecting" : buttonText}
|
|
||||||
</span>
|
|
||||||
</button>
|
</button>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
|
||||||
|
|
@ -335,16 +335,10 @@ export const YouTubeCrawlerView: FC<YouTubeCrawlerViewProps> = ({ searchSpaceId,
|
||||||
<Button
|
<Button
|
||||||
onClick={handleSubmit}
|
onClick={handleSubmit}
|
||||||
disabled={isSubmitting || isFetchingPlaylist || videoTags.length === 0}
|
disabled={isSubmitting || isFetchingPlaylist || videoTags.length === 0}
|
||||||
className="text-xs sm:text-sm min-w-[140px] disabled:opacity-50 disabled:cursor-not-allowed disabled:pointer-events-none"
|
className="relative text-xs sm:text-sm min-w-[140px] disabled:opacity-50 disabled:cursor-not-allowed disabled:pointer-events-none"
|
||||||
>
|
>
|
||||||
{isSubmitting ? (
|
<span className={isSubmitting ? "opacity-0" : ""}>{t("submit")}</span>
|
||||||
<>
|
{isSubmitting && <Spinner size="sm" className="absolute" />}
|
||||||
<Spinner size="sm" className="mr-2" />
|
|
||||||
{t("processing")}
|
|
||||||
</>
|
|
||||||
) : (
|
|
||||||
t("submit")
|
|
||||||
)}
|
|
||||||
</Button>
|
</Button>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
|
||||||
|
|
@@ -125,18 +125,16 @@ const DocumentUploadPopupContent: FC<{
 onPointerDownOutside={(e) => e.preventDefault()}
 onInteractOutside={(e) => e.preventDefault()}
 onEscapeKeyDown={(e) => e.preventDefault()}
-className="select-none max-w-2xl w-[95vw] sm:w-[640px] h-[min(440px,75dvh)] sm:h-[min(500px,80vh)] flex flex-col p-0 gap-0 overflow-hidden border border-border ring-0 bg-muted dark:bg-muted text-foreground [&>button]:right-3 sm:[&>button]:right-6 [&>button]:top-3 sm:[&>button]:top-5 [&>button]:opacity-80 hover:[&>button]:opacity-100 [&>button]:z-[100] [&>button_svg]:size-4 sm:[&>button_svg]:size-5"
+className="select-none max-w-2xl w-[95vw] sm:w-[640px] h-[min(440px,75dvh)] sm:h-[min(520px,80vh)] flex flex-col p-0 gap-0 overflow-hidden border border-border ring-0 bg-muted dark:bg-muted text-foreground [&>button]:right-3 sm:[&>button]:right-6 [&>button]:top-5 sm:[&>button]:top-8 [&>button]:opacity-80 [&>button]:hover:opacity-100 [&>button]:hover:bg-foreground/10 [&>button]:z-[100] [&>button>svg]:size-4 sm:[&>button>svg]:size-5"
 >
 <DialogTitle className="sr-only">Upload Document</DialogTitle>

 <div className="flex-1 min-h-0 overflow-y-auto overscroll-contain">
-<div className="sticky top-0 z-20 bg-muted px-4 sm:px-6 pt-4 sm:pt-5 pb-10">
+<div className="sticky top-0 z-20 bg-muted px-4 sm:px-6 pt-6 sm:pt-8 pb-10">
 <div className="flex items-center gap-2 mb-1 pr-8 sm:pr-0">
-<h2 className="text-base sm:text-lg font-semibold tracking-tight">
-Upload Documents
-</h2>
+<h2 className="text-xl sm:text-3xl font-semibold tracking-tight">Upload Documents</h2>
 </div>
-<p className="text-xs sm:text-sm text-muted-foreground line-clamp-1">
+<p className="text-xs sm:text-base text-muted-foreground/80 line-clamp-1">
 Upload and sync your documents to your search space
 </p>
 </div>

@@ -3,10 +3,10 @@
 import type { ImageMessagePartComponent } from "@assistant-ui/react";
 import { cva, type VariantProps } from "class-variance-authority";
 import { ImageIcon, ImageOffIcon } from "lucide-react";
+import NextImage from "next/image";
 import { memo, type PropsWithChildren, useEffect, useRef, useState } from "react";
 import { createPortal } from "react-dom";
 import { cn } from "@/lib/utils";
-import NextImage from 'next/image';

 const imageVariants = cva("aui-image-root relative overflow-hidden rounded-lg", {
 variants: {

@@ -241,9 +241,7 @@ const ThreadListItemComponent = memo(function ThreadListItemComponent({
 <MessageSquareIcon className="size-4 shrink-0 text-muted-foreground" />
 <div className="flex-1 min-w-0">
 <p className="truncate text-sm font-medium">{thread.title || "New Chat"}</p>
-<p className="truncate text-xs text-muted-foreground">
-{relativeTime}
-</p>
+<p className="truncate text-xs text-muted-foreground">{relativeTime}</p>
 </div>
 <DropdownMenu>
 <DropdownMenuTrigger asChild>

@@ -26,7 +26,8 @@ export const ToolFallback: ToolCallMessagePartComponent = ({
 );

 const serializedResult = useMemo(
-() => (result !== undefined && typeof result !== "string" ? JSON.stringify(result, null, 2) : null),
+() =>
+result !== undefined && typeof result !== "string" ? JSON.stringify(result, null, 2) : null,
 [result]
 );

@@ -1,6 +1,6 @@
 "use client";

-import { ArrowUp, Send, X } from "lucide-react";
+import { ArrowUp } from "lucide-react";
 import { useCallback, useEffect, useRef, useState } from "react";
 import { Button } from "@/components/ui/button";
 import { Popover, PopoverAnchor, PopoverContent } from "@/components/ui/popover";

@@ -307,7 +307,6 @@ export function CommentComposer({
 onClick={onCancel}
 disabled={isSubmitting}
 >
-<X className="mr-1 size-4" />
 Cancel
 </Button>
 )}

@@ -318,14 +317,7 @@ export function CommentComposer({
 disabled={!canSubmit}
 className={cn(!canSubmit && "opacity-50", compact && "size-8 shrink-0 rounded-full")}
 >
-{compact ? (
-<ArrowUp className="size-4" />
-) : (
-<>
-<Send className="mr-1 size-4" />
-{submitLabel}
-</>
-)}
+{compact ? <ArrowUp className="size-4" /> : submitLabel}
 </Button>
 </div>
 </div>

@@ -1,6 +1,6 @@
 "use client";

-import { MoreHorizontal, Pencil, Trash2 } from "lucide-react";
+import { MoreHorizontal, PenLine, Trash2 } from "lucide-react";
 import { Button } from "@/components/ui/button";
 import {
 DropdownMenu,

@@ -21,15 +21,15 @@ export function CommentActions({ canEdit, canDelete, onEdit, onDelete }: Comment
 <Button
 variant="ghost"
 size="icon"
-className="size-7 opacity-100 md:opacity-0 md:group-hover:opacity-100 transition-opacity"
+className="size-7 text-muted-foreground opacity-100 md:opacity-0 md:group-hover:opacity-100 transition-opacity"
 >
-<MoreHorizontal className="size-4 text-muted-foreground" />
+<MoreHorizontal className="size-4" />
 </Button>
 </DropdownMenuTrigger>
 <DropdownMenuContent align="end">
 {canEdit && (
 <DropdownMenuItem onClick={onEdit}>
-<Pencil className="mr-2 size-4" />
+<PenLine className="mr-2 size-4" />
 Edit
 </DropdownMenuItem>
 )}

@@ -198,7 +198,7 @@ export function CommentItem({
 <CommentComposer
 members={members}
 membersLoading={membersLoading}
-placeholder="Edit your comment..."
+placeholder="Edit your comment"
 submitLabel="Save"
 isSubmitting={isSubmitting}
 onSubmit={handleEditSubmit}

@@ -106,7 +106,9 @@ export const DocumentNode = React.memo(function DocumentNode({
 const isProcessing = statusState === "pending" || statusState === "processing";
 const [dropdownOpen, setDropdownOpen] = useState(false);
 const [exporting, setExporting] = useState<string | null>(null);
+const [titleTooltipOpen, setTitleTooltipOpen] = useState(false);
 const rowRef = useRef<HTMLDivElement>(null);
+const titleRef = useRef<HTMLSpanElement>(null);

 const handleExport = useCallback(
 (format: string) => {

@@ -118,6 +120,14 @@ export const DocumentNode = React.memo(function DocumentNode({
 [doc, onExport]
 );

+const handleTitleTooltipOpenChange = useCallback((open: boolean) => {
+if (open && titleRef.current) {
+setTitleTooltipOpen(titleRef.current.scrollWidth > titleRef.current.clientWidth);
+} else {
+setTitleTooltipOpen(false);
+}
+}, []);
+
 const attachRef = useCallback(
 (node: HTMLDivElement | null) => {
 (rowRef as React.MutableRefObject<HTMLDivElement | null>).current = node;

@@ -197,7 +207,20 @@ export const DocumentNode = React.memo(function DocumentNode({
 );
 })()}

-<span className="flex-1 min-w-0 truncate">{doc.title}</span>
+<Tooltip
+delayDuration={600}
+open={titleTooltipOpen}
+onOpenChange={handleTitleTooltipOpenChange}
+>
+<TooltipTrigger asChild>
+<span ref={titleRef} className="flex-1 min-w-0 truncate">
+{doc.title}
+</span>
+</TooltipTrigger>
+<TooltipContent side="bottom" className="max-w-xs break-words">
+{doc.title}
+</TooltipContent>
+</Tooltip>

 {getDocumentTypeIcon(
 doc.document_type as DocumentTypeEnum,

@@ -259,11 +282,7 @@ export const DocumentNode = React.memo(function DocumentNode({
 Versions
 </DropdownMenuItem>
 )}
-<DropdownMenuItem
-className="text-destructive focus:text-destructive"
-disabled={isProcessing}
-onClick={() => onDelete(doc)}
->
+<DropdownMenuItem disabled={isProcessing} onClick={() => onDelete(doc)}>
 <Trash2 className="mr-2 h-4 w-4" />
 Delete
 </DropdownMenuItem>

@@ -305,11 +324,7 @@ export const DocumentNode = React.memo(function DocumentNode({
 Versions
 </ContextMenuItem>
 )}
-<ContextMenuItem
-className="text-destructive focus:text-destructive"
-disabled={isProcessing}
-onClick={() => onDelete(doc)}
->
+<ContextMenuItem disabled={isProcessing} onClick={() => onDelete(doc)}>
 <Trash2 className="mr-2 h-4 w-4" />
 Delete
 </ContextMenuItem>

@@ -56,7 +56,6 @@ interface FolderNodeProps {
 depth: number;
 isExpanded: boolean;
 isRenaming: boolean;
-childCount: number;
 selectionState: FolderSelectionState;
 processingState: "idle" | "processing" | "failed";
 onToggleSelect: (folderId: number, selectAll: boolean) => void;

@@ -101,7 +100,6 @@ export const FolderNode = React.memo(function FolderNode({
 depth,
 isExpanded,
 isRenaming,
-childCount,
 selectionState,
 processingState,
 onToggleSelect,

@@ -336,12 +334,6 @@ export const FolderNode = React.memo(function FolderNode({
 <span className="flex-1 min-w-0 truncate">{folder.name}</span>
 )}

-{!isRenaming && childCount > 0 && (
-<span className="shrink-0 text-[10px] text-muted-foreground tabular-nums">
-{childCount}
-</span>
-)}
-
 {!isRenaming && (
 <DropdownMenu>
 <DropdownMenuTrigger asChild>

@@ -86,16 +86,6 @@ export function FolderTreeView({

 const docsByFolder = useMemo(() => groupBy(documents, (d) => d.folderId ?? "root"), [documents]);

-const folderChildCounts = useMemo(() => {
-const counts: Record<number, number> = {};
-for (const f of folders) {
-const children = foldersByParent[f.id] ?? [];
-const docs = docsByFolder[f.id] ?? [];
-counts[f.id] = children.length + docs.length;
-}
-return counts;
-}, [folders, foldersByParent, docsByFolder]);
-
 const [openContextMenuId, setOpenContextMenuId] = useState<string | null>(null);

 // Single subscription for rename state — derived boolean passed to each FolderNode

@@ -106,14 +96,26 @@ export function FolderTreeView({
 );
 const handleCancelRename = useCallback(() => setRenamingFolderId(null), [setRenamingFolderId]);

+const effectiveActiveTypes = useMemo(() => {
+if (
+activeTypes.includes("FILE" as DocumentTypeEnum) &&
+!activeTypes.includes("LOCAL_FOLDER_FILE" as DocumentTypeEnum)
+) {
+return [...activeTypes, "LOCAL_FOLDER_FILE" as DocumentTypeEnum];
+}
+return activeTypes;
+}, [activeTypes]);
+
 const hasDescendantMatch = useMemo(() => {
-if (activeTypes.length === 0 && !searchQuery) return null;
+if (effectiveActiveTypes.length === 0 && !searchQuery) return null;
 const match: Record<number, boolean> = {};

 function check(folderId: number): boolean {
 if (match[folderId] !== undefined) return match[folderId];
 const childDocs = (docsByFolder[folderId] ?? []).some(
-(d) => activeTypes.length === 0 || activeTypes.includes(d.document_type as DocumentTypeEnum)
+(d) =>
+effectiveActiveTypes.length === 0 ||
+effectiveActiveTypes.includes(d.document_type as DocumentTypeEnum)
 );
 if (childDocs) {
 match[folderId] = true;

@@ -134,7 +136,7 @@ export function FolderTreeView({
 check(f.id);
 }
 return match;
-}, [folders, docsByFolder, foldersByParent, activeTypes, searchQuery]);
+}, [folders, docsByFolder, foldersByParent, effectiveActiveTypes, searchQuery]);

 const folderSelectionStates = useMemo(() => {
 const states: Record<number, FolderSelectionState> = {};

@@ -204,7 +206,9 @@ export function FolderTreeView({
 ? childFolders.filter((f) => hasDescendantMatch[f.id])
 : childFolders;
 const childDocs = (docsByFolder[key] ?? []).filter(
-(d) => activeTypes.length === 0 || activeTypes.includes(d.document_type as DocumentTypeEnum)
+(d) =>
+effectiveActiveTypes.length === 0 ||
+effectiveActiveTypes.includes(d.document_type as DocumentTypeEnum)
 );

 const nodes: React.ReactNode[] = [];

@@ -226,7 +230,6 @@ export function FolderTreeView({
 depth={depth}
 isExpanded={isExpanded}
 isRenaming={renamingFolderId === f.id}
-childCount={folderChildCounts[f.id] ?? 0}
 selectionState={folderSelectionStates[f.id] ?? "none"}
 processingState={folderProcessingStates[f.id] ?? "idle"}
 onToggleSelect={onToggleFolderSelect}

@@ -289,7 +292,7 @@ export function FolderTreeView({
 );
 }

-if (treeNodes.length === 0 && (activeTypes.length > 0 || searchQuery)) {
+if (treeNodes.length === 0 && (effectiveActiveTypes.length > 0 || searchQuery)) {
 return (
 <div className="flex flex-1 flex-col items-center justify-center gap-3 px-4 py-12 text-muted-foreground">
 <Search className="h-10 w-10" />

@@ -11,13 +11,12 @@ import { MarkdownViewer } from "@/components/markdown-viewer";
 import { Alert, AlertDescription } from "@/components/ui/alert";
 import { Button } from "@/components/ui/button";
 import { Drawer, DrawerContent, DrawerHandle, DrawerTitle } from "@/components/ui/drawer";
-import { Skeleton } from "@/components/ui/skeleton";
 import { useMediaQuery } from "@/hooks/use-media-query";
 import { authenticatedFetch, getBearerToken, redirectToLogin } from "@/lib/auth-utils";

 const PlateEditor = dynamic(
 () => import("@/components/editor/plate-editor").then((m) => ({ default: m.PlateEditor })),
-{ ssr: false, loading: () => <Skeleton className="h-64 w-full" /> }
+{ ssr: false, loading: () => <EditorPanelSkeleton /> }
 );

 const LARGE_DOCUMENT_THRESHOLD = 2 * 1024 * 1024; // 2MB

@@ -158,17 +158,18 @@ export function PlateEditor({
 // When not forced read-only, the user can toggle between editing/viewing.
 const canToggleMode = !readOnly;

-const contextProviderValue = useMemo(()=> ({
+const contextProviderValue = useMemo(
+() => ({
 onSave,
 hasUnsavedChanges,
 isSaving,
 canToggleMode,
-}), [onSave, hasUnsavedChanges, isSaving, canToggleMode]);
+}),
+[onSave, hasUnsavedChanges, isSaving, canToggleMode]
+);

 return (
-<EditorSaveContext.Provider
-value={contextProviderValue}
->
+<EditorSaveContext.Provider value={contextProviderValue}>
 <Plate
 editor={editor}
 // Only pass readOnly as a controlled prop when forced (permanently read-only).

@@ -1,7 +1,7 @@
 "use client";
-import Image from 'next/image';

 import { AnimatePresence, motion } from "motion/react";
+import Image from "next/image";
 import { ExpandedGifOverlay, useExpandedGif } from "@/components/ui/expanded-gif-overlay";

 const useCases = [

@@ -88,7 +88,7 @@ function UseCaseCard({
 alt={title}
 fill
 className="rounded-xl object-cover transition-transform duration-500 group-hover:scale-[1.02]"
-unoptimized={src.endsWith('.gif')}
+unoptimized={src.endsWith(".gif")}
 />
 </div>
 </div>

@@ -347,7 +347,9 @@ export function LayoutDataProvider({ searchSpaceId, children }: LayoutDataProvid

 // Navigation items
 const navItems: NavItem[] = useMemo(
-() => [
+() =>
+(
+[
 {
 title: "Inbox",
 url: "#inbox",

@@ -355,27 +357,28 @@ export function LayoutDataProvider({ searchSpaceId, children }: LayoutDataProvid
 isActive: isInboxSidebarOpen,
 badge: totalUnreadCount > 0 ? formatInboxCount(totalUnreadCount) : undefined,
 },
-{
+isMobile
+? {
 title: "Documents",
 url: "#documents",
 icon: SquareLibrary,
-isActive: isMobile
-? isDocumentsSidebarOpen
-: isDocumentsSidebarOpen && !isRightPanelCollapsed,
-},
+isActive: isDocumentsSidebarOpen,
+}
+: null,
 {
 title: "Announcements",
 url: "#announcements",
 icon: Megaphone,
 isActive: isAnnouncementsSidebarOpen,
-badge: announcementUnreadCount > 0 ? formatInboxCount(announcementUnreadCount) : undefined,
+badge:
+announcementUnreadCount > 0 ? formatInboxCount(announcementUnreadCount) : undefined,
 },
-],
+] as (NavItem | null)[]
+).filter((item): item is NavItem => item !== null),
 [
 isMobile,
 isInboxSidebarOpen,
 isDocumentsSidebarOpen,
-isRightPanelCollapsed,
 totalUnreadCount,
 isAnnouncementsSidebarOpen,
 announcementUnreadCount,

@@ -82,7 +82,7 @@ export function CreateSearchSpaceDialog({ open, onOpenChange }: CreateSearchSpac

 return (
 <Dialog open={open} onOpenChange={handleOpenChange}>
-<DialogContent className="max-w-[90vw] sm:max-w-sm p-4 sm:p-5 data-[state=open]:animate-none data-[state=closed]:animate-none">
+<DialogContent className="max-w-[90vw] sm:max-w-sm p-4 sm:p-5 select-none data-[state=open]:animate-none data-[state=closed]:animate-none">
 <DialogHeader className="space-y-2 pb-2">
 <div className="flex items-center gap-2 sm:gap-3">
 <div className="flex-1 min-w-0">

@@ -107,7 +107,7 @@ export function CreateSearchSpaceDialog({ open, onOpenChange }: CreateSearchSpac
 placeholder={t("name_placeholder")}
 {...field}
 autoFocus
-className="text-sm h-9 sm:h-10"
+className="text-sm h-9 sm:h-10 select-text"
 />
 </FormControl>
 <FormMessage />

@@ -130,7 +130,7 @@ export function CreateSearchSpaceDialog({ open, onOpenChange }: CreateSearchSpac
 <Input
 placeholder={t("description_placeholder")}
 {...field}
-className="text-sm h-9 sm:h-10"
+className="text-sm h-9 sm:h-10 select-text"
 />
 </FormControl>
 <FormMessage />

@@ -10,7 +10,6 @@ import { documentsSidebarOpenAtom } from "@/atoms/documents/ui.atoms";
 import { closeEditorPanelAtom, editorPanelAtom } from "@/atoms/editor/editor-panel.atom";
 import { rightPanelCollapsedAtom, rightPanelTabAtom } from "@/atoms/layout/right-panel.atom";
 import { Button } from "@/components/ui/button";
-import { Skeleton } from "@/components/ui/skeleton";
 import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
 import { DocumentsSidebar } from "../sidebar";

@@ -27,7 +26,7 @@ const HitlEditPanelContent = dynamic(
 import("@/components/hitl-edit-panel/hitl-edit-panel").then((m) => ({
 default: m.HitlEditPanelContent,
 })),
-{ ssr: false, loading: () => <Skeleton className="h-96 w-full" /> }
+{ ssr: false, loading: () => null }
 );

 const ReportPanelContent = dynamic(

@@ -35,7 +34,7 @@ const ReportPanelContent = dynamic(
 import("@/components/report-panel/report-panel").then((m) => ({
 default: m.ReportPanelContent,
 })),
-{ ssr: false, loading: () => <Skeleton className="h-96 w-full" /> }
+{ ssr: false, loading: () => null }
 );

 interface RightPanelProps {

@@ -78,14 +77,14 @@ export function RightPanelExpandButton() {
 if (!collapsed || !hasContent) return null;

 return (
-<div className="flex shrink-0 items-center px-1">
+<div className="flex shrink-0 items-center px-0.5">
 <Tooltip>
 <TooltipTrigger asChild>
 <Button
 variant="ghost"
 size="icon"
 onClick={() => startTransition(() => setCollapsed(false))}
-className="h-7 w-7 shrink-0"
+className="h-8 w-8 shrink-0 -m-0.5"
 >
 <PanelRight className="h-4 w-4" />
 <span className="sr-only">Expand panel</span>

@@ -376,7 +376,7 @@ export function AllPrivateChatsSidebarContent({
 <span className="truncate">{thread.title || "New Chat"}</span>
 </button>
 ) : (
-<Tooltip>
+<Tooltip delayDuration={600}>
 <TooltipTrigger asChild>
 <button
 type="button"

@@ -375,7 +375,7 @@ export function AllSharedChatsSidebarContent({
 <span className="truncate">{thread.title || "New Chat"}</span>
 </button>
 ) : (
-<Tooltip>
+<Tooltip delayDuration={600}>
 <TooltipTrigger asChild>
 <button
 type="button"

@@ -530,7 +530,8 @@ export function DocumentsSidebar({
 const typeCounts = useMemo(() => {
 const counts: Partial<Record<string, number>> = {};
 for (const d of treeDocuments) {
-counts[d.document_type] = (counts[d.document_type] || 0) + 1;
+const displayType = d.document_type === "LOCAL_FOLDER_FILE" ? "FILE" : d.document_type;
+counts[displayType] = (counts[displayType] || 0) + 1;
 }
 return counts;
 }, [treeDocuments]);

@@ -745,7 +746,7 @@ export function DocumentsSidebar({
 </button>
 </div>

-<div className="flex-1 min-h-0 overflow-x-hidden pt-0 flex flex-col">
+<div className="flex-1 min-h-0 pt-0 flex flex-col">
 <div className="px-4 pb-2">
 <DocumentsFilters
 typeCounts={typeCounts}

@@ -790,18 +790,6 @@ export function InboxSidebarContent({
 </DropdownMenuContent>
 </DropdownMenu>
 )}
-{isMobile ? (
-<Button
-variant="ghost"
-size="icon"
-className="h-7 w-7 rounded-full"
-onClick={handleMarkAllAsRead}
-disabled={totalUnreadCount === 0}
->
-<CheckCheck className="h-4 w-4 text-muted-foreground" />
-<span className="sr-only">{t("mark_all_read") || "Mark all as read"}</span>
-</Button>
-) : (
 <Tooltip>
 <TooltipTrigger asChild>
 <Button

@@ -819,7 +807,6 @@ export function InboxSidebarContent({
 {t("mark_all_read") || "Mark all as read"}
 </TooltipContent>
 </Tooltip>
-)}
 </div>
 </div>

@@ -932,30 +919,8 @@ export function InboxSidebarContent({
 )}
 style={{ contentVisibility: "auto", containIntrinsicSize: "0 80px" }}
 >
-{isMobile ? (
-<button
-type="button"
-onClick={() => handleItemClick(item)}
-disabled={isMarkingAsRead}
-className="flex items-center gap-3 flex-1 min-w-0 text-left overflow-hidden"
->
-<div className="shrink-0">{getStatusIcon(item)}</div>
-<div className="flex-1 min-w-0 overflow-hidden">
-<p
-className={cn(
-"text-xs font-medium line-clamp-2",
-!item.read && "font-semibold"
-)}
->
-{item.title}
-</p>
-<p className="text-[11px] text-muted-foreground line-clamp-2 mt-0.5">
-{convertRenderedToDisplay(item.message)}
-</p>
-</div>
-</button>
-) : (
-<Tooltip>
+{activeTab === "status" ? (
+<Tooltip delayDuration={600}>
 <TooltipTrigger asChild>
 <button
 type="button"

@@ -986,6 +951,28 @@ export function InboxSidebarContent({
 </p>
 </TooltipContent>
 </Tooltip>
+) : (
+<button
+type="button"
+onClick={() => handleItemClick(item)}
+disabled={isMarkingAsRead}
+className="flex items-center gap-3 flex-1 min-w-0 text-left overflow-hidden"
+>
+<div className="shrink-0">{getStatusIcon(item)}</div>
+<div className="flex-1 min-w-0 overflow-hidden">
+<p
+className={cn(
+"text-xs font-medium line-clamp-2",
+!item.read && "font-semibold"
+)}
+>
+{item.title}
+</p>
+<p className="text-[11px] text-muted-foreground line-clamp-2 mt-0.5">
+{convertRenderedToDisplay(item.message)}
+</p>
+</div>
+</button>
 )}

 <div className="flex items-center justify-end gap-1.5 shrink-0 w-10">

@@ -35,7 +35,7 @@ export function PageUsageDisplay({ pagesUsed, pagesLimit }: PageUsageDisplayProp
 <Progress value={usagePercentage} className="h-1.5" />
 <Link
 href={`/dashboard/${searchSpaceId}/more-pages`}
-className="group flex w-full items-center justify-between rounded-md px-1.5 py-1 -mx-1.5 transition-colors hover:bg-accent"
+className="group flex w-[calc(100%+0.75rem)] items-center justify-between rounded-md px-1.5 py-1 -mx-1.5 transition-colors hover:bg-accent"
 >
 <span className="flex items-center gap-1.5 text-xs text-muted-foreground group-hover:text-accent-foreground">
 <Zap className="h-3 w-3 shrink-0" />

@@ -48,7 +48,7 @@ export function PageUsageDisplay({ pagesUsed, pagesLimit }: PageUsageDisplayProp
 {pageBuyingEnabled && (
 <Link
 href={`/dashboard/${searchSpaceId}/buy-pages`}
-className="group flex w-full items-center justify-between rounded-md px-1.5 py-1 -mx-1.5 transition-colors hover:bg-accent"
+className="group flex w-[calc(100%+0.75rem)] items-center justify-between rounded-md px-1.5 py-1 -mx-1.5 transition-colors hover:bg-accent"
 >
 <span className="flex items-center gap-1.5 text-xs text-muted-foreground group-hover:text-accent-foreground">
 <CreditCard className="h-3 w-3 shrink-0" />

@@ -2,9 +2,9 @@ import { createCodePlugin } from "@streamdown/code";
 import { createMathPlugin } from "@streamdown/math";
 import { Streamdown, type StreamdownProps } from "streamdown";
 import "katex/dist/katex.min.css";
-import { cn } from "@/lib/utils";
-import Image from 'next/image';
 import { is } from "drizzle-orm";
+import Image from "next/image";
+import { cn } from "@/lib/utils";

 const code = createCodePlugin({
 themes: ["nord", "nord"],

@@ -130,7 +130,8 @@ export function MarkdownViewer({ content, className, maxLength }: MarkdownViewer
 ),
 hr: ({ ...props }) => <hr className="my-4 border-muted" {...props} />,
 img: ({ src, alt, width: _w, height: _h, ...props }) => {
-const isDataOrUnknownUrl = typeof src === "string" && (src.startsWith("data:") || !src.startsWith("http"));
+const isDataOrUnknownUrl =
+typeof src === "string" && (src.startsWith("data:") || !src.startsWith("http"));

 return isDataOrUnknownUrl ? (
 // eslint-disable-next-line @next/next/no-img-element

@@ -153,7 +154,7 @@ export function MarkdownViewer({ content, className, maxLength }: MarkdownViewer
 {...props}
 />
 );
 },
 table: ({ ...props }) => (
 <div className="overflow-x-auto my-4 rounded-lg border border-border w-full">
 <table className="w-full divide-y divide-border" {...props} />

@@ -163,8 +163,6 @@ export function ChatShareButton({ thread, onVisibilityChange, className }: ChatS
 )}

 <Popover open={open} onOpenChange={setOpen}>
-<Tooltip>
-<TooltipTrigger asChild>
 <PopoverTrigger asChild>
 <Button
 variant="outline"

@@ -175,9 +173,6 @@ export function ChatShareButton({ thread, onVisibilityChange, className }: ChatS
 <span className="hidden md:inline text-sm">{buttonLabel}</span>
 </Button>
 </PopoverTrigger>
-</TooltipTrigger>
-<TooltipContent>Share settings</TooltipContent>
-</Tooltip>

 <PopoverContent
 className="w-[280px] md:w-[320px] p-0 rounded-lg shadow-lg border-border/60 dark:bg-neutral-900 dark:border dark:border-white/5 select-none"

@@ -1,7 +1,7 @@
 "use client";

 import { useAtomValue } from "jotai";
-import { Bot, Check, ChevronDown, Edit3, ImageIcon, Plus, Zap } from "lucide-react";
+import { Bot, Check, ChevronDown, Edit3, ImageIcon, Plus, Search, Zap } from "lucide-react";
 import { type UIEvent, useCallback, useMemo, useState } from "react";
 import { toast } from "sonner";
 import {

@@ -344,7 +344,7 @@ export function ModelSelector({
 >
 <CommandEmpty className="py-8 text-center">
 <div className="flex flex-col items-center gap-2">
-<Bot className="size-8 text-muted-foreground" />
+<Search className="size-8 text-muted-foreground" />
 <p className="text-sm text-muted-foreground">No models found</p>
 <p className="text-xs text-muted-foreground/60">Try a different search term</p>
 </div>

@@ -531,8 +531,9 @@ export function ModelSelector({
 >
 <CommandEmpty className="py-8 text-center">
 <div className="flex flex-col items-center gap-2">
-<ImageIcon className="size-8 text-muted-foreground" />
+<Search className="size-8 text-muted-foreground" />
 <p className="text-sm text-muted-foreground">No image models found</p>
+<p className="text-xs text-muted-foreground/60">Try a different search term</p>
 </div>
 </CommandEmpty>

@@ -6,10 +6,10 @@ import { useTranslations } from "next-intl";
 import { useMemo } from "react";
 import { ApiKeyContent } from "@/app/dashboard/[search_space_id]/user-settings/components/ApiKeyContent";
 import { CommunityPromptsContent } from "@/app/dashboard/[search_space_id]/user-settings/components/CommunityPromptsContent";
+import { DesktopContent } from "@/app/dashboard/[search_space_id]/user-settings/components/DesktopContent";
 import { ProfileContent } from "@/app/dashboard/[search_space_id]/user-settings/components/ProfileContent";
 import { PromptsContent } from "@/app/dashboard/[search_space_id]/user-settings/components/PromptsContent";
 import { PurchaseHistoryContent } from "@/app/dashboard/[search_space_id]/user-settings/components/PurchaseHistoryContent";
-import { DesktopContent } from "@/app/dashboard/[search_space_id]/user-settings/components/DesktopContent";
 import { userSettingsDialogAtom } from "@/atoms/settings/settings-dialog.atoms";
 import { SettingsDialog } from "@/components/settings/settings-dialog";
 import { usePlatform } from "@/hooks/use-platform";

@@ -433,7 +433,7 @@ export function ImageConfigDialog({
 className="relative text-sm h-9 min-w-[120px]"
 >
 <span className={isSubmitting ? "opacity-0" : ""}>
-{mode === "edit" ? "Save Changes" : "Create & Use"}
+{mode === "edit" ? "Save Changes" : "Add Model"}
 </span>
 {isSubmitting && <Spinner size="sm" className="absolute" />}
 </Button>

@@ -312,7 +312,7 @@ export function ModelConfigDialog({
 className="relative text-sm h-9 min-w-[120px]"
 >
 <span className={isSubmitting ? "opacity-0" : ""}>
-{mode === "edit" ? "Save Changes" : "Create & Use"}
+{mode === "edit" ? "Save Changes" : "Add Model"}
 </span>
 {isSubmitting && <Spinner size="sm" className="absolute" />}
 </Button>

@@ -86,7 +86,6 @@ const FILE_TYPE_CONFIG: Record<string, Record<string, string[]>> = {
 "application/rtf": [".rtf"],
 "application/xml": [".xml"],
 "application/epub+zip": [".epub"],
-"text/html": [".html", ".htm", ".web"],
 "image/gif": [".gif"],
 "image/svg+xml": [".svg"],
 ...audioFileTypes,

@@ -470,8 +469,9 @@ export function DocumentUploadTab({
 </button>
 ))
 ) : (
-<div
-className="flex flex-col items-center gap-4 py-12 px-4 cursor-pointer"
+<button
+type="button"
+className="flex flex-col items-center gap-4 py-12 px-4 cursor-pointer w-full bg-transparent border-none"
 onClick={() => {
 if (!isElectron) fileInputRef.current?.click();
 }}

@@ -483,10 +483,16 @@ export function DocumentUploadTab({
 </p>
 <p className="text-sm text-muted-foreground">{t("file_size_limit")}</p>
 </div>
-<div className="w-full mt-1" onClick={(e) => e.stopPropagation()}>
+{/* biome-ignore lint/a11y/useSemanticElements: wrapper to stop click propagation to parent button */}
+<div
+className="w-full mt-1"
+onClick={(e) => e.stopPropagation()}
+onKeyDown={(e) => e.stopPropagation()}
+role="group"
+>
 {renderBrowseButton({ fullWidth: true })}
 </div>
-</div>
+</button>
 )}
 </div>

@@ -681,9 +687,13 @@ export function DocumentUploadTab({
 </span>
 </AccordionTrigger>
 <AccordionContent className="px-3 pb-3">
-<div className="flex flex-wrap gap-1">
+<div className="flex flex-wrap gap-1.5">
 {supportedExtensions.map((ext) => (
-<Badge key={ext} variant="outline" className="text-[10px] px-1.5 py-0">
+<Badge
+key={ext}
+variant="secondary"
+className="rounded border-0 bg-neutral-200/80 dark:bg-neutral-700/60 text-muted-foreground text-[10px] px-2 py-0.5 font-normal"
+>
 {ext}
 </Badge>
 ))}

@@ -2,13 +2,12 @@

 import type { LucideIcon } from "lucide-react";
 import { Code2, Database, ExternalLink, File, FileText, Globe, Newspaper } from "lucide-react";
+import NextImage from "next/image";
 import * as React from "react";
 import { openSafeNavigationHref, resolveSafeNavigationHref } from "../shared/media";
 import { cn, Popover, PopoverContent, PopoverTrigger } from "./_adapter";
 import { Citation } from "./citation";
 import type { CitationType, CitationVariant, SerializableCitation } from "./schema";
-import NextImage from 'next/image';
-

 const TYPE_ICONS: Record<CitationType, LucideIcon> = {
 webpage: Globe,
Some files were not shown because too many files have changed in this diff.