mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-25 00:36:31 +02:00
Merge pull request #1207 from CREDO23/feat/kb-export-and-folder-upload
Some checks are pending
Build and Push Docker Images / tag_release (push) Waiting to run
Build and Push Docker Images / build (./surfsense_backend, ./surfsense_backend/Dockerfile, backend, surfsense-backend, ubuntu-24.04-arm, linux/arm64, arm64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_backend, ./surfsense_backend/Dockerfile, backend, surfsense-backend, ubuntu-latest, linux/amd64, amd64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_web, ./surfsense_web/Dockerfile, web, surfsense-web, ubuntu-24.04-arm, linux/arm64, arm64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_web, ./surfsense_web/Dockerfile, web, surfsense-web, ubuntu-latest, linux/amd64, amd64) (push) Blocked by required conditions
Build and Push Docker Images / create_manifest (backend, surfsense-backend) (push) Blocked by required conditions
Build and Push Docker Images / create_manifest (web, surfsense-web) (push) Blocked by required conditions
Some checks are pending
Build and Push Docker Images / tag_release (push) Waiting to run
Build and Push Docker Images / build (./surfsense_backend, ./surfsense_backend/Dockerfile, backend, surfsense-backend, ubuntu-24.04-arm, linux/arm64, arm64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_backend, ./surfsense_backend/Dockerfile, backend, surfsense-backend, ubuntu-latest, linux/amd64, amd64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_web, ./surfsense_web/Dockerfile, web, surfsense-web, ubuntu-24.04-arm, linux/arm64, arm64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_web, ./surfsense_web/Dockerfile, web, surfsense-web, ubuntu-latest, linux/amd64, amd64) (push) Blocked by required conditions
Build and Push Docker Images / create_manifest (backend, surfsense-backend) (push) Blocked by required conditions
Build and Push Docker Images / create_manifest (web, surfsense-web) (push) Blocked by required conditions
[Feat] KB Export, Folder Upload & Vision LLM for Image Processing
This commit is contained in:
commit
61b3f0d7e3
47 changed files with 1399 additions and 107 deletions
|
|
@ -0,0 +1,45 @@
|
||||||
|
"""123_add_enable_vision_llm_to_connectors
|
||||||
|
|
||||||
|
Revision ID: 123
|
||||||
|
Revises: 122
|
||||||
|
Create Date: 2026-04-09
|
||||||
|
|
||||||
|
Adds enable_vision_llm boolean column to search_source_connectors.
|
||||||
|
Defaults to False so vision LLM image processing is opt-in.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections.abc import Sequence
|
||||||
|
|
||||||
|
import sqlalchemy as sa
|
||||||
|
|
||||||
|
from alembic import op
|
||||||
|
|
||||||
|
# revision identifiers, used by Alembic.
|
||||||
|
revision: str = "123"
|
||||||
|
down_revision: str | None = "122"
|
||||||
|
branch_labels: str | Sequence[str] | None = None
|
||||||
|
depends_on: str | Sequence[str] | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
    """Add the ``enable_vision_llm`` column to ``search_source_connectors``.

    The column is NOT NULL with a server default of ``false`` so vision-LLM
    image processing stays opt-in for existing connector rows.
    """
    conn = op.get_bind()
    # Inspect the live table first so the migration is idempotent: re-running
    # it (or running it against a database that already has the column, e.g.
    # created fresh from the models) is a no-op instead of an error.
    existing_columns = [
        col["name"] for col in sa.inspect(conn).get_columns("search_source_connectors")
    ]

    if "enable_vision_llm" not in existing_columns:
        op.add_column(
            "search_source_connectors",
            sa.Column(
                "enable_vision_llm",
                sa.Boolean(),
                nullable=False,
                # server_default backfills existing rows so the NOT NULL
                # constraint can be applied in one step.
                server_default=sa.text("false"),
            ),
        )
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Drop the ``enable_vision_llm`` column added in :func:`upgrade`.

    NOTE(review): unlike ``upgrade`` this has no existence check, so it
    assumes the column is present when downgrading.
    """
    op.drop_column("search_source_connectors", "enable_vision_llm")
|
||||||
|
|
@ -44,6 +44,8 @@ async def _export_paper_content(
|
||||||
async def download_and_extract_content(
|
async def download_and_extract_content(
|
||||||
client: DropboxClient,
|
client: DropboxClient,
|
||||||
file: dict[str, Any],
|
file: dict[str, Any],
|
||||||
|
*,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[str | None, dict[str, Any], str | None]:
|
) -> tuple[str | None, dict[str, Any], str | None]:
|
||||||
"""Download a Dropbox file and extract its content as markdown.
|
"""Download a Dropbox file and extract its content as markdown.
|
||||||
|
|
||||||
|
|
@ -91,7 +93,7 @@ async def download_and_extract_content(
|
||||||
from app.etl_pipeline.etl_document import EtlRequest
|
from app.etl_pipeline.etl_document import EtlRequest
|
||||||
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
|
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
|
||||||
|
|
||||||
result = await EtlPipelineService().extract(
|
result = await EtlPipelineService(vision_llm=vision_llm).extract(
|
||||||
EtlRequest(file_path=temp_file_path, filename=file_name)
|
EtlRequest(file_path=temp_file_path, filename=file_name)
|
||||||
)
|
)
|
||||||
markdown = result.markdown_content
|
markdown = result.markdown_content
|
||||||
|
|
|
||||||
|
|
@ -27,6 +27,8 @@ logger = logging.getLogger(__name__)
|
||||||
async def download_and_extract_content(
|
async def download_and_extract_content(
|
||||||
client: GoogleDriveClient,
|
client: GoogleDriveClient,
|
||||||
file: dict[str, Any],
|
file: dict[str, Any],
|
||||||
|
*,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[str | None, dict[str, Any], str | None]:
|
) -> tuple[str | None, dict[str, Any], str | None]:
|
||||||
"""Download a Google Drive file and extract its content as markdown.
|
"""Download a Google Drive file and extract its content as markdown.
|
||||||
|
|
||||||
|
|
@ -103,7 +105,9 @@ async def download_and_extract_content(
|
||||||
etl_filename = (
|
etl_filename = (
|
||||||
file_name + extension if is_google_workspace_file(mime_type) else file_name
|
file_name + extension if is_google_workspace_file(mime_type) else file_name
|
||||||
)
|
)
|
||||||
markdown = await _parse_file_to_markdown(temp_file_path, etl_filename)
|
markdown = await _parse_file_to_markdown(
|
||||||
|
temp_file_path, etl_filename, vision_llm=vision_llm
|
||||||
|
)
|
||||||
return markdown, drive_metadata, None
|
return markdown, drive_metadata, None
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
@ -115,12 +119,14 @@ async def download_and_extract_content(
|
||||||
os.unlink(temp_file_path)
|
os.unlink(temp_file_path)
|
||||||
|
|
||||||
|
|
||||||
async def _parse_file_to_markdown(file_path: str, filename: str) -> str:
|
async def _parse_file_to_markdown(
|
||||||
|
file_path: str, filename: str, *, vision_llm=None
|
||||||
|
) -> str:
|
||||||
"""Parse a local file to markdown using the unified ETL pipeline."""
|
"""Parse a local file to markdown using the unified ETL pipeline."""
|
||||||
from app.etl_pipeline.etl_document import EtlRequest
|
from app.etl_pipeline.etl_document import EtlRequest
|
||||||
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
|
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
|
||||||
|
|
||||||
result = await EtlPipelineService().extract(
|
result = await EtlPipelineService(vision_llm=vision_llm).extract(
|
||||||
EtlRequest(file_path=file_path, filename=filename)
|
EtlRequest(file_path=file_path, filename=filename)
|
||||||
)
|
)
|
||||||
return result.markdown_content
|
return result.markdown_content
|
||||||
|
|
|
||||||
|
|
@ -16,6 +16,8 @@ logger = logging.getLogger(__name__)
|
||||||
async def download_and_extract_content(
|
async def download_and_extract_content(
|
||||||
client: OneDriveClient,
|
client: OneDriveClient,
|
||||||
file: dict[str, Any],
|
file: dict[str, Any],
|
||||||
|
*,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[str | None, dict[str, Any], str | None]:
|
) -> tuple[str | None, dict[str, Any], str | None]:
|
||||||
"""Download a OneDrive file and extract its content as markdown.
|
"""Download a OneDrive file and extract its content as markdown.
|
||||||
|
|
||||||
|
|
@ -65,7 +67,9 @@ async def download_and_extract_content(
|
||||||
if error:
|
if error:
|
||||||
return None, metadata, error
|
return None, metadata, error
|
||||||
|
|
||||||
markdown = await _parse_file_to_markdown(temp_file_path, file_name)
|
markdown = await _parse_file_to_markdown(
|
||||||
|
temp_file_path, file_name, vision_llm=vision_llm
|
||||||
|
)
|
||||||
return markdown, metadata, None
|
return markdown, metadata, None
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
@ -77,12 +81,14 @@ async def download_and_extract_content(
|
||||||
os.unlink(temp_file_path)
|
os.unlink(temp_file_path)
|
||||||
|
|
||||||
|
|
||||||
async def _parse_file_to_markdown(file_path: str, filename: str) -> str:
|
async def _parse_file_to_markdown(
|
||||||
|
file_path: str, filename: str, *, vision_llm=None
|
||||||
|
) -> str:
|
||||||
"""Parse a local file to markdown using the unified ETL pipeline."""
|
"""Parse a local file to markdown using the unified ETL pipeline."""
|
||||||
from app.etl_pipeline.etl_document import EtlRequest
|
from app.etl_pipeline.etl_document import EtlRequest
|
||||||
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
|
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
|
||||||
|
|
||||||
result = await EtlPipelineService().extract(
|
result = await EtlPipelineService(vision_llm=vision_llm).extract(
|
||||||
EtlRequest(file_path=file_path, filename=filename)
|
EtlRequest(file_path=file_path, filename=filename)
|
||||||
)
|
)
|
||||||
return result.markdown_content
|
return result.markdown_content
|
||||||
|
|
|
||||||
|
|
@ -1450,6 +1450,13 @@ class SearchSourceConnector(BaseModel, TimestampMixin):
|
||||||
Boolean, nullable=False, default=False, server_default="false"
|
Boolean, nullable=False, default=False, server_default="false"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Vision LLM for image files - disabled by default to save cost/time.
|
||||||
|
# When enabled, images are described via a vision language model instead
|
||||||
|
# of falling back to the document parser.
|
||||||
|
enable_vision_llm = Column(
|
||||||
|
Boolean, nullable=False, default=False, server_default="false"
|
||||||
|
)
|
||||||
|
|
||||||
# Periodic indexing fields
|
# Periodic indexing fields
|
||||||
periodic_indexing_enabled = Column(Boolean, nullable=False, default=False)
|
periodic_indexing_enabled = Column(Boolean, nullable=False, default=False)
|
||||||
indexing_frequency_minutes = Column(Integer, nullable=True)
|
indexing_frequency_minutes = Column(Integer, nullable=True)
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,9 @@ from app.etl_pipeline.parsers.plaintext import read_plaintext
|
||||||
class EtlPipelineService:
|
class EtlPipelineService:
|
||||||
"""Single pipeline for extracting markdown from files. All callers use this."""
|
"""Single pipeline for extracting markdown from files. All callers use this."""
|
||||||
|
|
||||||
|
    def __init__(self, *, vision_llm=None):
        # Optional vision-capable chat model (LangChain-style, used via
        # `ainvoke` downstream — presumably; confirm against parsers.vision_llm).
        # When set, image files are described by this model instead of going
        # straight to the document parser. Keyword-only so call sites are
        # explicit about enabling vision processing.
        self._vision_llm = vision_llm
|
||||||
|
|
||||||
async def extract(self, request: EtlRequest) -> EtlResult:
|
async def extract(self, request: EtlRequest) -> EtlResult:
|
||||||
category = classify_file(request.filename)
|
category = classify_file(request.filename)
|
||||||
|
|
||||||
|
|
@ -47,8 +50,45 @@ class EtlPipelineService:
|
||||||
content_type="audio",
|
content_type="audio",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if category == FileCategory.IMAGE:
|
||||||
|
return await self._extract_image(request)
|
||||||
|
|
||||||
return await self._extract_document(request)
|
return await self._extract_document(request)
|
||||||
|
|
||||||
|
    async def _extract_image(self, request: EtlRequest) -> EtlResult:
        """Extract markdown from an image file.

        Strategy: if a vision LLM was configured, describe the image with it;
        on any vision failure — or when no LLM is configured — fall back to
        the regular document parser. If that fallback also rejects the file,
        raise one consolidated EtlUnsupportedFileError explaining both paths.
        """
        if self._vision_llm:
            try:
                # Imported lazily so the vision parser (and its langchain
                # dependency) is only loaded when vision is actually used.
                from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm

                content = await parse_with_vision_llm(
                    request.file_path, request.filename, self._vision_llm
                )
                return EtlResult(
                    markdown_content=content,
                    etl_service="VISION_LLM",
                    content_type="image",
                )
            except Exception:
                # Broad catch is deliberate: any vision failure (oversized
                # image, unsupported extension, model/network error, timeout)
                # degrades to the document parser instead of failing the file.
                logging.warning(
                    "Vision LLM failed for %s, falling back to document parser",
                    request.filename,
                    exc_info=True,
                )
        else:
            logging.info(
                "No vision LLM provided, falling back to document parser for %s",
                request.filename,
            )

        try:
            return await self._extract_document(request)
        except (EtlUnsupportedFileError, EtlServiceUnavailableError):
            # `from None` suppresses the fallback's own traceback; the message
            # below already describes which of the two paths failed.
            raise EtlUnsupportedFileError(
                f"Cannot process image {request.filename}: vision LLM "
                f"{'failed' if self._vision_llm else 'not configured'} and "
                f"document parser does not support this format"
            ) from None
|
||||||
|
|
||||||
async def _extract_document(self, request: EtlRequest) -> EtlResult:
|
async def _extract_document(self, request: EtlRequest) -> EtlResult:
|
||||||
from pathlib import PurePosixPath
|
from pathlib import PurePosixPath
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@ from pathlib import PurePosixPath
|
||||||
|
|
||||||
from app.utils.file_extensions import (
|
from app.utils.file_extensions import (
|
||||||
DOCUMENT_EXTENSIONS,
|
DOCUMENT_EXTENSIONS,
|
||||||
|
IMAGE_EXTENSIONS,
|
||||||
get_document_extensions_for_service,
|
get_document_extensions_for_service,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -105,6 +106,7 @@ class FileCategory(Enum):
|
||||||
PLAINTEXT = "plaintext"
|
PLAINTEXT = "plaintext"
|
||||||
AUDIO = "audio"
|
AUDIO = "audio"
|
||||||
DIRECT_CONVERT = "direct_convert"
|
DIRECT_CONVERT = "direct_convert"
|
||||||
|
IMAGE = "image"
|
||||||
UNSUPPORTED = "unsupported"
|
UNSUPPORTED = "unsupported"
|
||||||
DOCUMENT = "document"
|
DOCUMENT = "document"
|
||||||
|
|
||||||
|
|
@ -117,6 +119,8 @@ def classify_file(filename: str) -> FileCategory:
|
||||||
return FileCategory.AUDIO
|
return FileCategory.AUDIO
|
||||||
if suffix in DIRECT_CONVERT_EXTENSIONS:
|
if suffix in DIRECT_CONVERT_EXTENSIONS:
|
||||||
return FileCategory.DIRECT_CONVERT
|
return FileCategory.DIRECT_CONVERT
|
||||||
|
if suffix in IMAGE_EXTENSIONS:
|
||||||
|
return FileCategory.IMAGE
|
||||||
if suffix in DOCUMENT_EXTENSIONS:
|
if suffix in DOCUMENT_EXTENSIONS:
|
||||||
return FileCategory.DOCUMENT
|
return FileCategory.DOCUMENT
|
||||||
return FileCategory.UNSUPPORTED
|
return FileCategory.UNSUPPORTED
|
||||||
|
|
def should_skip_for_service(filename: str, etl_service: str | None) -> bool:
    """Return True if *filename* cannot be processed by *etl_service*.

    Plaintext, audio, and direct-convert files are parser-agnostic and never
    skipped. Image and document files are checked against the per-parser
    extension set (images fall back to the document parser when no vision LLM
    is available, so the same service constraint applies).
    """
    category = classify_file(filename)
    if category == FileCategory.UNSUPPORTED:
        return True
    if category in (FileCategory.DOCUMENT, FileCategory.IMAGE):
        # Capability check: skip only when this service's supported-extension
        # set does not contain the file's (lower-cased) suffix.
        suffix = PurePosixPath(filename).suffix.lower()
        return suffix not in get_document_extensions_for_service(etl_service)
    return False
|
||||||
|
|
|
||||||
64
surfsense_backend/app/etl_pipeline/parsers/vision_llm.py
Normal file
64
surfsense_backend/app/etl_pipeline/parsers/vision_llm.py
Normal file
|
|
@ -0,0 +1,64 @@
|
||||||
|
import asyncio
|
||||||
|
import base64
|
||||||
|
import os
|
||||||
|
|
||||||
|
from langchain_core.messages import HumanMessage
|
||||||
|
|
||||||
|
_PROMPT = (
|
||||||
|
"Describe this image in markdown. "
|
||||||
|
"Transcribe any visible text verbatim. "
|
||||||
|
"Be concise but complete — let the image content guide the level of detail."
|
||||||
|
)
|
||||||
|
|
||||||
|
_MAX_IMAGE_BYTES = (
|
||||||
|
5 * 1024 * 1024
|
||||||
|
) # 5 MB (Anthropic Claude's limit, the most restrictive)
|
||||||
|
|
||||||
|
_INVOKE_TIMEOUT_SECONDS = 120
|
||||||
|
|
||||||
|
_EXT_TO_MIME: dict[str, str] = {
|
||||||
|
".png": "image/png",
|
||||||
|
".jpg": "image/jpeg",
|
||||||
|
".jpeg": "image/jpeg",
|
||||||
|
".gif": "image/gif",
|
||||||
|
".bmp": "image/bmp",
|
||||||
|
".tiff": "image/tiff",
|
||||||
|
".tif": "image/tiff",
|
||||||
|
".webp": "image/webp",
|
||||||
|
".svg": "image/svg+xml",
|
||||||
|
".heic": "image/heic",
|
||||||
|
".heif": "image/heif",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _image_to_data_url(file_path: str) -> str:
|
||||||
|
file_size = os.path.getsize(file_path)
|
||||||
|
if file_size > _MAX_IMAGE_BYTES:
|
||||||
|
raise ValueError(
|
||||||
|
f"Image too large for vision LLM ({file_size / (1024 * 1024):.1f} MB, "
|
||||||
|
f"limit {_MAX_IMAGE_BYTES // (1024 * 1024)} MB): {file_path}"
|
||||||
|
)
|
||||||
|
ext = os.path.splitext(file_path)[1].lower()
|
||||||
|
mime_type = _EXT_TO_MIME.get(ext)
|
||||||
|
if not mime_type:
|
||||||
|
raise ValueError(f"Unsupported image extension {ext!r}: {file_path}")
|
||||||
|
with open(file_path, "rb") as f:
|
||||||
|
encoded = base64.b64encode(f.read()).decode("ascii")
|
||||||
|
return f"data:{mime_type};base64,{encoded}"
|
||||||
|
|
||||||
|
|
||||||
|
async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
    """Describe the image at *file_path* as markdown using *llm*.

    Args:
        file_path: Local path to the image; must pass the size/extension
            checks in ``_image_to_data_url``.
        filename: Original (display) name of the file, used in error messages.
        llm: Chat model invoked via ``ainvoke`` with a multimodal message.

    Returns:
        The model's stripped text response.

    Raises:
        ValueError: if the image is unusable or the model returns empty text.
        asyncio.TimeoutError: if the model call exceeds the timeout.
    """
    data_url = _image_to_data_url(file_path)
    message = HumanMessage(
        content=[
            {"type": "text", "text": _PROMPT},
            {"type": "image_url", "image_url": {"url": data_url}},
        ]
    )
    # Bound the model call so one stuck image cannot hang the whole pipeline.
    response = await asyncio.wait_for(
        llm.ainvoke([message]), timeout=_INVOKE_TIMEOUT_SECONDS
    )
    text = response.content if hasattr(response, "content") else str(response)
    if not text or not text.strip():
        # Fix: the message previously hard-coded "(unknown)" in an f-string
        # with no placeholder, leaving the `filename` parameter unused.
        raise ValueError(f"Vision LLM returned empty content for {filename}")
    return text.strip()
|
||||||
|
|
@ -13,6 +13,7 @@ from .discord_add_connector_route import router as discord_add_connector_router
|
||||||
from .documents_routes import router as documents_router
|
from .documents_routes import router as documents_router
|
||||||
from .dropbox_add_connector_route import router as dropbox_add_connector_router
|
from .dropbox_add_connector_route import router as dropbox_add_connector_router
|
||||||
from .editor_routes import router as editor_router
|
from .editor_routes import router as editor_router
|
||||||
|
from .export_routes import router as export_router
|
||||||
from .folders_routes import router as folders_router
|
from .folders_routes import router as folders_router
|
||||||
from .google_calendar_add_connector_route import (
|
from .google_calendar_add_connector_route import (
|
||||||
router as google_calendar_add_connector_router,
|
router as google_calendar_add_connector_router,
|
||||||
|
|
@ -58,6 +59,7 @@ router = APIRouter()
|
||||||
router.include_router(search_spaces_router)
|
router.include_router(search_spaces_router)
|
||||||
router.include_router(rbac_router) # RBAC routes for roles, members, invites
|
router.include_router(rbac_router) # RBAC routes for roles, members, invites
|
||||||
router.include_router(editor_router)
|
router.include_router(editor_router)
|
||||||
|
router.include_router(export_router)
|
||||||
router.include_router(documents_router)
|
router.include_router(documents_router)
|
||||||
router.include_router(folders_router)
|
router.include_router(folders_router)
|
||||||
router.include_router(notes_router)
|
router.include_router(notes_router)
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
import asyncio
|
import asyncio
|
||||||
|
|
||||||
from fastapi import APIRouter, Depends, Form, HTTPException, Query, UploadFile
|
from fastapi import APIRouter, Depends, Form, HTTPException, Query, UploadFile
|
||||||
from pydantic import BaseModel as PydanticBaseModel
|
from pydantic import BaseModel as PydanticBaseModel, Field
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
from sqlalchemy.future import select
|
from sqlalchemy.future import select
|
||||||
from sqlalchemy.orm import selectinload
|
from sqlalchemy.orm import selectinload
|
||||||
|
|
@ -123,6 +123,7 @@ async def create_documents_file_upload(
|
||||||
files: list[UploadFile],
|
files: list[UploadFile],
|
||||||
search_space_id: int = Form(...),
|
search_space_id: int = Form(...),
|
||||||
should_summarize: bool = Form(False),
|
should_summarize: bool = Form(False),
|
||||||
|
use_vision_llm: bool = Form(False),
|
||||||
session: AsyncSession = Depends(get_async_session),
|
session: AsyncSession = Depends(get_async_session),
|
||||||
user: User = Depends(current_active_user),
|
user: User = Depends(current_active_user),
|
||||||
dispatcher: TaskDispatcher = Depends(get_task_dispatcher),
|
dispatcher: TaskDispatcher = Depends(get_task_dispatcher),
|
||||||
|
|
@ -272,6 +273,7 @@ async def create_documents_file_upload(
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=str(user.id),
|
user_id=str(user.id),
|
||||||
should_summarize=should_summarize,
|
should_summarize=should_summarize,
|
||||||
|
use_vision_llm=use_vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|
@ -1395,10 +1397,13 @@ class FolderMtimeCheckFile(PydanticBaseModel):
|
||||||
mtime: float
|
mtime: float
|
||||||
|
|
||||||
|
|
||||||
|
_MAX_MTIME_CHECK_FILES = 10_000
|
||||||
|
|
||||||
|
|
||||||
class FolderMtimeCheckRequest(PydanticBaseModel):
    """Request body for checking which watched-folder files changed on disk.

    The ``files`` list is capped via ``Field(max_length=...)`` so oversized
    payloads are rejected at validation time rather than processed.
    """

    folder_name: str
    search_space_id: int
    files: list[FolderMtimeCheckFile] = Field(max_length=_MAX_MTIME_CHECK_FILES)
|
||||||
|
|
||||||
|
|
||||||
class FolderUnlinkRequest(PydanticBaseModel):
|
class FolderUnlinkRequest(PydanticBaseModel):
|
||||||
|
|
@ -1487,6 +1492,7 @@ async def folder_upload(
|
||||||
relative_paths: str = Form(...),
|
relative_paths: str = Form(...),
|
||||||
root_folder_id: int | None = Form(None),
|
root_folder_id: int | None = Form(None),
|
||||||
enable_summary: bool = Form(False),
|
enable_summary: bool = Form(False),
|
||||||
|
use_vision_llm: bool = Form(False),
|
||||||
session: AsyncSession = Depends(get_async_session),
|
session: AsyncSession = Depends(get_async_session),
|
||||||
user: User = Depends(current_active_user),
|
user: User = Depends(current_active_user),
|
||||||
):
|
):
|
||||||
|
|
@ -1531,6 +1537,23 @@ async def folder_upload(
|
||||||
f"exceeds the {MAX_FILE_SIZE_BYTES // (1024 * 1024)} MB per-file limit.",
|
f"exceeds the {MAX_FILE_SIZE_BYTES // (1024 * 1024)} MB per-file limit.",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
from app.services.folder_service import MAX_FOLDER_DEPTH
|
||||||
|
|
||||||
|
max_subfolder_depth = max((p.count("/") for p in rel_paths if "/" in p), default=0)
|
||||||
|
if 1 + max_subfolder_depth > MAX_FOLDER_DEPTH:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=400,
|
||||||
|
detail=f"Folder structure too deep: {1 + max_subfolder_depth} levels "
|
||||||
|
f"exceeds the maximum of {MAX_FOLDER_DEPTH}.",
|
||||||
|
)
|
||||||
|
|
||||||
|
if root_folder_id:
|
||||||
|
root_folder = await session.get(Folder, root_folder_id)
|
||||||
|
if not root_folder or root_folder.search_space_id != search_space_id:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=404, detail="Root folder not found in this search space"
|
||||||
|
)
|
||||||
|
|
||||||
if not root_folder_id:
|
if not root_folder_id:
|
||||||
watched_metadata = {
|
watched_metadata = {
|
||||||
"watched": True,
|
"watched": True,
|
||||||
|
|
@ -1565,7 +1588,8 @@ async def folder_upload(
|
||||||
|
|
||||||
async def _read_and_save(file: UploadFile, idx: int) -> dict:
|
async def _read_and_save(file: UploadFile, idx: int) -> dict:
|
||||||
content = await file.read()
|
content = await file.read()
|
||||||
filename = file.filename or rel_paths[idx].split("/")[-1]
|
raw_name = file.filename or rel_paths[idx]
|
||||||
|
filename = raw_name.split("/")[-1]
|
||||||
|
|
||||||
def _write_temp() -> str:
|
def _write_temp() -> str:
|
||||||
with tempfile.NamedTemporaryFile(
|
with tempfile.NamedTemporaryFile(
|
||||||
|
|
@ -1595,6 +1619,7 @@ async def folder_upload(
|
||||||
folder_name=folder_name,
|
folder_name=folder_name,
|
||||||
root_folder_id=root_folder_id,
|
root_folder_id=root_folder_id,
|
||||||
enable_summary=enable_summary,
|
enable_summary=enable_summary,
|
||||||
|
use_vision_llm=use_vision_llm,
|
||||||
file_mappings=list(file_mappings),
|
file_mappings=list(file_mappings),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
61
surfsense_backend/app/routes/export_routes.py
Normal file
61
surfsense_backend/app/routes/export_routes.py
Normal file
|
|
@ -0,0 +1,61 @@
|
||||||
|
"""Routes for exporting knowledge base content as ZIP."""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||||
|
from fastapi.responses import StreamingResponse
|
||||||
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
|
from app.db import Permission, User, get_async_session
|
||||||
|
from app.services.export_service import build_export_zip
|
||||||
|
from app.users import current_active_user
|
||||||
|
from app.utils.rbac import check_permission
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
router = APIRouter()
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/search-spaces/{search_space_id}/export")
async def export_knowledge_base(
    search_space_id: int,
    folder_id: int | None = Query(None, description="Export only this folder's subtree"),
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """Export documents as a ZIP of markdown files preserving folder structure.

    Requires DOCUMENTS_READ permission on the search space. The archive is
    built to a temp file by ``build_export_zip`` and streamed back in 8 KiB
    chunks; the temp file is removed once streaming ends (or aborts).
    """
    await check_permission(
        session,
        user,
        search_space_id,
        Permission.DOCUMENTS_READ.value,
        "You don't have permission to export documents in this search space",
    )

    try:
        result = await build_export_zip(session, search_space_id, folder_id)
    except ValueError as e:
        # build_export_zip raises ValueError for an unknown/foreign folder_id;
        # surface it as a 404 and drop the internal traceback chain.
        raise HTTPException(status_code=404, detail=str(e)) from None

    def stream_and_cleanup():
        # Generator so a large ZIP never needs to fit in memory; `finally`
        # guarantees the temp file is deleted even on client disconnect.
        try:
            with open(result.zip_path, "rb") as f:
                while chunk := f.read(8192):
                    yield chunk
        finally:
            os.unlink(result.zip_path)

    headers = {
        "Content-Disposition": f'attachment; filename="{result.export_name}.zip"',
        "Content-Length": str(result.zip_size),
    }

    # Let clients detect partial exports (documents with no resolvable content).
    if result.skipped_docs:
        headers["X-Skipped-Documents"] = str(len(result.skipped_docs))

    return StreamingResponse(
        stream_and_cleanup(),
        media_type="application/zip",
        headers=headers,
    )
|
||||||
|
|
@ -17,6 +17,7 @@ class SearchSourceConnectorBase(BaseModel):
|
||||||
last_indexed_at: datetime | None = None
|
last_indexed_at: datetime | None = None
|
||||||
config: dict[str, Any]
|
config: dict[str, Any]
|
||||||
enable_summary: bool = False
|
enable_summary: bool = False
|
||||||
|
enable_vision_llm: bool = False
|
||||||
periodic_indexing_enabled: bool = False
|
periodic_indexing_enabled: bool = False
|
||||||
indexing_frequency_minutes: int | None = None
|
indexing_frequency_minutes: int | None = None
|
||||||
next_scheduled_at: datetime | None = None
|
next_scheduled_at: datetime | None = None
|
||||||
|
|
@ -67,6 +68,7 @@ class SearchSourceConnectorUpdate(BaseModel):
|
||||||
last_indexed_at: datetime | None = None
|
last_indexed_at: datetime | None = None
|
||||||
config: dict[str, Any] | None = None
|
config: dict[str, Any] | None = None
|
||||||
enable_summary: bool | None = None
|
enable_summary: bool | None = None
|
||||||
|
enable_vision_llm: bool | None = None
|
||||||
periodic_indexing_enabled: bool | None = None
|
periodic_indexing_enabled: bool | None = None
|
||||||
indexing_frequency_minutes: int | None = None
|
indexing_frequency_minutes: int | None = None
|
||||||
next_scheduled_at: datetime | None = None
|
next_scheduled_at: datetime | None = None
|
||||||
|
|
|
||||||
200
surfsense_backend/app/services/export_service.py
Normal file
200
surfsense_backend/app/services/export_service.py
Normal file
|
|
@ -0,0 +1,200 @@
|
||||||
|
"""Service for exporting knowledge base content as a ZIP archive."""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import tempfile
|
||||||
|
import zipfile
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
from sqlalchemy.future import select
|
||||||
|
|
||||||
|
from app.db import Chunk, Document, Folder
|
||||||
|
from app.services.folder_service import get_folder_subtree_ids
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _sanitize_filename(title: str) -> str:
|
||||||
|
safe = "".join(c if c.isalnum() or c in " -_." else "_" for c in title).strip()
|
||||||
|
return safe[:80] or "document"
|
||||||
|
|
||||||
|
|
||||||
|
def _build_folder_path_map(folders: list[Folder]) -> dict[int, str]:
    """Build a mapping of folder_id -> full path string (e.g. 'Research/AI').

    Each path segment is sanitized with _sanitize_filename. A folder whose
    parent is absent from *folders* (e.g. outside an exported subtree) is
    treated as a root, so paths stay relative to the export scope.

    NOTE(review): resolve() walks the parent chain recursively with no cycle
    guard — assumes parent_id links are acyclic; a cycle would recurse
    forever. TODO confirm the folder table enforces this.
    """
    id_to_folder = {f.id: f for f in folders}
    # Memoized full paths so each ancestor chain is resolved only once.
    cache: dict[int, str] = {}

    def resolve(folder_id: int) -> str:
        if folder_id in cache:
            return cache[folder_id]
        folder = id_to_folder[folder_id]
        safe_name = _sanitize_filename(folder.name)
        if folder.parent_id is None or folder.parent_id not in id_to_folder:
            # Root (or parent outside scope): path is just this folder's name.
            cache[folder_id] = safe_name
        else:
            cache[folder_id] = f"{resolve(folder.parent_id)}/{safe_name}"
        return cache[folder_id]

    for f in folders:
        resolve(f.id)

    return cache
|
||||||
|
|
||||||
|
|
||||||
|
async def _get_document_markdown(
    session: AsyncSession, document: Document
) -> str | None:
    """Resolve markdown content using the 3-tier fallback:

    1. source_markdown 2. blocknote_document conversion 3. chunk concatenation

    Returns None when no tier yields content; callers treat such documents
    as skipped.
    """
    # Tier 1: the original markdown stored at ingestion time, if any.
    if document.source_markdown is not None:
        return document.source_markdown

    # Tier 2: convert the editor's BlockNote JSON back to markdown.
    if document.blocknote_document:
        from app.utils.blocknote_to_markdown import blocknote_to_markdown

        md = blocknote_to_markdown(document.blocknote_document)
        if md:
            return md

    # Tier 3: stitch the indexed chunks back together, ordered by chunk id
    # (presumably insertion order — confirm against the ingestion pipeline).
    chunk_result = await session.execute(
        select(Chunk.content)
        .filter(Chunk.document_id == document.id)
        .order_by(Chunk.id)
    )
    chunks = chunk_result.scalars().all()
    if chunks:
        return "\n\n".join(chunks)

    return None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ExportResult:
    """Outcome of a knowledge-base export build.

    The caller owns ``zip_path`` and is responsible for streaming the file
    and deleting it afterwards.
    """

    # Absolute path to the temporary ZIP file on disk.
    zip_path: str
    # Suggested base name for the download (folder name or 'knowledge-base').
    export_name: str
    # Size of the ZIP file in bytes.
    zip_size: int
    # Titles of documents skipped because they were still pending/processing.
    skipped_docs: list[str] = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
async def build_export_zip(
    session: AsyncSession,
    search_space_id: int,
    folder_id: int | None = None,
) -> ExportResult:
    """Build a ZIP archive of markdown documents preserving folder structure.

    Args:
        session: Async DB session used for all queries.
        search_space_id: Search space whose documents are exported.
        folder_id: Optional folder to restrict the export to; its whole
            subtree is included. None exports the entire search space.

    Returns:
        ExportResult with the path to the temp ZIP file. The caller is
        responsible for streaming and cleaning up the file.

    Raises:
        ValueError: If folder_id is provided but not found in this search
            space.
    """
    if folder_id is not None:
        folder = await session.get(Folder, folder_id)
        if not folder or folder.search_space_id != search_space_id:
            raise ValueError("Folder not found")
        target_folder_ids = set(await get_folder_subtree_ids(session, folder_id))
    else:
        target_folder_ids = None

    folder_query = select(Folder).where(Folder.search_space_id == search_space_id)
    if target_folder_ids is not None:
        folder_query = folder_query.where(Folder.id.in_(target_folder_ids))
    folder_result = await session.execute(folder_query)
    folders = list(folder_result.scalars().all())

    folder_path_map = _build_folder_path_map(folders)

    batch_size = 100  # documents loaded per DB round-trip

    base_doc_query = select(Document).where(Document.search_space_id == search_space_id)
    if target_folder_ids is not None:
        base_doc_query = base_doc_query.where(Document.folder_id.in_(target_folder_ids))
    base_doc_query = base_doc_query.order_by(Document.id)

    fd, tmp_path = tempfile.mkstemp(suffix=".zip")
    os.close(fd)

    # Counts how many times each archive path has been claimed, so duplicate
    # titles get unique "_2", "_3", ... suffixes.
    used_paths: dict[str, int] = {}
    skipped_docs: list[str] = []
    is_first_batch = True

    try:
        offset = 0
        while True:
            batch_result = await session.execute(
                base_doc_query.limit(batch_size).offset(offset)
            )
            documents = list(batch_result.scalars().all())
            if not documents:
                break

            entries: list[tuple[str, str]] = []

            for doc in documents:
                status = doc.status or {}
                state = (
                    status.get("state", "ready")
                    if isinstance(status, dict)
                    else "ready"
                )
                if state in ("pending", "processing"):
                    # No finished content yet; report instead of exporting.
                    skipped_docs.append(doc.title or "Untitled")
                    continue

                markdown = await _get_document_markdown(session, doc)
                if not markdown or not markdown.strip():
                    continue

                if doc.folder_id and doc.folder_id in folder_path_map:
                    dir_path = folder_path_map[doc.folder_id]
                else:
                    dir_path = ""

                base_name = _sanitize_filename(doc.title or "Untitled")
                file_path = (
                    f"{dir_path}/{base_name}.md" if dir_path else f"{base_name}.md"
                )
                file_path = _claim_unique_path(
                    used_paths, file_path, dir_path, base_name
                )

                entries.append((file_path, markdown))

            if entries:
                mode = "w" if is_first_batch else "a"
                batch_entries = entries

                def _write_batch(m: str = mode, e: list = batch_entries) -> None:
                    # Blocking zipfile I/O; runs in a worker thread.
                    with zipfile.ZipFile(tmp_path, m, zipfile.ZIP_DEFLATED) as zf:
                        for path, content in e:
                            zf.writestr(path, content)

                await asyncio.to_thread(_write_batch)
                is_first_batch = False

            offset += batch_size

        if is_first_batch:
            # Nothing was written: produce a valid (empty) ZIP archive
            # instead of returning the raw 0-byte temp file, which is not
            # a ZIP and would fail to open on the client.
            def _write_empty() -> None:
                with zipfile.ZipFile(tmp_path, "w", zipfile.ZIP_DEFLATED):
                    pass

            await asyncio.to_thread(_write_empty)

        export_name = "knowledge-base"
        if folder_id is not None and folder_id in folder_path_map:
            # The selected folder is a root of the queried subtree, so its
            # mapped path's first segment is its own sanitized name.
            export_name = _sanitize_filename(folder_path_map[folder_id].split("/")[0])

        return ExportResult(
            zip_path=tmp_path,
            export_name=export_name,
            zip_size=os.path.getsize(tmp_path),
            skipped_docs=skipped_docs,
        )

    except Exception:
        # Never leak the temp file on failure.
        if os.path.exists(tmp_path):
            os.unlink(tmp_path)
        raise


def _claim_unique_path(
    used_paths: dict[str, int],
    file_path: str,
    dir_path: str,
    base_name: str,
) -> str:
    """Reserve *file_path* in *used_paths*, suffixing '_N' on duplicates.

    Unlike a single-step rename, the suffixed candidate is itself checked
    for availability, so a real document titled e.g. 'x_2' can never
    collide with the auto-renamed second copy of 'x' (which would produce
    duplicate entry names inside the ZIP).
    """
    count = used_paths.get(file_path, 0)
    used_paths[file_path] = count + 1
    if count == 0:
        return file_path
    suffix = count + 1
    while True:
        candidate = (
            f"{dir_path}/{base_name}_{suffix}.md"
            if dir_path
            else f"{base_name}_{suffix}.md"
        )
        if candidate not in used_paths:
            used_paths[candidate] = 1
            return candidate
        suffix += 1
|
||||||
|
|
@ -19,6 +19,7 @@ class TaskDispatcher(Protocol):
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
should_summarize: bool = False,
|
should_summarize: bool = False,
|
||||||
|
use_vision_llm: bool = False,
|
||||||
) -> None: ...
|
) -> None: ...
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -34,6 +35,7 @@ class CeleryTaskDispatcher:
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
should_summarize: bool = False,
|
should_summarize: bool = False,
|
||||||
|
use_vision_llm: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
from app.tasks.celery_tasks.document_tasks import (
|
from app.tasks.celery_tasks.document_tasks import (
|
||||||
process_file_upload_with_document_task,
|
process_file_upload_with_document_task,
|
||||||
|
|
@ -46,6 +48,7 @@ class CeleryTaskDispatcher:
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
should_summarize=should_summarize,
|
should_summarize=should_summarize,
|
||||||
|
use_vision_llm=use_vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -778,6 +778,7 @@ def process_file_upload_with_document_task(
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
should_summarize: bool = False,
|
should_summarize: bool = False,
|
||||||
|
use_vision_llm: bool = False,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Celery task to process uploaded file with existing pending document.
|
Celery task to process uploaded file with existing pending document.
|
||||||
|
|
@ -833,6 +834,7 @@ def process_file_upload_with_document_task(
|
||||||
search_space_id,
|
search_space_id,
|
||||||
user_id,
|
user_id,
|
||||||
should_summarize=should_summarize,
|
should_summarize=should_summarize,
|
||||||
|
use_vision_llm=use_vision_llm,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
logger.info(
|
logger.info(
|
||||||
|
|
@ -869,6 +871,7 @@ async def _process_file_with_document(
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
should_summarize: bool = False,
|
should_summarize: bool = False,
|
||||||
|
use_vision_llm: bool = False,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Process file and update existing pending document status.
|
Process file and update existing pending document status.
|
||||||
|
|
@ -971,6 +974,7 @@ async def _process_file_with_document(
|
||||||
log_entry=log_entry,
|
log_entry=log_entry,
|
||||||
notification=notification,
|
notification=notification,
|
||||||
should_summarize=should_summarize,
|
should_summarize=should_summarize,
|
||||||
|
use_vision_llm=use_vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Update notification on success
|
# Update notification on success
|
||||||
|
|
@ -1428,6 +1432,7 @@ def index_uploaded_folder_files_task(
|
||||||
root_folder_id: int,
|
root_folder_id: int,
|
||||||
enable_summary: bool,
|
enable_summary: bool,
|
||||||
file_mappings: list[dict],
|
file_mappings: list[dict],
|
||||||
|
use_vision_llm: bool = False,
|
||||||
):
|
):
|
||||||
"""Celery task to index files uploaded from the desktop app."""
|
"""Celery task to index files uploaded from the desktop app."""
|
||||||
loop = asyncio.new_event_loop()
|
loop = asyncio.new_event_loop()
|
||||||
|
|
@ -1441,6 +1446,7 @@ def index_uploaded_folder_files_task(
|
||||||
root_folder_id=root_folder_id,
|
root_folder_id=root_folder_id,
|
||||||
enable_summary=enable_summary,
|
enable_summary=enable_summary,
|
||||||
file_mappings=file_mappings,
|
file_mappings=file_mappings,
|
||||||
|
use_vision_llm=use_vision_llm,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
finally:
|
finally:
|
||||||
|
|
@ -1454,6 +1460,7 @@ async def _index_uploaded_folder_files_async(
|
||||||
root_folder_id: int,
|
root_folder_id: int,
|
||||||
enable_summary: bool,
|
enable_summary: bool,
|
||||||
file_mappings: list[dict],
|
file_mappings: list[dict],
|
||||||
|
use_vision_llm: bool = False,
|
||||||
):
|
):
|
||||||
"""Run upload-based folder indexing with notification + heartbeat."""
|
"""Run upload-based folder indexing with notification + heartbeat."""
|
||||||
file_count = len(file_mappings)
|
file_count = len(file_mappings)
|
||||||
|
|
@ -1503,6 +1510,7 @@ async def _index_uploaded_folder_files_async(
|
||||||
enable_summary=enable_summary,
|
enable_summary=enable_summary,
|
||||||
file_mappings=file_mappings,
|
file_mappings=file_mappings,
|
||||||
on_heartbeat_callback=_heartbeat_progress,
|
on_heartbeat_callback=_heartbeat_progress,
|
||||||
|
use_vision_llm=use_vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
if notification:
|
if notification:
|
||||||
|
|
|
||||||
|
|
@ -164,6 +164,7 @@ async def _download_files_parallel(
|
||||||
enable_summary: bool,
|
enable_summary: bool,
|
||||||
max_concurrency: int = 3,
|
max_concurrency: int = 3,
|
||||||
on_heartbeat: HeartbeatCallbackType | None = None,
|
on_heartbeat: HeartbeatCallbackType | None = None,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[list[ConnectorDocument], int]:
|
) -> tuple[list[ConnectorDocument], int]:
|
||||||
"""Download and ETL files in parallel. Returns (docs, failed_count)."""
|
"""Download and ETL files in parallel. Returns (docs, failed_count)."""
|
||||||
results: list[ConnectorDocument] = []
|
results: list[ConnectorDocument] = []
|
||||||
|
|
@ -176,7 +177,7 @@ async def _download_files_parallel(
|
||||||
nonlocal last_heartbeat, completed_count
|
nonlocal last_heartbeat, completed_count
|
||||||
async with sem:
|
async with sem:
|
||||||
markdown, db_metadata, error = await download_and_extract_content(
|
markdown, db_metadata, error = await download_and_extract_content(
|
||||||
dropbox_client, file
|
dropbox_client, file, vision_llm=vision_llm
|
||||||
)
|
)
|
||||||
if error or not markdown:
|
if error or not markdown:
|
||||||
file_name = file.get("name", "Unknown")
|
file_name = file.get("name", "Unknown")
|
||||||
|
|
@ -224,6 +225,7 @@ async def _download_and_index(
|
||||||
user_id: str,
|
user_id: str,
|
||||||
enable_summary: bool,
|
enable_summary: bool,
|
||||||
on_heartbeat: HeartbeatCallbackType | None = None,
|
on_heartbeat: HeartbeatCallbackType | None = None,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[int, int]:
|
) -> tuple[int, int]:
|
||||||
"""Parallel download then parallel indexing. Returns (batch_indexed, total_failed)."""
|
"""Parallel download then parallel indexing. Returns (batch_indexed, total_failed)."""
|
||||||
connector_docs, download_failed = await _download_files_parallel(
|
connector_docs, download_failed = await _download_files_parallel(
|
||||||
|
|
@ -234,6 +236,7 @@ async def _download_and_index(
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
enable_summary=enable_summary,
|
||||||
on_heartbeat=on_heartbeat,
|
on_heartbeat=on_heartbeat,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
batch_indexed = 0
|
batch_indexed = 0
|
||||||
|
|
@ -287,6 +290,7 @@ async def _index_with_delta_sync(
|
||||||
max_files: int,
|
max_files: int,
|
||||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||||
enable_summary: bool = True,
|
enable_summary: bool = True,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[int, int, int, str]:
|
) -> tuple[int, int, int, str]:
|
||||||
"""Delta sync using Dropbox cursor-based change tracking.
|
"""Delta sync using Dropbox cursor-based change tracking.
|
||||||
|
|
||||||
|
|
@ -359,6 +363,7 @@ async def _index_with_delta_sync(
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
enable_summary=enable_summary,
|
||||||
on_heartbeat=on_heartbeat_callback,
|
on_heartbeat=on_heartbeat_callback,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
indexed = renamed_count + batch_indexed
|
indexed = renamed_count + batch_indexed
|
||||||
|
|
@ -384,6 +389,7 @@ async def _index_full_scan(
|
||||||
incremental_sync: bool = True,
|
incremental_sync: bool = True,
|
||||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||||
enable_summary: bool = True,
|
enable_summary: bool = True,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[int, int, int]:
|
) -> tuple[int, int, int]:
|
||||||
"""Full scan indexing of a folder.
|
"""Full scan indexing of a folder.
|
||||||
|
|
||||||
|
|
@ -469,6 +475,7 @@ async def _index_full_scan(
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
enable_summary=enable_summary,
|
||||||
on_heartbeat=on_heartbeat_callback,
|
on_heartbeat=on_heartbeat_callback,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
|
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
|
||||||
|
|
@ -498,6 +505,7 @@ async def _index_selected_files(
|
||||||
enable_summary: bool,
|
enable_summary: bool,
|
||||||
incremental_sync: bool = True,
|
incremental_sync: bool = True,
|
||||||
on_heartbeat: HeartbeatCallbackType | None = None,
|
on_heartbeat: HeartbeatCallbackType | None = None,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[int, int, int, list[str]]:
|
) -> tuple[int, int, int, list[str]]:
|
||||||
"""Index user-selected files using the parallel pipeline."""
|
"""Index user-selected files using the parallel pipeline."""
|
||||||
page_limit_service = PageLimitService(session)
|
page_limit_service = PageLimitService(session)
|
||||||
|
|
@ -557,6 +565,7 @@ async def _index_selected_files(
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
enable_summary=enable_summary,
|
||||||
on_heartbeat=on_heartbeat,
|
on_heartbeat=on_heartbeat,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
|
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
|
||||||
|
|
@ -621,6 +630,13 @@ async def index_dropbox_files(
|
||||||
return 0, 0, error_msg, 0
|
return 0, 0, error_msg, 0
|
||||||
|
|
||||||
connector_enable_summary = getattr(connector, "enable_summary", True)
|
connector_enable_summary = getattr(connector, "enable_summary", True)
|
||||||
|
connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False)
|
||||||
|
vision_llm = None
|
||||||
|
if connector_enable_vision_llm:
|
||||||
|
from app.services.llm_service import get_vision_llm
|
||||||
|
|
||||||
|
vision_llm = await get_vision_llm(session, search_space_id)
|
||||||
|
|
||||||
dropbox_client = DropboxClient(session, connector_id)
|
dropbox_client = DropboxClient(session, connector_id)
|
||||||
|
|
||||||
indexing_options = items_dict.get("indexing_options", {})
|
indexing_options = items_dict.get("indexing_options", {})
|
||||||
|
|
@ -650,6 +666,7 @@ async def index_dropbox_files(
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=connector_enable_summary,
|
enable_summary=connector_enable_summary,
|
||||||
incremental_sync=incremental_sync,
|
incremental_sync=incremental_sync,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
total_indexed += indexed
|
total_indexed += indexed
|
||||||
total_skipped += skipped
|
total_skipped += skipped
|
||||||
|
|
@ -684,6 +701,7 @@ async def index_dropbox_files(
|
||||||
log_entry,
|
log_entry,
|
||||||
max_files,
|
max_files,
|
||||||
enable_summary=connector_enable_summary,
|
enable_summary=connector_enable_summary,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
folder_cursors[folder_path] = new_cursor
|
folder_cursors[folder_path] = new_cursor
|
||||||
total_unsupported += unsup
|
total_unsupported += unsup
|
||||||
|
|
@ -703,6 +721,7 @@ async def index_dropbox_files(
|
||||||
include_subfolders,
|
include_subfolders,
|
||||||
incremental_sync=incremental_sync,
|
incremental_sync=incremental_sync,
|
||||||
enable_summary=connector_enable_summary,
|
enable_summary=connector_enable_summary,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
total_unsupported += unsup
|
total_unsupported += unsup
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -261,6 +261,7 @@ async def _download_files_parallel(
|
||||||
enable_summary: bool,
|
enable_summary: bool,
|
||||||
max_concurrency: int = 3,
|
max_concurrency: int = 3,
|
||||||
on_heartbeat: HeartbeatCallbackType | None = None,
|
on_heartbeat: HeartbeatCallbackType | None = None,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[list[ConnectorDocument], int]:
|
) -> tuple[list[ConnectorDocument], int]:
|
||||||
"""Download and ETL files in parallel, returning ConnectorDocuments.
|
"""Download and ETL files in parallel, returning ConnectorDocuments.
|
||||||
|
|
||||||
|
|
@ -276,7 +277,7 @@ async def _download_files_parallel(
|
||||||
nonlocal last_heartbeat, completed_count
|
nonlocal last_heartbeat, completed_count
|
||||||
async with sem:
|
async with sem:
|
||||||
markdown, drive_metadata, error = await download_and_extract_content(
|
markdown, drive_metadata, error = await download_and_extract_content(
|
||||||
drive_client, file
|
drive_client, file, vision_llm=vision_llm
|
||||||
)
|
)
|
||||||
if error or not markdown:
|
if error or not markdown:
|
||||||
file_name = file.get("name", "Unknown")
|
file_name = file.get("name", "Unknown")
|
||||||
|
|
@ -322,6 +323,7 @@ async def _process_single_file(
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
enable_summary: bool = True,
|
enable_summary: bool = True,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[int, int, int]:
|
) -> tuple[int, int, int]:
|
||||||
"""Download, extract, and index a single Drive file via the pipeline.
|
"""Download, extract, and index a single Drive file via the pipeline.
|
||||||
|
|
||||||
|
|
@ -343,7 +345,7 @@ async def _process_single_file(
|
||||||
await page_limit_service.check_page_limit(user_id, estimated_pages)
|
await page_limit_service.check_page_limit(user_id, estimated_pages)
|
||||||
|
|
||||||
markdown, drive_metadata, error = await download_and_extract_content(
|
markdown, drive_metadata, error = await download_and_extract_content(
|
||||||
drive_client, file
|
drive_client, file, vision_llm=vision_llm
|
||||||
)
|
)
|
||||||
if error or not markdown:
|
if error or not markdown:
|
||||||
logger.warning(f"ETL failed for {file_name}: {error}")
|
logger.warning(f"ETL failed for {file_name}: {error}")
|
||||||
|
|
@ -433,6 +435,7 @@ async def _download_and_index(
|
||||||
user_id: str,
|
user_id: str,
|
||||||
enable_summary: bool,
|
enable_summary: bool,
|
||||||
on_heartbeat: HeartbeatCallbackType | None = None,
|
on_heartbeat: HeartbeatCallbackType | None = None,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[int, int]:
|
) -> tuple[int, int]:
|
||||||
"""Phase 2+3: parallel download then parallel indexing.
|
"""Phase 2+3: parallel download then parallel indexing.
|
||||||
|
|
||||||
|
|
@ -446,6 +449,7 @@ async def _download_and_index(
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
enable_summary=enable_summary,
|
||||||
on_heartbeat=on_heartbeat,
|
on_heartbeat=on_heartbeat,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
batch_indexed = 0
|
batch_indexed = 0
|
||||||
|
|
@ -476,6 +480,7 @@ async def _index_selected_files(
|
||||||
user_id: str,
|
user_id: str,
|
||||||
enable_summary: bool,
|
enable_summary: bool,
|
||||||
on_heartbeat: HeartbeatCallbackType | None = None,
|
on_heartbeat: HeartbeatCallbackType | None = None,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[int, int, int, list[str]]:
|
) -> tuple[int, int, int, list[str]]:
|
||||||
"""Index user-selected files using the parallel pipeline.
|
"""Index user-selected files using the parallel pipeline.
|
||||||
|
|
||||||
|
|
@ -540,6 +545,7 @@ async def _index_selected_files(
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
enable_summary=enable_summary,
|
||||||
on_heartbeat=on_heartbeat,
|
on_heartbeat=on_heartbeat,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
|
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
|
||||||
|
|
@ -573,6 +579,7 @@ async def _index_full_scan(
|
||||||
include_subfolders: bool = False,
|
include_subfolders: bool = False,
|
||||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||||
enable_summary: bool = True,
|
enable_summary: bool = True,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[int, int, int]:
|
) -> tuple[int, int, int]:
|
||||||
"""Full scan indexing of a folder.
|
"""Full scan indexing of a folder.
|
||||||
|
|
||||||
|
|
@ -703,6 +710,7 @@ async def _index_full_scan(
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
enable_summary=enable_summary,
|
||||||
on_heartbeat=on_heartbeat_callback,
|
on_heartbeat=on_heartbeat_callback,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
|
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
|
||||||
|
|
@ -736,6 +744,7 @@ async def _index_with_delta_sync(
|
||||||
include_subfolders: bool = False,
|
include_subfolders: bool = False,
|
||||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||||
enable_summary: bool = True,
|
enable_summary: bool = True,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[int, int, int]:
|
) -> tuple[int, int, int]:
|
||||||
"""Delta sync using change tracking.
|
"""Delta sync using change tracking.
|
||||||
|
|
||||||
|
|
@ -844,6 +853,7 @@ async def _index_with_delta_sync(
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
enable_summary=enable_summary,
|
||||||
on_heartbeat=on_heartbeat_callback,
|
on_heartbeat=on_heartbeat_callback,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
|
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
|
||||||
|
|
@ -947,6 +957,11 @@ async def index_google_drive_files(
|
||||||
)
|
)
|
||||||
|
|
||||||
connector_enable_summary = getattr(connector, "enable_summary", True)
|
connector_enable_summary = getattr(connector, "enable_summary", True)
|
||||||
|
connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False)
|
||||||
|
vision_llm = None
|
||||||
|
if connector_enable_vision_llm:
|
||||||
|
from app.services.llm_service import get_vision_llm
|
||||||
|
vision_llm = await get_vision_llm(session, search_space_id)
|
||||||
drive_client = GoogleDriveClient(
|
drive_client = GoogleDriveClient(
|
||||||
session, connector_id, credentials=pre_built_credentials
|
session, connector_id, credentials=pre_built_credentials
|
||||||
)
|
)
|
||||||
|
|
@ -986,6 +1001,7 @@ async def index_google_drive_files(
|
||||||
include_subfolders,
|
include_subfolders,
|
||||||
on_heartbeat_callback,
|
on_heartbeat_callback,
|
||||||
connector_enable_summary,
|
connector_enable_summary,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
documents_unsupported += du
|
documents_unsupported += du
|
||||||
logger.info("Running reconciliation scan after delta sync")
|
logger.info("Running reconciliation scan after delta sync")
|
||||||
|
|
@ -1004,6 +1020,7 @@ async def index_google_drive_files(
|
||||||
include_subfolders,
|
include_subfolders,
|
||||||
on_heartbeat_callback,
|
on_heartbeat_callback,
|
||||||
connector_enable_summary,
|
connector_enable_summary,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
documents_indexed += ri
|
documents_indexed += ri
|
||||||
documents_skipped += rs
|
documents_skipped += rs
|
||||||
|
|
@ -1029,6 +1046,7 @@ async def index_google_drive_files(
|
||||||
include_subfolders,
|
include_subfolders,
|
||||||
on_heartbeat_callback,
|
on_heartbeat_callback,
|
||||||
connector_enable_summary,
|
connector_enable_summary,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
if documents_indexed > 0 or can_use_delta:
|
if documents_indexed > 0 or can_use_delta:
|
||||||
|
|
@ -1146,6 +1164,11 @@ async def index_google_drive_single_file(
|
||||||
)
|
)
|
||||||
|
|
||||||
connector_enable_summary = getattr(connector, "enable_summary", True)
|
connector_enable_summary = getattr(connector, "enable_summary", True)
|
||||||
|
connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False)
|
||||||
|
vision_llm = None
|
||||||
|
if connector_enable_vision_llm:
|
||||||
|
from app.services.llm_service import get_vision_llm
|
||||||
|
vision_llm = await get_vision_llm(session, search_space_id)
|
||||||
drive_client = GoogleDriveClient(
|
drive_client = GoogleDriveClient(
|
||||||
session, connector_id, credentials=pre_built_credentials
|
session, connector_id, credentials=pre_built_credentials
|
||||||
)
|
)
|
||||||
|
|
@ -1168,6 +1191,7 @@ async def index_google_drive_single_file(
|
||||||
search_space_id,
|
search_space_id,
|
||||||
user_id,
|
user_id,
|
||||||
connector_enable_summary,
|
connector_enable_summary,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
await session.commit()
|
await session.commit()
|
||||||
|
|
||||||
|
|
@ -1278,6 +1302,11 @@ async def index_google_drive_selected_files(
|
||||||
return 0, 0, [error_msg]
|
return 0, 0, [error_msg]
|
||||||
|
|
||||||
connector_enable_summary = getattr(connector, "enable_summary", True)
|
connector_enable_summary = getattr(connector, "enable_summary", True)
|
||||||
|
connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False)
|
||||||
|
vision_llm = None
|
||||||
|
if connector_enable_vision_llm:
|
||||||
|
from app.services.llm_service import get_vision_llm
|
||||||
|
vision_llm = await get_vision_llm(session, search_space_id)
|
||||||
drive_client = GoogleDriveClient(
|
drive_client = GoogleDriveClient(
|
||||||
session, connector_id, credentials=pre_built_credentials
|
session, connector_id, credentials=pre_built_credentials
|
||||||
)
|
)
|
||||||
|
|
@ -1291,6 +1320,7 @@ async def index_google_drive_selected_files(
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=connector_enable_summary,
|
enable_summary=connector_enable_summary,
|
||||||
on_heartbeat=on_heartbeat_callback,
|
on_heartbeat=on_heartbeat_callback,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
if unsupported > 0:
|
if unsupported > 0:
|
||||||
|
|
|
||||||
|
|
@ -153,16 +153,16 @@ def scan_folder(
|
||||||
return files
|
return files
|
||||||
|
|
||||||
|
|
||||||
async def _read_file_content(file_path: str, filename: str) -> str:
|
async def _read_file_content(file_path: str, filename: str, *, vision_llm=None) -> str:
|
||||||
"""Read file content via the unified ETL pipeline.
|
"""Read file content via the unified ETL pipeline.
|
||||||
|
|
||||||
All file types (plaintext, audio, direct-convert, document) are handled
|
All file types (plaintext, audio, direct-convert, document, image) are
|
||||||
by ``EtlPipelineService``.
|
handled by ``EtlPipelineService``.
|
||||||
"""
|
"""
|
||||||
from app.etl_pipeline.etl_document import EtlRequest
|
from app.etl_pipeline.etl_document import EtlRequest
|
||||||
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
|
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
|
||||||
|
|
||||||
result = await EtlPipelineService().extract(
|
result = await EtlPipelineService(vision_llm=vision_llm).extract(
|
||||||
EtlRequest(file_path=file_path, filename=filename)
|
EtlRequest(file_path=file_path, filename=filename)
|
||||||
)
|
)
|
||||||
return result.markdown_content
|
return result.markdown_content
|
||||||
|
|
@ -199,12 +199,14 @@ async def _compute_file_content_hash(
|
||||||
file_path: str,
|
file_path: str,
|
||||||
filename: str,
|
filename: str,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
|
*,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[str, str]:
|
) -> tuple[str, str]:
|
||||||
"""Read a file (via ETL if needed) and compute its content hash.
|
"""Read a file (via ETL if needed) and compute its content hash.
|
||||||
|
|
||||||
Returns (content_text, content_hash).
|
Returns (content_text, content_hash).
|
||||||
"""
|
"""
|
||||||
content = await _read_file_content(file_path, filename)
|
content = await _read_file_content(file_path, filename, vision_llm=vision_llm)
|
||||||
return content, _content_hash(content, search_space_id)
|
return content, _content_hash(content, search_space_id)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -704,7 +706,9 @@ async def index_local_folder(
|
||||||
|
|
||||||
try:
|
try:
|
||||||
content, content_hash = await _compute_file_content_hash(
|
content, content_hash = await _compute_file_content_hash(
|
||||||
file_path_abs, file_info["relative_path"], search_space_id
|
file_path_abs,
|
||||||
|
file_info["relative_path"],
|
||||||
|
search_space_id,
|
||||||
)
|
)
|
||||||
except Exception as read_err:
|
except Exception as read_err:
|
||||||
logger.warning(f"Could not read {file_path_abs}: {read_err}")
|
logger.warning(f"Could not read {file_path_abs}: {read_err}")
|
||||||
|
|
@ -738,7 +742,9 @@ async def index_local_folder(
|
||||||
|
|
||||||
try:
|
try:
|
||||||
content, content_hash = await _compute_file_content_hash(
|
content, content_hash = await _compute_file_content_hash(
|
||||||
file_path_abs, file_info["relative_path"], search_space_id
|
file_path_abs,
|
||||||
|
file_info["relative_path"],
|
||||||
|
search_space_id,
|
||||||
)
|
)
|
||||||
except Exception as read_err:
|
except Exception as read_err:
|
||||||
logger.warning(f"Could not read {file_path_abs}: {read_err}")
|
logger.warning(f"Could not read {file_path_abs}: {read_err}")
|
||||||
|
|
@ -1264,6 +1270,7 @@ async def index_uploaded_files(
|
||||||
enable_summary: bool,
|
enable_summary: bool,
|
||||||
file_mappings: list[dict],
|
file_mappings: list[dict],
|
||||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||||
|
use_vision_llm: bool = False,
|
||||||
) -> tuple[int, int, str | None]:
|
) -> tuple[int, int, str | None]:
|
||||||
"""Index files uploaded from the desktop app via temp paths.
|
"""Index files uploaded from the desktop app via temp paths.
|
||||||
|
|
||||||
|
|
@ -1300,6 +1307,12 @@ async def index_uploaded_files(
|
||||||
pipeline = IndexingPipelineService(session)
|
pipeline = IndexingPipelineService(session)
|
||||||
llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
||||||
|
|
||||||
|
vision_llm_instance = None
|
||||||
|
if use_vision_llm:
|
||||||
|
from app.services.llm_service import get_vision_llm
|
||||||
|
|
||||||
|
vision_llm_instance = await get_vision_llm(session, search_space_id)
|
||||||
|
|
||||||
indexed_count = 0
|
indexed_count = 0
|
||||||
failed_count = 0
|
failed_count = 0
|
||||||
errors: list[str] = []
|
errors: list[str] = []
|
||||||
|
|
@ -1347,7 +1360,8 @@ async def index_uploaded_files(
|
||||||
|
|
||||||
try:
|
try:
|
||||||
content, content_hash = await _compute_file_content_hash(
|
content, content_hash = await _compute_file_content_hash(
|
||||||
temp_path, filename, search_space_id
|
temp_path, filename, search_space_id,
|
||||||
|
vision_llm=vision_llm_instance,
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Could not read {relative_path}: {e}")
|
logger.warning(f"Could not read {relative_path}: {e}")
|
||||||
|
|
|
||||||
|
|
@ -171,6 +171,7 @@ async def _download_files_parallel(
|
||||||
enable_summary: bool,
|
enable_summary: bool,
|
||||||
max_concurrency: int = 3,
|
max_concurrency: int = 3,
|
||||||
on_heartbeat: HeartbeatCallbackType | None = None,
|
on_heartbeat: HeartbeatCallbackType | None = None,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[list[ConnectorDocument], int]:
|
) -> tuple[list[ConnectorDocument], int]:
|
||||||
"""Download and ETL files in parallel. Returns (docs, failed_count)."""
|
"""Download and ETL files in parallel. Returns (docs, failed_count)."""
|
||||||
results: list[ConnectorDocument] = []
|
results: list[ConnectorDocument] = []
|
||||||
|
|
@ -183,7 +184,7 @@ async def _download_files_parallel(
|
||||||
nonlocal last_heartbeat, completed_count
|
nonlocal last_heartbeat, completed_count
|
||||||
async with sem:
|
async with sem:
|
||||||
markdown, od_metadata, error = await download_and_extract_content(
|
markdown, od_metadata, error = await download_and_extract_content(
|
||||||
onedrive_client, file
|
onedrive_client, file, vision_llm=vision_llm
|
||||||
)
|
)
|
||||||
if error or not markdown:
|
if error or not markdown:
|
||||||
file_name = file.get("name", "Unknown")
|
file_name = file.get("name", "Unknown")
|
||||||
|
|
@ -231,6 +232,7 @@ async def _download_and_index(
|
||||||
user_id: str,
|
user_id: str,
|
||||||
enable_summary: bool,
|
enable_summary: bool,
|
||||||
on_heartbeat: HeartbeatCallbackType | None = None,
|
on_heartbeat: HeartbeatCallbackType | None = None,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[int, int]:
|
) -> tuple[int, int]:
|
||||||
"""Parallel download then parallel indexing. Returns (batch_indexed, total_failed)."""
|
"""Parallel download then parallel indexing. Returns (batch_indexed, total_failed)."""
|
||||||
connector_docs, download_failed = await _download_files_parallel(
|
connector_docs, download_failed = await _download_files_parallel(
|
||||||
|
|
@ -241,6 +243,7 @@ async def _download_and_index(
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
enable_summary=enable_summary,
|
||||||
on_heartbeat=on_heartbeat,
|
on_heartbeat=on_heartbeat,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
batch_indexed = 0
|
batch_indexed = 0
|
||||||
|
|
@ -293,6 +296,7 @@ async def _index_selected_files(
|
||||||
user_id: str,
|
user_id: str,
|
||||||
enable_summary: bool,
|
enable_summary: bool,
|
||||||
on_heartbeat: HeartbeatCallbackType | None = None,
|
on_heartbeat: HeartbeatCallbackType | None = None,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[int, int, int, list[str]]:
|
) -> tuple[int, int, int, list[str]]:
|
||||||
"""Index user-selected files using the parallel pipeline."""
|
"""Index user-selected files using the parallel pipeline."""
|
||||||
page_limit_service = PageLimitService(session)
|
page_limit_service = PageLimitService(session)
|
||||||
|
|
@ -343,6 +347,7 @@ async def _index_selected_files(
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
enable_summary=enable_summary,
|
||||||
on_heartbeat=on_heartbeat,
|
on_heartbeat=on_heartbeat,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
|
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
|
||||||
|
|
@ -375,6 +380,7 @@ async def _index_full_scan(
|
||||||
include_subfolders: bool = True,
|
include_subfolders: bool = True,
|
||||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||||
enable_summary: bool = True,
|
enable_summary: bool = True,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[int, int, int]:
|
) -> tuple[int, int, int]:
|
||||||
"""Full scan indexing of a folder.
|
"""Full scan indexing of a folder.
|
||||||
|
|
||||||
|
|
@ -450,6 +456,7 @@ async def _index_full_scan(
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
enable_summary=enable_summary,
|
||||||
on_heartbeat=on_heartbeat_callback,
|
on_heartbeat=on_heartbeat_callback,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
|
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
|
||||||
|
|
@ -481,6 +488,7 @@ async def _index_with_delta_sync(
|
||||||
max_files: int,
|
max_files: int,
|
||||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||||
enable_summary: bool = True,
|
enable_summary: bool = True,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[int, int, int, str | None]:
|
) -> tuple[int, int, int, str | None]:
|
||||||
"""Delta sync using OneDrive change tracking.
|
"""Delta sync using OneDrive change tracking.
|
||||||
|
|
||||||
|
|
@ -573,6 +581,7 @@ async def _index_with_delta_sync(
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
enable_summary=enable_summary,
|
||||||
on_heartbeat=on_heartbeat_callback,
|
on_heartbeat=on_heartbeat_callback,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
|
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
|
||||||
|
|
@ -643,6 +652,12 @@ async def index_onedrive_files(
|
||||||
return 0, 0, error_msg, 0
|
return 0, 0, error_msg, 0
|
||||||
|
|
||||||
connector_enable_summary = getattr(connector, "enable_summary", True)
|
connector_enable_summary = getattr(connector, "enable_summary", True)
|
||||||
|
connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False)
|
||||||
|
vision_llm = None
|
||||||
|
if connector_enable_vision_llm:
|
||||||
|
from app.services.llm_service import get_vision_llm
|
||||||
|
vision_llm = await get_vision_llm(session, search_space_id)
|
||||||
|
|
||||||
onedrive_client = OneDriveClient(session, connector_id)
|
onedrive_client = OneDriveClient(session, connector_id)
|
||||||
|
|
||||||
indexing_options = items_dict.get("indexing_options", {})
|
indexing_options = items_dict.get("indexing_options", {})
|
||||||
|
|
@ -666,6 +681,7 @@ async def index_onedrive_files(
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=connector_enable_summary,
|
enable_summary=connector_enable_summary,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
total_indexed += indexed
|
total_indexed += indexed
|
||||||
total_skipped += skipped
|
total_skipped += skipped
|
||||||
|
|
@ -695,6 +711,7 @@ async def index_onedrive_files(
|
||||||
log_entry,
|
log_entry,
|
||||||
max_files,
|
max_files,
|
||||||
enable_summary=connector_enable_summary,
|
enable_summary=connector_enable_summary,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
total_indexed += indexed
|
total_indexed += indexed
|
||||||
total_skipped += skipped
|
total_skipped += skipped
|
||||||
|
|
@ -721,6 +738,7 @@ async def index_onedrive_files(
|
||||||
max_files,
|
max_files,
|
||||||
include_subfolders,
|
include_subfolders,
|
||||||
enable_summary=connector_enable_summary,
|
enable_summary=connector_enable_summary,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
total_indexed += ri
|
total_indexed += ri
|
||||||
total_skipped += rs
|
total_skipped += rs
|
||||||
|
|
@ -740,6 +758,7 @@ async def index_onedrive_files(
|
||||||
max_files,
|
max_files,
|
||||||
include_subfolders,
|
include_subfolders,
|
||||||
enable_summary=connector_enable_summary,
|
enable_summary=connector_enable_summary,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
total_indexed += indexed
|
total_indexed += indexed
|
||||||
total_skipped += skipped
|
total_skipped += skipped
|
||||||
|
|
|
||||||
|
|
@ -46,6 +46,7 @@ class _ProcessingContext:
|
||||||
log_entry: Log
|
log_entry: Log
|
||||||
connector: dict | None = None
|
connector: dict | None = None
|
||||||
notification: Notification | None = None
|
notification: Notification | None = None
|
||||||
|
use_vision_llm: bool = False
|
||||||
enable_summary: bool = field(init=False)
|
enable_summary: bool = field(init=False)
|
||||||
|
|
||||||
def __post_init__(self) -> None:
|
def __post_init__(self) -> None:
|
||||||
|
|
@ -118,9 +119,13 @@ async def _log_page_divergence(
|
||||||
|
|
||||||
|
|
||||||
async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | None:
|
async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | None:
|
||||||
"""Extract content from a non-document file (plaintext/direct_convert/audio) via the unified ETL pipeline."""
|
"""Extract content from a non-document file (plaintext/direct_convert/audio/image) via the unified ETL pipeline."""
|
||||||
from app.etl_pipeline.etl_document import EtlRequest
|
from app.etl_pipeline.etl_document import EtlRequest
|
||||||
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
|
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
|
||||||
|
from app.etl_pipeline.file_classifier import (
|
||||||
|
FileCategory,
|
||||||
|
classify_file as etl_classify,
|
||||||
|
)
|
||||||
|
|
||||||
await _notify(ctx, "parsing", "Processing file")
|
await _notify(ctx, "parsing", "Processing file")
|
||||||
await ctx.task_logger.log_task_progress(
|
await ctx.task_logger.log_task_progress(
|
||||||
|
|
@ -129,7 +134,13 @@ async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | No
|
||||||
{"processing_stage": "extracting"},
|
{"processing_stage": "extracting"},
|
||||||
)
|
)
|
||||||
|
|
||||||
etl_result = await EtlPipelineService().extract(
|
vision_llm = None
|
||||||
|
if ctx.use_vision_llm and etl_classify(ctx.filename) == FileCategory.IMAGE:
|
||||||
|
from app.services.llm_service import get_vision_llm
|
||||||
|
|
||||||
|
vision_llm = await get_vision_llm(ctx.session, ctx.search_space_id)
|
||||||
|
|
||||||
|
etl_result = await EtlPipelineService(vision_llm=vision_llm).extract(
|
||||||
EtlRequest(file_path=ctx.file_path, filename=ctx.filename)
|
EtlRequest(file_path=ctx.file_path, filename=ctx.filename)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -278,6 +289,7 @@ async def process_file_in_background(
|
||||||
log_entry: Log,
|
log_entry: Log,
|
||||||
connector: dict | None = None,
|
connector: dict | None = None,
|
||||||
notification: Notification | None = None,
|
notification: Notification | None = None,
|
||||||
|
use_vision_llm: bool = False,
|
||||||
) -> Document | None:
|
) -> Document | None:
|
||||||
ctx = _ProcessingContext(
|
ctx = _ProcessingContext(
|
||||||
session=session,
|
session=session,
|
||||||
|
|
@ -289,6 +301,7 @@ async def process_file_in_background(
|
||||||
log_entry=log_entry,
|
log_entry=log_entry,
|
||||||
connector=connector,
|
connector=connector,
|
||||||
notification=notification,
|
notification=notification,
|
||||||
|
use_vision_llm=use_vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
@ -333,11 +346,13 @@ async def process_file_in_background(
|
||||||
async def _extract_file_content(
|
async def _extract_file_content(
|
||||||
file_path: str,
|
file_path: str,
|
||||||
filename: str,
|
filename: str,
|
||||||
|
search_space_id: int,
|
||||||
session: AsyncSession,
|
session: AsyncSession,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
task_logger: TaskLoggingService,
|
task_logger: TaskLoggingService,
|
||||||
log_entry: Log,
|
log_entry: Log,
|
||||||
notification: Notification | None,
|
notification: Notification | None,
|
||||||
|
use_vision_llm: bool = False,
|
||||||
) -> tuple[str, str]:
|
) -> tuple[str, str]:
|
||||||
"""
|
"""
|
||||||
Extract markdown content from a file regardless of type.
|
Extract markdown content from a file regardless of type.
|
||||||
|
|
@ -360,6 +375,7 @@ async def _extract_file_content(
|
||||||
FileCategory.PLAINTEXT: "Reading file",
|
FileCategory.PLAINTEXT: "Reading file",
|
||||||
FileCategory.DIRECT_CONVERT: "Converting file",
|
FileCategory.DIRECT_CONVERT: "Converting file",
|
||||||
FileCategory.AUDIO: "Transcribing audio",
|
FileCategory.AUDIO: "Transcribing audio",
|
||||||
|
FileCategory.IMAGE: "Analyzing image",
|
||||||
FileCategory.UNSUPPORTED: "Unsupported file type",
|
FileCategory.UNSUPPORTED: "Unsupported file type",
|
||||||
FileCategory.DOCUMENT: "Extracting content",
|
FileCategory.DOCUMENT: "Extracting content",
|
||||||
}
|
}
|
||||||
|
|
@ -383,7 +399,13 @@ async def _extract_file_content(
|
||||||
estimated_pages = _estimate_pages_safe(page_limit_service, file_path)
|
estimated_pages = _estimate_pages_safe(page_limit_service, file_path)
|
||||||
await page_limit_service.check_page_limit(user_id, estimated_pages)
|
await page_limit_service.check_page_limit(user_id, estimated_pages)
|
||||||
|
|
||||||
result = await EtlPipelineService().extract(
|
vision_llm = None
|
||||||
|
if use_vision_llm and category == FileCategory.IMAGE:
|
||||||
|
from app.services.llm_service import get_vision_llm
|
||||||
|
|
||||||
|
vision_llm = await get_vision_llm(session, search_space_id)
|
||||||
|
|
||||||
|
result = await EtlPipelineService(vision_llm=vision_llm).extract(
|
||||||
EtlRequest(
|
EtlRequest(
|
||||||
file_path=file_path,
|
file_path=file_path,
|
||||||
filename=filename,
|
filename=filename,
|
||||||
|
|
@ -417,6 +439,7 @@ async def process_file_in_background_with_document(
|
||||||
connector: dict | None = None,
|
connector: dict | None = None,
|
||||||
notification: Notification | None = None,
|
notification: Notification | None = None,
|
||||||
should_summarize: bool = False,
|
should_summarize: bool = False,
|
||||||
|
use_vision_llm: bool = False,
|
||||||
) -> Document | None:
|
) -> Document | None:
|
||||||
"""
|
"""
|
||||||
Process file and update existing pending document (2-phase pattern).
|
Process file and update existing pending document (2-phase pattern).
|
||||||
|
|
@ -439,11 +462,13 @@ async def process_file_in_background_with_document(
|
||||||
markdown_content, etl_service = await _extract_file_content(
|
markdown_content, etl_service = await _extract_file_content(
|
||||||
file_path,
|
file_path,
|
||||||
filename,
|
filename,
|
||||||
|
search_space_id,
|
||||||
session,
|
session,
|
||||||
user_id,
|
user_id,
|
||||||
task_logger,
|
task_logger,
|
||||||
log_entry,
|
log_entry,
|
||||||
notification,
|
notification,
|
||||||
|
use_vision_llm=use_vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
if not markdown_content:
|
if not markdown_content:
|
||||||
|
|
|
||||||
|
|
@ -7,10 +7,33 @@ Extensions already covered by PLAINTEXT_EXTENSIONS, AUDIO_EXTENSIONS, or
|
||||||
DIRECT_CONVERT_EXTENSIONS in file_classifier are NOT repeated here -- these
|
DIRECT_CONVERT_EXTENSIONS in file_classifier are NOT repeated here -- these
|
||||||
sets are exclusively for the "document" ETL path (Docling / LlamaParse /
|
sets are exclusively for the "document" ETL path (Docling / LlamaParse /
|
||||||
Unstructured).
|
Unstructured).
|
||||||
|
|
||||||
|
Image extensions intentionally remain in the per-parser sets for fallback
|
||||||
|
compatibility. IMAGE_EXTENSIONS is used only for routing classification.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from pathlib import PurePosixPath
|
from pathlib import PurePosixPath
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Image extensions (used by file_classifier for routing to vision LLM)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
IMAGE_EXTENSIONS: frozenset[str] = frozenset(
|
||||||
|
{
|
||||||
|
".png",
|
||||||
|
".jpg",
|
||||||
|
".jpeg",
|
||||||
|
".gif",
|
||||||
|
".bmp",
|
||||||
|
".tiff",
|
||||||
|
".tif",
|
||||||
|
".webp",
|
||||||
|
".svg",
|
||||||
|
".heic",
|
||||||
|
".heif",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Per-parser document extension sets (from official documentation)
|
# Per-parser document extension sets (from official documentation)
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
|
||||||
|
|
@ -69,6 +69,7 @@ class InlineTaskDispatcher:
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
should_summarize: bool = False,
|
should_summarize: bool = False,
|
||||||
|
use_vision_llm: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
from app.tasks.celery_tasks.document_tasks import (
|
from app.tasks.celery_tasks.document_tasks import (
|
||||||
_process_file_with_document,
|
_process_file_with_document,
|
||||||
|
|
@ -82,6 +83,7 @@ class InlineTaskDispatcher:
|
||||||
search_space_id,
|
search_space_id,
|
||||||
user_id,
|
user_id,
|
||||||
should_summarize=should_summarize,
|
should_summarize=should_summarize,
|
||||||
|
use_vision_llm=use_vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -168,7 +168,7 @@ async def test_concurrency_bounded_by_semaphore(
|
||||||
active = 0
|
active = 0
|
||||||
peak = 0
|
peak = 0
|
||||||
|
|
||||||
async def _slow_extract(client, file):
|
async def _slow_extract(client, file, **kwargs):
|
||||||
nonlocal active, peak
|
nonlocal active, peak
|
||||||
async with lock:
|
async with lock:
|
||||||
active += 1
|
active += 1
|
||||||
|
|
@ -209,7 +209,7 @@ async def test_heartbeat_fires_during_parallel_downloads(
|
||||||
|
|
||||||
monkeypatch.setattr(_mod, "HEARTBEAT_INTERVAL_SECONDS", 0)
|
monkeypatch.setattr(_mod, "HEARTBEAT_INTERVAL_SECONDS", 0)
|
||||||
|
|
||||||
async def _slow_extract(client, file):
|
async def _slow_extract(client, file, **kwargs):
|
||||||
await asyncio.sleep(0.05)
|
await asyncio.sleep(0.05)
|
||||||
return _mock_extract_ok(file["id"], file["name"])
|
return _mock_extract_ok(file["id"], file["name"])
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -162,7 +162,7 @@ async def test_concurrency_bounded_by_semaphore(
|
||||||
active = 0
|
active = 0
|
||||||
peak = 0
|
peak = 0
|
||||||
|
|
||||||
async def _slow_extract(client, file):
|
async def _slow_extract(client, file, **kwargs):
|
||||||
nonlocal active, peak
|
nonlocal active, peak
|
||||||
async with lock:
|
async with lock:
|
||||||
active += 1
|
active += 1
|
||||||
|
|
@ -204,7 +204,7 @@ async def test_heartbeat_fires_during_parallel_downloads(
|
||||||
|
|
||||||
monkeypatch.setattr(_mod, "HEARTBEAT_INTERVAL_SECONDS", 0)
|
monkeypatch.setattr(_mod, "HEARTBEAT_INTERVAL_SECONDS", 0)
|
||||||
|
|
||||||
async def _slow_extract(client, file):
|
async def _slow_extract(client, file, **kwargs):
|
||||||
await asyncio.sleep(0.05)
|
await asyncio.sleep(0.05)
|
||||||
return _mock_extract_ok(file["id"], file["name"])
|
return _mock_extract_ok(file["id"], file["name"])
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -162,7 +162,7 @@ async def test_concurrency_bounded_by_semaphore(
|
||||||
active = 0
|
active = 0
|
||||||
peak = 0
|
peak = 0
|
||||||
|
|
||||||
async def _slow_extract(client, file):
|
async def _slow_extract(client, file, **kwargs):
|
||||||
nonlocal active, peak
|
nonlocal active, peak
|
||||||
async with lock:
|
async with lock:
|
||||||
active += 1
|
active += 1
|
||||||
|
|
@ -203,7 +203,7 @@ async def test_heartbeat_fires_during_parallel_downloads(
|
||||||
|
|
||||||
monkeypatch.setattr(_mod, "HEARTBEAT_INTERVAL_SECONDS", 0)
|
monkeypatch.setattr(_mod, "HEARTBEAT_INTERVAL_SECONDS", 0)
|
||||||
|
|
||||||
async def _slow_extract(client, file):
|
async def _slow_extract(client, file, **kwargs):
|
||||||
await asyncio.sleep(0.05)
|
await asyncio.sleep(0.05)
|
||||||
return _mock_extract_ok(file["id"], file["name"])
|
return _mock_extract_ok(file["id"], file["name"])
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -431,7 +431,7 @@ async def test_llamacloud_heif_accepted_only_with_azure_di(tmp_path, mocker):
|
||||||
mocker.patch("app.config.config.AZURE_DI_ENDPOINT", None, create=True)
|
mocker.patch("app.config.config.AZURE_DI_ENDPOINT", None, create=True)
|
||||||
mocker.patch("app.config.config.AZURE_DI_KEY", None, create=True)
|
mocker.patch("app.config.config.AZURE_DI_KEY", None, create=True)
|
||||||
|
|
||||||
with pytest.raises(EtlUnsupportedFileError, match="not supported by LLAMACLOUD"):
|
with pytest.raises(EtlUnsupportedFileError, match="document parser does not support this format"):
|
||||||
await EtlPipelineService().extract(
|
await EtlPipelineService().extract(
|
||||||
EtlRequest(file_path=str(heif_file), filename="photo.heif")
|
EtlRequest(file_path=str(heif_file), filename="photo.heif")
|
||||||
)
|
)
|
||||||
|
|
@ -549,8 +549,11 @@ def test_unsupported_extensions_classified_correctly(filename):
|
||||||
("doc.docx", "document"),
|
("doc.docx", "document"),
|
||||||
("slides.pptx", "document"),
|
("slides.pptx", "document"),
|
||||||
("sheet.xlsx", "document"),
|
("sheet.xlsx", "document"),
|
||||||
("photo.png", "document"),
|
("photo.png", "image"),
|
||||||
("photo.jpg", "document"),
|
("photo.jpg", "image"),
|
||||||
|
("photo.webp", "image"),
|
||||||
|
("photo.gif", "image"),
|
||||||
|
("photo.heic", "image"),
|
||||||
("book.epub", "document"),
|
("book.epub", "document"),
|
||||||
("letter.odt", "document"),
|
("letter.odt", "document"),
|
||||||
("readme.md", "plaintext"),
|
("readme.md", "plaintext"),
|
||||||
|
|
@ -680,3 +683,57 @@ async def test_extract_eml_with_docling_raises_unsupported(tmp_path, mocker):
|
||||||
await EtlPipelineService().extract(
|
await EtlPipelineService().extract(
|
||||||
EtlRequest(file_path=str(eml_file), filename="mail.eml")
|
EtlRequest(file_path=str(eml_file), filename="mail.eml")
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Image extraction via vision LLM
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
async def test_extract_image_with_vision_llm(tmp_path):
|
||||||
|
"""An image file is analyzed by the vision LLM when provided."""
|
||||||
|
from unittest.mock import AsyncMock, MagicMock
|
||||||
|
|
||||||
|
img_file = tmp_path / "photo.png"
|
||||||
|
img_file.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 50)
|
||||||
|
|
||||||
|
fake_response = MagicMock()
|
||||||
|
fake_response.content = "# A photo of a sunset over the ocean"
|
||||||
|
fake_llm = AsyncMock()
|
||||||
|
fake_llm.ainvoke.return_value = fake_response
|
||||||
|
|
||||||
|
service = EtlPipelineService(vision_llm=fake_llm)
|
||||||
|
result = await service.extract(
|
||||||
|
EtlRequest(file_path=str(img_file), filename="photo.png")
|
||||||
|
)
|
||||||
|
|
||||||
|
assert result.markdown_content == "# A photo of a sunset over the ocean"
|
||||||
|
assert result.etl_service == "VISION_LLM"
|
||||||
|
assert result.content_type == "image"
|
||||||
|
fake_llm.ainvoke.assert_called_once()
|
||||||
|
|
||||||
|
|
||||||
|
async def test_extract_image_falls_back_to_document_without_vision_llm(
|
||||||
|
tmp_path, mocker
|
||||||
|
):
|
||||||
|
"""Without a vision LLM, image files fall back to the document parser."""
|
||||||
|
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
|
||||||
|
|
||||||
|
fake_docling = mocker.AsyncMock()
|
||||||
|
fake_docling.process_document.return_value = {"content": "# OCR text from image"}
|
||||||
|
mocker.patch(
|
||||||
|
"app.services.docling_service.create_docling_service",
|
||||||
|
return_value=fake_docling,
|
||||||
|
)
|
||||||
|
|
||||||
|
img_file = tmp_path / "scan.png"
|
||||||
|
img_file.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 50)
|
||||||
|
|
||||||
|
service = EtlPipelineService()
|
||||||
|
result = await service.extract(
|
||||||
|
EtlRequest(file_path=str(img_file), filename="scan.png")
|
||||||
|
)
|
||||||
|
|
||||||
|
assert result.markdown_content == "# OCR text from image"
|
||||||
|
assert result.etl_service == "DOCLING"
|
||||||
|
assert result.content_type == "document"
|
||||||
|
|
|
||||||
|
|
@ -154,3 +154,40 @@ def test_get_extensions_for_none_returns_union():
|
||||||
)
|
)
|
||||||
|
|
||||||
assert get_document_extensions_for_service(None) == DOCUMENT_EXTENSIONS
|
assert get_document_extensions_for_service(None) == DOCUMENT_EXTENSIONS
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# IMAGE_EXTENSIONS
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"ext",
|
||||||
|
[
|
||||||
|
".png",
|
||||||
|
".jpg",
|
||||||
|
".jpeg",
|
||||||
|
".gif",
|
||||||
|
".bmp",
|
||||||
|
".tiff",
|
||||||
|
".tif",
|
||||||
|
".webp",
|
||||||
|
".svg",
|
||||||
|
".heic",
|
||||||
|
".heif",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_image_extensions_contains_expected(ext):
|
||||||
|
from app.utils.file_extensions import IMAGE_EXTENSIONS
|
||||||
|
|
||||||
|
assert ext in IMAGE_EXTENSIONS
|
||||||
|
|
||||||
|
|
||||||
|
def test_image_extensions_are_subset_of_document_extensions():
|
||||||
|
"""Image extensions used for routing should also be in DOCUMENT_EXTENSIONS for fallback."""
|
||||||
|
from app.utils.file_extensions import DOCUMENT_EXTENSIONS, IMAGE_EXTENSIONS
|
||||||
|
|
||||||
|
missing = IMAGE_EXTENSIONS - DOCUMENT_EXTENSIONS
|
||||||
|
assert not missing, (
|
||||||
|
f"Image extensions missing from document sets (breaks fallback): {missing}"
|
||||||
|
)
|
||||||
|
|
|
||||||
10
surfsense_desktop/.env.example
Normal file
10
surfsense_desktop/.env.example
Normal file
|
|
@ -0,0 +1,10 @@
|
||||||
|
# Electron-specific build-time configuration.
|
||||||
|
# Set before running pnpm dist:mac / dist:win / dist:linux.
|
||||||
|
|
||||||
|
# The hosted web frontend URL. Used to intercept OAuth redirects and keep them
|
||||||
|
# inside the desktop app. Set to your production frontend domain.
|
||||||
|
HOSTED_FRONTEND_URL=https://surfsense.net
|
||||||
|
|
||||||
|
# PostHog analytics (leave empty to disable)
|
||||||
|
POSTHOG_KEY=
|
||||||
|
POSTHOG_HOST=https://assets.surfsense.com
|
||||||
1
surfsense_desktop/.gitignore
vendored
1
surfsense_desktop/.gitignore
vendored
|
|
@ -1,3 +1,4 @@
|
||||||
node_modules/
|
node_modules/
|
||||||
dist/
|
dist/
|
||||||
release/
|
release/
|
||||||
|
.env
|
||||||
|
|
@ -98,6 +98,7 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
|
||||||
periodicEnabled,
|
periodicEnabled,
|
||||||
frequencyMinutes,
|
frequencyMinutes,
|
||||||
enableSummary,
|
enableSummary,
|
||||||
|
enableVisionLlm,
|
||||||
allConnectors,
|
allConnectors,
|
||||||
viewingAccountsType,
|
viewingAccountsType,
|
||||||
viewingMCPList,
|
viewingMCPList,
|
||||||
|
|
@ -109,6 +110,7 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
|
||||||
setPeriodicEnabled,
|
setPeriodicEnabled,
|
||||||
setFrequencyMinutes,
|
setFrequencyMinutes,
|
||||||
setEnableSummary,
|
setEnableSummary,
|
||||||
|
setEnableVisionLlm,
|
||||||
handleOpenChange,
|
handleOpenChange,
|
||||||
handleTabChange,
|
handleTabChange,
|
||||||
handleScroll,
|
handleScroll,
|
||||||
|
|
@ -279,6 +281,7 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
|
||||||
periodicEnabled={periodicEnabled}
|
periodicEnabled={periodicEnabled}
|
||||||
frequencyMinutes={frequencyMinutes}
|
frequencyMinutes={frequencyMinutes}
|
||||||
enableSummary={enableSummary}
|
enableSummary={enableSummary}
|
||||||
|
enableVisionLlm={enableVisionLlm}
|
||||||
isSaving={isSaving}
|
isSaving={isSaving}
|
||||||
isDisconnecting={isDisconnecting}
|
isDisconnecting={isDisconnecting}
|
||||||
isIndexing={indexingConnectorIds.has(editingConnector.id)}
|
isIndexing={indexingConnectorIds.has(editingConnector.id)}
|
||||||
|
|
@ -288,6 +291,7 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
|
||||||
onPeriodicEnabledChange={setPeriodicEnabled}
|
onPeriodicEnabledChange={setPeriodicEnabled}
|
||||||
onFrequencyChange={setFrequencyMinutes}
|
onFrequencyChange={setFrequencyMinutes}
|
||||||
onEnableSummaryChange={setEnableSummary}
|
onEnableSummaryChange={setEnableSummary}
|
||||||
|
onEnableVisionLlmChange={setEnableVisionLlm}
|
||||||
onSave={() => {
|
onSave={() => {
|
||||||
startIndexing(editingConnector.id);
|
startIndexing(editingConnector.id);
|
||||||
handleSaveConnector(() => refreshConnectors());
|
handleSaveConnector(() => refreshConnectors());
|
||||||
|
|
@ -336,6 +340,7 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
|
||||||
periodicEnabled={periodicEnabled}
|
periodicEnabled={periodicEnabled}
|
||||||
frequencyMinutes={frequencyMinutes}
|
frequencyMinutes={frequencyMinutes}
|
||||||
enableSummary={enableSummary}
|
enableSummary={enableSummary}
|
||||||
|
enableVisionLlm={enableVisionLlm}
|
||||||
isStartingIndexing={isStartingIndexing}
|
isStartingIndexing={isStartingIndexing}
|
||||||
isFromOAuth={isFromOAuth}
|
isFromOAuth={isFromOAuth}
|
||||||
onStartDateChange={setStartDate}
|
onStartDateChange={setStartDate}
|
||||||
|
|
@ -343,6 +348,7 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
|
||||||
onPeriodicEnabledChange={setPeriodicEnabled}
|
onPeriodicEnabledChange={setPeriodicEnabled}
|
||||||
onFrequencyChange={setFrequencyMinutes}
|
onFrequencyChange={setFrequencyMinutes}
|
||||||
onEnableSummaryChange={setEnableSummary}
|
onEnableSummaryChange={setEnableSummary}
|
||||||
|
onEnableVisionLlmChange={setEnableVisionLlm}
|
||||||
onConfigChange={setIndexingConnectorConfig}
|
onConfigChange={setIndexingConnectorConfig}
|
||||||
onStartIndexing={() => {
|
onStartIndexing={() => {
|
||||||
if (indexingConfig.connectorId) {
|
if (indexingConfig.connectorId) {
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,25 @@
|
||||||
|
"use client";
|
||||||
|
|
||||||
|
import type { FC } from "react";
|
||||||
|
import { Switch } from "@/components/ui/switch";
|
||||||
|
|
||||||
|
interface VisionLLMConfigProps {
|
||||||
|
enabled: boolean;
|
||||||
|
onEnabledChange: (enabled: boolean) => void;
|
||||||
|
}
|
||||||
|
|
||||||
|
export const VisionLLMConfig: FC<VisionLLMConfigProps> = ({ enabled, onEnabledChange }) => {
|
||||||
|
return (
|
||||||
|
<div className="rounded-xl bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6">
|
||||||
|
<div className="flex items-center justify-between">
|
||||||
|
<div className="space-y-1">
|
||||||
|
<h3 className="font-medium text-sm sm:text-base">Enable Vision LLM</h3>
|
||||||
|
<p className="text-xs sm:text-sm text-muted-foreground">
|
||||||
|
Describes images using AI vision (costly, slower)
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
<Switch checked={enabled} onCheckedChange={onEnabledChange} />
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
);
|
||||||
|
};
|
||||||
|
|
@ -15,6 +15,7 @@ import { cn } from "@/lib/utils";
|
||||||
import { DateRangeSelector } from "../../components/date-range-selector";
|
import { DateRangeSelector } from "../../components/date-range-selector";
|
||||||
import { PeriodicSyncConfig } from "../../components/periodic-sync-config";
|
import { PeriodicSyncConfig } from "../../components/periodic-sync-config";
|
||||||
import { SummaryConfig } from "../../components/summary-config";
|
import { SummaryConfig } from "../../components/summary-config";
|
||||||
|
import { VisionLLMConfig } from "../../components/vision-llm-config";
|
||||||
import { getConnectorDisplayName } from "../../tabs/all-connectors-tab";
|
import { getConnectorDisplayName } from "../../tabs/all-connectors-tab";
|
||||||
import { getConnectorConfigComponent } from "../index";
|
import { getConnectorConfigComponent } from "../index";
|
||||||
|
|
||||||
|
|
@ -38,6 +39,7 @@ interface ConnectorEditViewProps {
|
||||||
periodicEnabled: boolean;
|
periodicEnabled: boolean;
|
||||||
frequencyMinutes: string;
|
frequencyMinutes: string;
|
||||||
enableSummary: boolean;
|
enableSummary: boolean;
|
||||||
|
enableVisionLlm: boolean;
|
||||||
isSaving: boolean;
|
isSaving: boolean;
|
||||||
isDisconnecting: boolean;
|
isDisconnecting: boolean;
|
||||||
isIndexing?: boolean;
|
isIndexing?: boolean;
|
||||||
|
|
@ -47,6 +49,7 @@ interface ConnectorEditViewProps {
|
||||||
onPeriodicEnabledChange: (enabled: boolean) => void;
|
onPeriodicEnabledChange: (enabled: boolean) => void;
|
||||||
onFrequencyChange: (frequency: string) => void;
|
onFrequencyChange: (frequency: string) => void;
|
||||||
onEnableSummaryChange: (enabled: boolean) => void;
|
onEnableSummaryChange: (enabled: boolean) => void;
|
||||||
|
onEnableVisionLlmChange: (enabled: boolean) => void;
|
||||||
onSave: () => void;
|
onSave: () => void;
|
||||||
onDisconnect: () => void;
|
onDisconnect: () => void;
|
||||||
onBack: () => void;
|
onBack: () => void;
|
||||||
|
|
@ -62,6 +65,7 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
|
||||||
periodicEnabled,
|
periodicEnabled,
|
||||||
frequencyMinutes,
|
frequencyMinutes,
|
||||||
enableSummary,
|
enableSummary,
|
||||||
|
enableVisionLlm,
|
||||||
isSaving,
|
isSaving,
|
||||||
isDisconnecting,
|
isDisconnecting,
|
||||||
isIndexing = false,
|
isIndexing = false,
|
||||||
|
|
@ -71,6 +75,7 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
|
||||||
onPeriodicEnabledChange,
|
onPeriodicEnabledChange,
|
||||||
onFrequencyChange,
|
onFrequencyChange,
|
||||||
onEnableSummaryChange,
|
onEnableSummaryChange,
|
||||||
|
onEnableVisionLlmChange,
|
||||||
onSave,
|
onSave,
|
||||||
onDisconnect,
|
onDisconnect,
|
||||||
onBack,
|
onBack,
|
||||||
|
|
@ -272,6 +277,14 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
|
||||||
{/* AI Summary toggle */}
|
{/* AI Summary toggle */}
|
||||||
<SummaryConfig enabled={enableSummary} onEnabledChange={onEnableSummaryChange} />
|
<SummaryConfig enabled={enableSummary} onEnabledChange={onEnableSummaryChange} />
|
||||||
|
|
||||||
|
{/* Vision LLM toggle - only for file-based connectors */}
|
||||||
|
{(connector.connector_type === "GOOGLE_DRIVE_CONNECTOR" ||
|
||||||
|
connector.connector_type === "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" ||
|
||||||
|
connector.connector_type === "DROPBOX_CONNECTOR" ||
|
||||||
|
connector.connector_type === "ONEDRIVE_CONNECTOR") && (
|
||||||
|
<VisionLLMConfig enabled={enableVisionLlm} onEnabledChange={onEnableVisionLlmChange} />
|
||||||
|
)}
|
||||||
|
|
||||||
{/* Date range selector - not shown for file-based connectors (Drive, Dropbox, OneDrive), Webcrawler, GitHub, or Local Folder */}
|
{/* Date range selector - not shown for file-based connectors (Drive, Dropbox, OneDrive), Webcrawler, GitHub, or Local Folder */}
|
||||||
{connector.connector_type !== "GOOGLE_DRIVE_CONNECTOR" &&
|
{connector.connector_type !== "GOOGLE_DRIVE_CONNECTOR" &&
|
||||||
connector.connector_type !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" &&
|
connector.connector_type !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" &&
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,7 @@ import { cn } from "@/lib/utils";
|
||||||
import { DateRangeSelector } from "../../components/date-range-selector";
|
import { DateRangeSelector } from "../../components/date-range-selector";
|
||||||
import { PeriodicSyncConfig } from "../../components/periodic-sync-config";
|
import { PeriodicSyncConfig } from "../../components/periodic-sync-config";
|
||||||
import { SummaryConfig } from "../../components/summary-config";
|
import { SummaryConfig } from "../../components/summary-config";
|
||||||
|
import { VisionLLMConfig } from "../../components/vision-llm-config";
|
||||||
import type { IndexingConfigState } from "../../constants/connector-constants";
|
import type { IndexingConfigState } from "../../constants/connector-constants";
|
||||||
import { getConnectorDisplayName } from "../../tabs/all-connectors-tab";
|
import { getConnectorDisplayName } from "../../tabs/all-connectors-tab";
|
||||||
import { getConnectorConfigComponent } from "../index";
|
import { getConnectorConfigComponent } from "../index";
|
||||||
|
|
@ -22,6 +23,7 @@ interface IndexingConfigurationViewProps {
|
||||||
periodicEnabled: boolean;
|
periodicEnabled: boolean;
|
||||||
frequencyMinutes: string;
|
frequencyMinutes: string;
|
||||||
enableSummary: boolean;
|
enableSummary: boolean;
|
||||||
|
enableVisionLlm: boolean;
|
||||||
isStartingIndexing: boolean;
|
isStartingIndexing: boolean;
|
||||||
isFromOAuth?: boolean;
|
isFromOAuth?: boolean;
|
||||||
onStartDateChange: (date: Date | undefined) => void;
|
onStartDateChange: (date: Date | undefined) => void;
|
||||||
|
|
@ -29,6 +31,7 @@ interface IndexingConfigurationViewProps {
|
||||||
onPeriodicEnabledChange: (enabled: boolean) => void;
|
onPeriodicEnabledChange: (enabled: boolean) => void;
|
||||||
onFrequencyChange: (frequency: string) => void;
|
onFrequencyChange: (frequency: string) => void;
|
||||||
onEnableSummaryChange: (enabled: boolean) => void;
|
onEnableSummaryChange: (enabled: boolean) => void;
|
||||||
|
onEnableVisionLlmChange: (enabled: boolean) => void;
|
||||||
onConfigChange?: (config: Record<string, unknown>) => void;
|
onConfigChange?: (config: Record<string, unknown>) => void;
|
||||||
onStartIndexing: () => void;
|
onStartIndexing: () => void;
|
||||||
onSkip: () => void;
|
onSkip: () => void;
|
||||||
|
|
@ -42,6 +45,7 @@ export const IndexingConfigurationView: FC<IndexingConfigurationViewProps> = ({
|
||||||
periodicEnabled,
|
periodicEnabled,
|
||||||
frequencyMinutes,
|
frequencyMinutes,
|
||||||
enableSummary,
|
enableSummary,
|
||||||
|
enableVisionLlm,
|
||||||
isStartingIndexing,
|
isStartingIndexing,
|
||||||
isFromOAuth = false,
|
isFromOAuth = false,
|
||||||
onStartDateChange,
|
onStartDateChange,
|
||||||
|
|
@ -49,6 +53,7 @@ export const IndexingConfigurationView: FC<IndexingConfigurationViewProps> = ({
|
||||||
onPeriodicEnabledChange,
|
onPeriodicEnabledChange,
|
||||||
onFrequencyChange,
|
onFrequencyChange,
|
||||||
onEnableSummaryChange,
|
onEnableSummaryChange,
|
||||||
|
onEnableVisionLlmChange,
|
||||||
onConfigChange,
|
onConfigChange,
|
||||||
onStartIndexing,
|
onStartIndexing,
|
||||||
onSkip,
|
onSkip,
|
||||||
|
|
@ -158,6 +163,14 @@ export const IndexingConfigurationView: FC<IndexingConfigurationViewProps> = ({
|
||||||
{/* AI Summary toggle */}
|
{/* AI Summary toggle */}
|
||||||
<SummaryConfig enabled={enableSummary} onEnabledChange={onEnableSummaryChange} />
|
<SummaryConfig enabled={enableSummary} onEnabledChange={onEnableSummaryChange} />
|
||||||
|
|
||||||
|
{/* Vision LLM toggle - only for file-based connectors */}
|
||||||
|
{(config.connectorType === "GOOGLE_DRIVE_CONNECTOR" ||
|
||||||
|
config.connectorType === "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" ||
|
||||||
|
config.connectorType === "DROPBOX_CONNECTOR" ||
|
||||||
|
config.connectorType === "ONEDRIVE_CONNECTOR") && (
|
||||||
|
<VisionLLMConfig enabled={enableVisionLlm} onEnabledChange={onEnableVisionLlmChange} />
|
||||||
|
)}
|
||||||
|
|
||||||
{/* Date range selector - not shown for file-based connectors (Drive, Dropbox, OneDrive), Webcrawler, GitHub, or Local Folder */}
|
{/* Date range selector - not shown for file-based connectors (Drive, Dropbox, OneDrive), Webcrawler, GitHub, or Local Folder */}
|
||||||
{config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" &&
|
{config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" &&
|
||||||
config.connectorType !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" &&
|
config.connectorType !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" &&
|
||||||
|
|
|
||||||
|
|
@ -80,6 +80,7 @@ export const useConnectorDialog = () => {
|
||||||
const [periodicEnabled, setPeriodicEnabled] = useState(false);
|
const [periodicEnabled, setPeriodicEnabled] = useState(false);
|
||||||
const [frequencyMinutes, setFrequencyMinutes] = useState("1440");
|
const [frequencyMinutes, setFrequencyMinutes] = useState("1440");
|
||||||
const [enableSummary, setEnableSummary] = useState(false);
|
const [enableSummary, setEnableSummary] = useState(false);
|
||||||
|
const [enableVisionLlm, setEnableVisionLlm] = useState(false);
|
||||||
|
|
||||||
// Edit mode state
|
// Edit mode state
|
||||||
const [editingConnector, setEditingConnector] = useState<SearchSourceConnector | null>(null);
|
const [editingConnector, setEditingConnector] = useState<SearchSourceConnector | null>(null);
|
||||||
|
|
@ -621,6 +622,7 @@ export const useConnectorDialog = () => {
|
||||||
setPeriodicEnabled(false);
|
setPeriodicEnabled(false);
|
||||||
setFrequencyMinutes("1440");
|
setFrequencyMinutes("1440");
|
||||||
setEnableSummary(connector.enable_summary ?? false);
|
setEnableSummary(connector.enable_summary ?? false);
|
||||||
|
setEnableVisionLlm(connector.enable_vision_llm ?? false);
|
||||||
setStartDate(undefined);
|
setStartDate(undefined);
|
||||||
setEndDate(undefined);
|
setEndDate(undefined);
|
||||||
|
|
||||||
|
|
@ -763,12 +765,13 @@ export const useConnectorDialog = () => {
|
||||||
const endDateStr = endDate ? format(endDate, "yyyy-MM-dd") : undefined;
|
const endDateStr = endDate ? format(endDate, "yyyy-MM-dd") : undefined;
|
||||||
|
|
||||||
// Update connector with summary, periodic sync settings, and config changes
|
// Update connector with summary, periodic sync settings, and config changes
|
||||||
if (enableSummary || periodicEnabled || indexingConnectorConfig) {
|
if (enableSummary || enableVisionLlm || periodicEnabled || indexingConnectorConfig) {
|
||||||
const frequency = periodicEnabled ? parseInt(frequencyMinutes, 10) : undefined;
|
const frequency = periodicEnabled ? parseInt(frequencyMinutes, 10) : undefined;
|
||||||
await updateConnector({
|
await updateConnector({
|
||||||
id: indexingConfig.connectorId,
|
id: indexingConfig.connectorId,
|
||||||
data: {
|
data: {
|
||||||
enable_summary: enableSummary,
|
enable_summary: enableSummary,
|
||||||
|
enable_vision_llm: enableVisionLlm,
|
||||||
...(periodicEnabled && {
|
...(periodicEnabled && {
|
||||||
periodic_indexing_enabled: true,
|
periodic_indexing_enabled: true,
|
||||||
indexing_frequency_minutes: frequency,
|
indexing_frequency_minutes: frequency,
|
||||||
|
|
@ -896,6 +899,7 @@ export const useConnectorDialog = () => {
|
||||||
periodicEnabled,
|
periodicEnabled,
|
||||||
frequencyMinutes,
|
frequencyMinutes,
|
||||||
enableSummary,
|
enableSummary,
|
||||||
|
enableVisionLlm,
|
||||||
indexingConnectorConfig,
|
indexingConnectorConfig,
|
||||||
setIsOpen,
|
setIsOpen,
|
||||||
]
|
]
|
||||||
|
|
@ -960,6 +964,7 @@ export const useConnectorDialog = () => {
|
||||||
setPeriodicEnabled(!connector.is_indexable ? false : connector.periodic_indexing_enabled);
|
setPeriodicEnabled(!connector.is_indexable ? false : connector.periodic_indexing_enabled);
|
||||||
setFrequencyMinutes(connector.indexing_frequency_minutes?.toString() || "1440");
|
setFrequencyMinutes(connector.indexing_frequency_minutes?.toString() || "1440");
|
||||||
setEnableSummary(connector.enable_summary ?? false);
|
setEnableSummary(connector.enable_summary ?? false);
|
||||||
|
setEnableVisionLlm(connector.enable_vision_llm ?? false);
|
||||||
setStartDate(undefined);
|
setStartDate(undefined);
|
||||||
setEndDate(undefined);
|
setEndDate(undefined);
|
||||||
},
|
},
|
||||||
|
|
@ -1038,6 +1043,7 @@ export const useConnectorDialog = () => {
|
||||||
data: {
|
data: {
|
||||||
name: connectorName || editingConnector.name,
|
name: connectorName || editingConnector.name,
|
||||||
enable_summary: enableSummary,
|
enable_summary: enableSummary,
|
||||||
|
enable_vision_llm: enableVisionLlm,
|
||||||
periodic_indexing_enabled: !editingConnector.is_indexable ? false : periodicEnabled,
|
periodic_indexing_enabled: !editingConnector.is_indexable ? false : periodicEnabled,
|
||||||
indexing_frequency_minutes: !editingConnector.is_indexable ? null : frequency,
|
indexing_frequency_minutes: !editingConnector.is_indexable ? null : frequency,
|
||||||
config: connectorConfig || editingConnector.config,
|
config: connectorConfig || editingConnector.config,
|
||||||
|
|
@ -1172,6 +1178,7 @@ export const useConnectorDialog = () => {
|
||||||
periodicEnabled,
|
periodicEnabled,
|
||||||
frequencyMinutes,
|
frequencyMinutes,
|
||||||
enableSummary,
|
enableSummary,
|
||||||
|
enableVisionLlm,
|
||||||
getFrequencyLabel,
|
getFrequencyLabel,
|
||||||
connectorConfig,
|
connectorConfig,
|
||||||
connectorName,
|
connectorName,
|
||||||
|
|
@ -1332,6 +1339,7 @@ export const useConnectorDialog = () => {
|
||||||
setPeriodicEnabled(false);
|
setPeriodicEnabled(false);
|
||||||
setFrequencyMinutes("1440");
|
setFrequencyMinutes("1440");
|
||||||
setEnableSummary(false);
|
setEnableSummary(false);
|
||||||
|
setEnableVisionLlm(false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
@ -1368,6 +1376,7 @@ export const useConnectorDialog = () => {
|
||||||
periodicEnabled,
|
periodicEnabled,
|
||||||
frequencyMinutes,
|
frequencyMinutes,
|
||||||
enableSummary,
|
enableSummary,
|
||||||
|
enableVisionLlm,
|
||||||
searchSpaceId,
|
searchSpaceId,
|
||||||
allConnectors,
|
allConnectors,
|
||||||
viewingAccountsType,
|
viewingAccountsType,
|
||||||
|
|
@ -1382,6 +1391,7 @@ export const useConnectorDialog = () => {
|
||||||
setPeriodicEnabled,
|
setPeriodicEnabled,
|
||||||
setFrequencyMinutes,
|
setFrequencyMinutes,
|
||||||
setEnableSummary,
|
setEnableSummary,
|
||||||
|
setEnableVisionLlm,
|
||||||
setConnectorName,
|
setConnectorName,
|
||||||
|
|
||||||
// Handlers
|
// Handlers
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
"use client";
|
"use client";
|
||||||
|
|
||||||
import { FolderPlus, ListFilter, Search, Upload, X } from "lucide-react";
|
import { Download, FolderPlus, ListFilter, Loader2, Search, Upload, X } from "lucide-react";
|
||||||
import { useTranslations } from "next-intl";
|
import { useTranslations } from "next-intl";
|
||||||
import React, { useCallback, useMemo, useRef, useState } from "react";
|
import React, { useCallback, useMemo, useRef, useState } from "react";
|
||||||
import { useDocumentUploadDialog } from "@/components/assistant-ui/document-upload-popup";
|
import { useDocumentUploadDialog } from "@/components/assistant-ui/document-upload-popup";
|
||||||
|
|
@ -20,6 +20,8 @@ export function DocumentsFilters({
|
||||||
onToggleType,
|
onToggleType,
|
||||||
activeTypes,
|
activeTypes,
|
||||||
onCreateFolder,
|
onCreateFolder,
|
||||||
|
onExportKB,
|
||||||
|
isExporting,
|
||||||
}: {
|
}: {
|
||||||
typeCounts: Partial<Record<DocumentTypeEnum, number>>;
|
typeCounts: Partial<Record<DocumentTypeEnum, number>>;
|
||||||
onSearch: (v: string) => void;
|
onSearch: (v: string) => void;
|
||||||
|
|
@ -27,6 +29,8 @@ export function DocumentsFilters({
|
||||||
onToggleType: (type: DocumentTypeEnum, checked: boolean) => void;
|
onToggleType: (type: DocumentTypeEnum, checked: boolean) => void;
|
||||||
activeTypes: DocumentTypeEnum[];
|
activeTypes: DocumentTypeEnum[];
|
||||||
onCreateFolder?: () => void;
|
onCreateFolder?: () => void;
|
||||||
|
onExportKB?: () => void;
|
||||||
|
isExporting?: boolean;
|
||||||
}) {
|
}) {
|
||||||
const t = useTranslations("documents");
|
const t = useTranslations("documents");
|
||||||
const id = React.useId();
|
const id = React.useId();
|
||||||
|
|
@ -84,6 +88,31 @@ export function DocumentsFilters({
|
||||||
</Tooltip>
|
</Tooltip>
|
||||||
)}
|
)}
|
||||||
|
|
||||||
|
{onExportKB && (
|
||||||
|
<Tooltip>
|
||||||
|
<TooltipTrigger asChild>
|
||||||
|
<ToggleGroupItem
|
||||||
|
value="export"
|
||||||
|
disabled={isExporting}
|
||||||
|
className="h-9 w-9 shrink-0 border-sidebar-border text-sidebar-foreground/60 hover:text-sidebar-foreground hover:border-sidebar-border bg-sidebar"
|
||||||
|
onClick={(e) => {
|
||||||
|
e.preventDefault();
|
||||||
|
onExportKB();
|
||||||
|
}}
|
||||||
|
>
|
||||||
|
{isExporting ? (
|
||||||
|
<Loader2 size={14} className="animate-spin" />
|
||||||
|
) : (
|
||||||
|
<Download size={14} />
|
||||||
|
)}
|
||||||
|
</ToggleGroupItem>
|
||||||
|
</TooltipTrigger>
|
||||||
|
<TooltipContent>
|
||||||
|
{isExporting ? "Exporting…" : "Export knowledge base"}
|
||||||
|
</TooltipContent>
|
||||||
|
</Tooltip>
|
||||||
|
)}
|
||||||
|
|
||||||
<Popover>
|
<Popover>
|
||||||
<Tooltip>
|
<Tooltip>
|
||||||
<TooltipTrigger asChild>
|
<TooltipTrigger asChild>
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,7 @@ import {
|
||||||
AlertCircle,
|
AlertCircle,
|
||||||
ChevronDown,
|
ChevronDown,
|
||||||
ChevronRight,
|
ChevronRight,
|
||||||
|
Download,
|
||||||
Eye,
|
Eye,
|
||||||
EyeOff,
|
EyeOff,
|
||||||
Folder,
|
Folder,
|
||||||
|
|
@ -80,6 +81,7 @@ interface FolderNodeProps {
|
||||||
isWatched?: boolean;
|
isWatched?: boolean;
|
||||||
onRescan?: (folder: FolderDisplay) => void | Promise<void>;
|
onRescan?: (folder: FolderDisplay) => void | Promise<void>;
|
||||||
onStopWatching?: (folder: FolderDisplay) => void;
|
onStopWatching?: (folder: FolderDisplay) => void;
|
||||||
|
onExportFolder?: (folder: FolderDisplay) => void;
|
||||||
}
|
}
|
||||||
|
|
||||||
function getDropZone(
|
function getDropZone(
|
||||||
|
|
@ -120,6 +122,7 @@ export const FolderNode = React.memo(function FolderNode({
|
||||||
isWatched,
|
isWatched,
|
||||||
onRescan,
|
onRescan,
|
||||||
onStopWatching,
|
onStopWatching,
|
||||||
|
onExportFolder,
|
||||||
}: FolderNodeProps) {
|
}: FolderNodeProps) {
|
||||||
const [renameValue, setRenameValue] = useState(folder.name);
|
const [renameValue, setRenameValue] = useState(folder.name);
|
||||||
const inputRef = useRef<HTMLInputElement>(null);
|
const inputRef = useRef<HTMLInputElement>(null);
|
||||||
|
|
@ -408,6 +411,17 @@ export const FolderNode = React.memo(function FolderNode({
|
||||||
<Move className="mr-2 h-4 w-4" />
|
<Move className="mr-2 h-4 w-4" />
|
||||||
Move to...
|
Move to...
|
||||||
</DropdownMenuItem>
|
</DropdownMenuItem>
|
||||||
|
{onExportFolder && (
|
||||||
|
<DropdownMenuItem
|
||||||
|
onClick={(e) => {
|
||||||
|
e.stopPropagation();
|
||||||
|
onExportFolder(folder);
|
||||||
|
}}
|
||||||
|
>
|
||||||
|
<Download className="mr-2 h-4 w-4" />
|
||||||
|
Export folder
|
||||||
|
</DropdownMenuItem>
|
||||||
|
)}
|
||||||
<DropdownMenuItem
|
<DropdownMenuItem
|
||||||
onClick={(e) => {
|
onClick={(e) => {
|
||||||
e.stopPropagation();
|
e.stopPropagation();
|
||||||
|
|
@ -449,6 +463,12 @@ export const FolderNode = React.memo(function FolderNode({
|
||||||
<Move className="mr-2 h-4 w-4" />
|
<Move className="mr-2 h-4 w-4" />
|
||||||
Move to...
|
Move to...
|
||||||
</ContextMenuItem>
|
</ContextMenuItem>
|
||||||
|
{onExportFolder && (
|
||||||
|
<ContextMenuItem onClick={() => onExportFolder(folder)}>
|
||||||
|
<Download className="mr-2 h-4 w-4" />
|
||||||
|
Export folder
|
||||||
|
</ContextMenuItem>
|
||||||
|
)}
|
||||||
<ContextMenuItem onClick={() => onDelete(folder)}>
|
<ContextMenuItem onClick={() => onDelete(folder)}>
|
||||||
<Trash2 className="mr-2 h-4 w-4" />
|
<Trash2 className="mr-2 h-4 w-4" />
|
||||||
Delete
|
Delete
|
||||||
|
|
|
||||||
|
|
@ -44,6 +44,7 @@ interface FolderTreeViewProps {
|
||||||
watchedFolderIds?: Set<number>;
|
watchedFolderIds?: Set<number>;
|
||||||
onRescanFolder?: (folder: FolderDisplay) => void;
|
onRescanFolder?: (folder: FolderDisplay) => void;
|
||||||
onStopWatchingFolder?: (folder: FolderDisplay) => void;
|
onStopWatchingFolder?: (folder: FolderDisplay) => void;
|
||||||
|
onExportFolder?: (folder: FolderDisplay) => void;
|
||||||
}
|
}
|
||||||
|
|
||||||
function groupBy<T>(items: T[], keyFn: (item: T) => string | number): Record<string | number, T[]> {
|
function groupBy<T>(items: T[], keyFn: (item: T) => string | number): Record<string | number, T[]> {
|
||||||
|
|
@ -81,6 +82,7 @@ export function FolderTreeView({
|
||||||
watchedFolderIds,
|
watchedFolderIds,
|
||||||
onRescanFolder,
|
onRescanFolder,
|
||||||
onStopWatchingFolder,
|
onStopWatchingFolder,
|
||||||
|
onExportFolder,
|
||||||
}: FolderTreeViewProps) {
|
}: FolderTreeViewProps) {
|
||||||
const foldersByParent = useMemo(() => groupBy(folders, (f) => f.parentId ?? "root"), [folders]);
|
const foldersByParent = useMemo(() => groupBy(folders, (f) => f.parentId ?? "root"), [folders]);
|
||||||
|
|
||||||
|
|
@ -259,6 +261,7 @@ export function FolderTreeView({
|
||||||
isWatched={watchedFolderIds?.has(f.id)}
|
isWatched={watchedFolderIds?.has(f.id)}
|
||||||
onRescan={onRescanFolder}
|
onRescan={onRescanFolder}
|
||||||
onStopWatching={onStopWatchingFolder}
|
onStopWatching={onStopWatchingFolder}
|
||||||
|
onExportFolder={onExportFolder}
|
||||||
/>
|
/>
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -406,6 +406,160 @@ export function DocumentsSidebar({
|
||||||
setFolderPickerOpen(true);
|
setFolderPickerOpen(true);
|
||||||
}, []);
|
}, []);
|
||||||
|
|
||||||
|
const [isExportingKB, setIsExportingKB] = useState(false);
|
||||||
|
const [exportWarningOpen, setExportWarningOpen] = useState(false);
|
||||||
|
const [exportWarningContext, setExportWarningContext] = useState<{
|
||||||
|
type: "kb" | "folder";
|
||||||
|
folder?: FolderDisplay;
|
||||||
|
pendingCount: number;
|
||||||
|
} | null>(null);
|
||||||
|
|
||||||
|
const pendingDocuments = useMemo(
|
||||||
|
() =>
|
||||||
|
treeDocuments.filter(
|
||||||
|
(d) => d.status?.state === "pending" || d.status?.state === "processing"
|
||||||
|
),
|
||||||
|
[treeDocuments]
|
||||||
|
);
|
||||||
|
|
||||||
|
const doExport = useCallback(async (url: string, downloadName: string) => {
|
||||||
|
const response = await authenticatedFetch(url, { method: "GET" });
|
||||||
|
if (!response.ok) {
|
||||||
|
const errorData = await response.json().catch(() => ({ detail: "Export failed" }));
|
||||||
|
throw new Error(errorData.detail || "Export failed");
|
||||||
|
}
|
||||||
|
|
||||||
|
const blob = await response.blob();
|
||||||
|
const blobUrl = URL.createObjectURL(blob);
|
||||||
|
const a = document.createElement("a");
|
||||||
|
a.href = blobUrl;
|
||||||
|
a.download = downloadName;
|
||||||
|
document.body.appendChild(a);
|
||||||
|
a.click();
|
||||||
|
document.body.removeChild(a);
|
||||||
|
URL.revokeObjectURL(blobUrl);
|
||||||
|
}, []);
|
||||||
|
|
||||||
|
const handleExportKB = useCallback(async () => {
|
||||||
|
if (isExportingKB) return;
|
||||||
|
|
||||||
|
if (pendingDocuments.length > 0) {
|
||||||
|
setExportWarningContext({ type: "kb", pendingCount: pendingDocuments.length });
|
||||||
|
setExportWarningOpen(true);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
setIsExportingKB(true);
|
||||||
|
try {
|
||||||
|
await doExport(
|
||||||
|
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/export`,
|
||||||
|
"knowledge-base.zip"
|
||||||
|
);
|
||||||
|
toast.success("Knowledge base exported");
|
||||||
|
} catch (err) {
|
||||||
|
console.error("KB export failed:", err);
|
||||||
|
toast.error(err instanceof Error ? err.message : "Export failed");
|
||||||
|
} finally {
|
||||||
|
setIsExportingKB(false);
|
||||||
|
}
|
||||||
|
}, [searchSpaceId, isExportingKB, pendingDocuments.length, doExport]);
|
||||||
|
|
||||||
|
const handleExportWarningConfirm = useCallback(async () => {
|
||||||
|
setExportWarningOpen(false);
|
||||||
|
const ctx = exportWarningContext;
|
||||||
|
if (!ctx) return;
|
||||||
|
|
||||||
|
if (ctx.type === "kb") {
|
||||||
|
setIsExportingKB(true);
|
||||||
|
try {
|
||||||
|
await doExport(
|
||||||
|
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/export`,
|
||||||
|
"knowledge-base.zip"
|
||||||
|
);
|
||||||
|
toast.success("Knowledge base exported");
|
||||||
|
} catch (err) {
|
||||||
|
console.error("KB export failed:", err);
|
||||||
|
toast.error(err instanceof Error ? err.message : "Export failed");
|
||||||
|
} finally {
|
||||||
|
setIsExportingKB(false);
|
||||||
|
}
|
||||||
|
} else if (ctx.type === "folder" && ctx.folder) {
|
||||||
|
setIsExportingKB(true);
|
||||||
|
try {
|
||||||
|
const safeName =
|
||||||
|
ctx.folder.name
|
||||||
|
.replace(/[^a-zA-Z0-9 _-]/g, "_")
|
||||||
|
.trim()
|
||||||
|
.slice(0, 80) || "folder";
|
||||||
|
await doExport(
|
||||||
|
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/export?folder_id=${ctx.folder.id}`,
|
||||||
|
`${safeName}.zip`
|
||||||
|
);
|
||||||
|
toast.success(`Folder "${ctx.folder.name}" exported`);
|
||||||
|
} catch (err) {
|
||||||
|
console.error("Folder export failed:", err);
|
||||||
|
toast.error(err instanceof Error ? err.message : "Export failed");
|
||||||
|
} finally {
|
||||||
|
setIsExportingKB(false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
setExportWarningContext(null);
|
||||||
|
}, [exportWarningContext, searchSpaceId, doExport]);
|
||||||
|
|
||||||
|
const getPendingCountInSubtree = useCallback(
|
||||||
|
(folderId: number): number => {
|
||||||
|
const subtreeIds = new Set<number>();
|
||||||
|
function collect(id: number) {
|
||||||
|
subtreeIds.add(id);
|
||||||
|
for (const child of foldersByParent[String(id)] ?? []) {
|
||||||
|
collect(child.id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
collect(folderId);
|
||||||
|
return treeDocuments.filter(
|
||||||
|
(d) =>
|
||||||
|
subtreeIds.has(d.folderId ?? -1) &&
|
||||||
|
(d.status?.state === "pending" || d.status?.state === "processing")
|
||||||
|
).length;
|
||||||
|
},
|
||||||
|
[foldersByParent, treeDocuments]
|
||||||
|
);
|
||||||
|
|
||||||
|
const handleExportFolder = useCallback(
|
||||||
|
async (folder: FolderDisplay) => {
|
||||||
|
const folderPendingCount = getPendingCountInSubtree(folder.id);
|
||||||
|
if (folderPendingCount > 0) {
|
||||||
|
setExportWarningContext({
|
||||||
|
type: "folder",
|
||||||
|
folder,
|
||||||
|
pendingCount: folderPendingCount,
|
||||||
|
});
|
||||||
|
setExportWarningOpen(true);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
setIsExportingKB(true);
|
||||||
|
try {
|
||||||
|
const safeName =
|
||||||
|
folder.name
|
||||||
|
.replace(/[^a-zA-Z0-9 _-]/g, "_")
|
||||||
|
.trim()
|
||||||
|
.slice(0, 80) || "folder";
|
||||||
|
await doExport(
|
||||||
|
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/export?folder_id=${folder.id}`,
|
||||||
|
`${safeName}.zip`
|
||||||
|
);
|
||||||
|
toast.success(`Folder "${folder.name}" exported`);
|
||||||
|
} catch (err) {
|
||||||
|
console.error("Folder export failed:", err);
|
||||||
|
toast.error(err instanceof Error ? err.message : "Export failed");
|
||||||
|
} finally {
|
||||||
|
setIsExportingKB(false);
|
||||||
|
}
|
||||||
|
},
|
||||||
|
[searchSpaceId, getPendingCountInSubtree, doExport]
|
||||||
|
);
|
||||||
|
|
||||||
const handleExportDocument = useCallback(
|
const handleExportDocument = useCallback(
|
||||||
async (doc: DocumentNodeDoc, format: string) => {
|
async (doc: DocumentNodeDoc, format: string) => {
|
||||||
const safeTitle =
|
const safeTitle =
|
||||||
|
|
@ -800,6 +954,8 @@ export function DocumentsSidebar({
|
||||||
onToggleType={onToggleType}
|
onToggleType={onToggleType}
|
||||||
activeTypes={activeTypes}
|
activeTypes={activeTypes}
|
||||||
onCreateFolder={() => handleCreateFolder(null)}
|
onCreateFolder={() => handleCreateFolder(null)}
|
||||||
|
onExportKB={handleExportKB}
|
||||||
|
isExporting={isExportingKB}
|
||||||
/>
|
/>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
|
@ -855,6 +1011,7 @@ export function DocumentsSidebar({
|
||||||
watchedFolderIds={watchedFolderIds}
|
watchedFolderIds={watchedFolderIds}
|
||||||
onRescanFolder={handleRescanFolder}
|
onRescanFolder={handleRescanFolder}
|
||||||
onStopWatchingFolder={handleStopWatching}
|
onStopWatchingFolder={handleStopWatching}
|
||||||
|
onExportFolder={handleExportFolder}
|
||||||
/>
|
/>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
@ -933,6 +1090,33 @@ export function DocumentsSidebar({
|
||||||
</AlertDialogFooter>
|
</AlertDialogFooter>
|
||||||
</AlertDialogContent>
|
</AlertDialogContent>
|
||||||
</AlertDialog>
|
</AlertDialog>
|
||||||
|
|
||||||
|
<AlertDialog
|
||||||
|
open={exportWarningOpen}
|
||||||
|
onOpenChange={(open) => {
|
||||||
|
if (!open) {
|
||||||
|
setExportWarningOpen(false);
|
||||||
|
setExportWarningContext(null);
|
||||||
|
}
|
||||||
|
}}
|
||||||
|
>
|
||||||
|
<AlertDialogContent>
|
||||||
|
<AlertDialogHeader>
|
||||||
|
<AlertDialogTitle>Some documents are still processing</AlertDialogTitle>
|
||||||
|
<AlertDialogDescription>
|
||||||
|
{exportWarningContext?.pendingCount} document
|
||||||
|
{exportWarningContext?.pendingCount !== 1 ? "s are" : " is"} currently being processed
|
||||||
|
and will be excluded from the export. Do you want to continue?
|
||||||
|
</AlertDialogDescription>
|
||||||
|
</AlertDialogHeader>
|
||||||
|
<AlertDialogFooter>
|
||||||
|
<AlertDialogCancel>Cancel</AlertDialogCancel>
|
||||||
|
<AlertDialogAction onClick={handleExportWarningConfirm}>
|
||||||
|
Export anyway
|
||||||
|
</AlertDialogAction>
|
||||||
|
</AlertDialogFooter>
|
||||||
|
</AlertDialogContent>
|
||||||
|
</AlertDialog>
|
||||||
</>
|
</>
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -26,6 +26,7 @@ import { Progress } from "@/components/ui/progress";
|
||||||
import { Spinner } from "@/components/ui/spinner";
|
import { Spinner } from "@/components/ui/spinner";
|
||||||
import { Switch } from "@/components/ui/switch";
|
import { Switch } from "@/components/ui/switch";
|
||||||
import { useElectronAPI } from "@/hooks/use-platform";
|
import { useElectronAPI } from "@/hooks/use-platform";
|
||||||
|
import { documentsApiService } from "@/lib/apis/documents-api.service";
|
||||||
import {
|
import {
|
||||||
trackDocumentUploadFailure,
|
trackDocumentUploadFailure,
|
||||||
trackDocumentUploadStarted,
|
trackDocumentUploadStarted,
|
||||||
|
|
@ -48,6 +49,77 @@ interface FileWithId {
|
||||||
file: File;
|
file: File;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
interface FolderEntry {
|
||||||
|
id: string;
|
||||||
|
file: File;
|
||||||
|
relativePath: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
interface FolderUploadData {
|
||||||
|
folderName: string;
|
||||||
|
entries: FolderEntry[];
|
||||||
|
}
|
||||||
|
|
||||||
|
interface FolderTreeNode {
|
||||||
|
name: string;
|
||||||
|
isFolder: boolean;
|
||||||
|
size?: number;
|
||||||
|
children: FolderTreeNode[];
|
||||||
|
}
|
||||||
|
|
||||||
|
function buildFolderTree(entries: FolderEntry[]): FolderTreeNode[] {
|
||||||
|
const root: FolderTreeNode = { name: "", isFolder: true, children: [] };
|
||||||
|
|
||||||
|
for (const entry of entries) {
|
||||||
|
const parts = entry.relativePath.split("/");
|
||||||
|
let current = root;
|
||||||
|
|
||||||
|
for (let i = 0; i < parts.length - 1; i++) {
|
||||||
|
let child = current.children.find((c) => c.name === parts[i] && c.isFolder);
|
||||||
|
if (!child) {
|
||||||
|
child = { name: parts[i], isFolder: true, children: [] };
|
||||||
|
current.children.push(child);
|
||||||
|
}
|
||||||
|
current = child;
|
||||||
|
}
|
||||||
|
|
||||||
|
current.children.push({
|
||||||
|
name: parts[parts.length - 1],
|
||||||
|
isFolder: false,
|
||||||
|
size: entry.file.size,
|
||||||
|
children: [],
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
function sortNodes(node: FolderTreeNode) {
|
||||||
|
node.children.sort((a, b) => {
|
||||||
|
if (a.isFolder !== b.isFolder) return a.isFolder ? -1 : 1;
|
||||||
|
return a.name.localeCompare(b.name);
|
||||||
|
});
|
||||||
|
for (const child of node.children) sortNodes(child);
|
||||||
|
}
|
||||||
|
sortNodes(root);
|
||||||
|
|
||||||
|
return root.children;
|
||||||
|
}
|
||||||
|
|
||||||
|
function flattenTree(
|
||||||
|
nodes: FolderTreeNode[],
|
||||||
|
depth = 0
|
||||||
|
): { name: string; isFolder: boolean; depth: number; size?: number }[] {
|
||||||
|
const items: { name: string; isFolder: boolean; depth: number; size?: number }[] = [];
|
||||||
|
for (const node of nodes) {
|
||||||
|
items.push({ name: node.name, isFolder: node.isFolder, depth, size: node.size });
|
||||||
|
if (node.isFolder && node.children.length > 0) {
|
||||||
|
items.push(...flattenTree(node.children, depth + 1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return items;
|
||||||
|
}
|
||||||
|
|
||||||
|
const FOLDER_BATCH_SIZE_BYTES = 20 * 1024 * 1024;
|
||||||
|
const FOLDER_BATCH_MAX_FILES = 10;
|
||||||
|
|
||||||
const MAX_FILE_SIZE_MB = 500;
|
const MAX_FILE_SIZE_MB = 500;
|
||||||
const MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024;
|
const MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024;
|
||||||
|
|
||||||
|
|
@ -64,11 +136,14 @@ export function DocumentUploadTab({
|
||||||
const [uploadProgress, setUploadProgress] = useState(0);
|
const [uploadProgress, setUploadProgress] = useState(0);
|
||||||
const [accordionValue, setAccordionValue] = useState<string>("");
|
const [accordionValue, setAccordionValue] = useState<string>("");
|
||||||
const [shouldSummarize, setShouldSummarize] = useState(false);
|
const [shouldSummarize, setShouldSummarize] = useState(false);
|
||||||
|
const [useVisionLlm, setUseVisionLlm] = useState(false);
|
||||||
const [uploadDocumentMutation] = useAtom(uploadDocumentMutationAtom);
|
const [uploadDocumentMutation] = useAtom(uploadDocumentMutationAtom);
|
||||||
const { mutate: uploadDocuments, isPending: isUploading } = uploadDocumentMutation;
|
const { mutate: uploadDocuments, isPending: isUploading } = uploadDocumentMutation;
|
||||||
const fileInputRef = useRef<HTMLInputElement>(null);
|
const fileInputRef = useRef<HTMLInputElement>(null);
|
||||||
const folderInputRef = useRef<HTMLInputElement>(null);
|
const folderInputRef = useRef<HTMLInputElement>(null);
|
||||||
const progressIntervalRef = useRef<ReturnType<typeof setInterval> | null>(null);
|
const progressIntervalRef = useRef<ReturnType<typeof setInterval> | null>(null);
|
||||||
|
const [folderUpload, setFolderUpload] = useState<FolderUploadData | null>(null);
|
||||||
|
const [isFolderUploading, setIsFolderUploading] = useState(false);
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
return () => {
|
return () => {
|
||||||
|
|
@ -105,6 +180,7 @@ export function DocumentUploadTab({
|
||||||
const valid = incoming.filter((f) => f.size <= MAX_FILE_SIZE_BYTES);
|
const valid = incoming.filter((f) => f.size <= MAX_FILE_SIZE_BYTES);
|
||||||
if (valid.length === 0) return;
|
if (valid.length === 0) return;
|
||||||
|
|
||||||
|
setFolderUpload(null);
|
||||||
setFiles((prev) => {
|
setFiles((prev) => {
|
||||||
const newEntries = valid.map((f) => ({
|
const newEntries = valid.map((f) => ({
|
||||||
id: crypto.randomUUID?.() ?? `file-${Date.now()}-${Math.random().toString(36)}`,
|
id: crypto.randomUUID?.() ?? `file-${Date.now()}-${Math.random().toString(36)}`,
|
||||||
|
|
@ -159,6 +235,7 @@ export function DocumentUploadTab({
|
||||||
file: new File([fd.data], fd.name, { type: fd.mimeType }),
|
file: new File([fd.data], fd.name, { type: fd.mimeType }),
|
||||||
})
|
})
|
||||||
);
|
);
|
||||||
|
setFolderUpload(null);
|
||||||
setFiles((prev) => [...prev, ...newFiles]);
|
setFiles((prev) => [...prev, ...newFiles]);
|
||||||
}, [electronAPI, supportedExtensionsSet, t]);
|
}, [electronAPI, supportedExtensionsSet, t]);
|
||||||
|
|
||||||
|
|
@ -167,18 +244,35 @@ export function DocumentUploadTab({
|
||||||
const fileList = e.target.files;
|
const fileList = e.target.files;
|
||||||
if (!fileList || fileList.length === 0) return;
|
if (!fileList || fileList.length === 0) return;
|
||||||
|
|
||||||
const folderFiles = Array.from(fileList).filter((f) => {
|
const allFiles = Array.from(fileList);
|
||||||
const ext = f.name.includes(".") ? `.${f.name.split(".").pop()?.toLowerCase()}` : "";
|
const firstPath = allFiles[0]?.webkitRelativePath || "";
|
||||||
return ext !== "" && supportedExtensionsSet.has(ext);
|
const folderName = firstPath.split("/")[0];
|
||||||
});
|
|
||||||
|
|
||||||
if (folderFiles.length === 0) {
|
if (!folderName) {
|
||||||
|
addFiles(allFiles);
|
||||||
|
e.target.value = "";
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const entries: FolderEntry[] = allFiles
|
||||||
|
.filter((f) => {
|
||||||
|
const ext = f.name.includes(".") ? `.${f.name.split(".").pop()?.toLowerCase()}` : "";
|
||||||
|
return ext !== "" && supportedExtensionsSet.has(ext);
|
||||||
|
})
|
||||||
|
.map((f) => ({
|
||||||
|
id: crypto.randomUUID?.() ?? `file-${Date.now()}-${Math.random().toString(36)}`,
|
||||||
|
file: f,
|
||||||
|
relativePath: f.webkitRelativePath.substring(folderName.length + 1),
|
||||||
|
}));
|
||||||
|
|
||||||
|
if (entries.length === 0) {
|
||||||
toast.error(t("no_supported_files_in_folder"));
|
toast.error(t("no_supported_files_in_folder"));
|
||||||
e.target.value = "";
|
e.target.value = "";
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
addFiles(folderFiles);
|
setFiles([]);
|
||||||
|
setFolderUpload({ folderName, entries });
|
||||||
e.target.value = "";
|
e.target.value = "";
|
||||||
},
|
},
|
||||||
[addFiles, supportedExtensionsSet, t]
|
[addFiles, supportedExtensionsSet, t]
|
||||||
|
|
@ -192,9 +286,18 @@ export function DocumentUploadTab({
|
||||||
return `${parseFloat((bytes / k ** i).toFixed(2))} ${sizes[i]}`;
|
return `${parseFloat((bytes / k ** i).toFixed(2))} ${sizes[i]}`;
|
||||||
};
|
};
|
||||||
|
|
||||||
const totalFileSize = files.reduce((total, entry) => total + entry.file.size, 0);
|
const totalFileSize = folderUpload
|
||||||
|
? folderUpload.entries.reduce((total, entry) => total + entry.file.size, 0)
|
||||||
|
: files.reduce((total, entry) => total + entry.file.size, 0);
|
||||||
|
|
||||||
const hasContent = files.length > 0;
|
const fileCount = folderUpload ? folderUpload.entries.length : files.length;
|
||||||
|
const hasContent = files.length > 0 || folderUpload !== null;
|
||||||
|
const isAnyUploading = isUploading || isFolderUploading;
|
||||||
|
|
||||||
|
const folderTreeItems = useMemo(() => {
|
||||||
|
if (!folderUpload) return [];
|
||||||
|
return flattenTree(buildFolderTree(folderUpload.entries));
|
||||||
|
}, [folderUpload]);
|
||||||
|
|
||||||
const handleAccordionChange = useCallback(
|
const handleAccordionChange = useCallback(
|
||||||
(value: string) => {
|
(value: string) => {
|
||||||
|
|
@ -204,7 +307,95 @@ export function DocumentUploadTab({
|
||||||
[onAccordionStateChange]
|
[onAccordionStateChange]
|
||||||
);
|
);
|
||||||
|
|
||||||
|
const handleFolderUpload = async () => {
|
||||||
|
if (!folderUpload) return;
|
||||||
|
|
||||||
|
setUploadProgress(0);
|
||||||
|
setIsFolderUploading(true);
|
||||||
|
const total = folderUpload.entries.length;
|
||||||
|
trackDocumentUploadStarted(Number(searchSpaceId), total, totalFileSize);
|
||||||
|
|
||||||
|
try {
|
||||||
|
const batches: FolderEntry[][] = [];
|
||||||
|
let currentBatch: FolderEntry[] = [];
|
||||||
|
let currentSize = 0;
|
||||||
|
|
||||||
|
for (const entry of folderUpload.entries) {
|
||||||
|
const size = entry.file.size;
|
||||||
|
|
||||||
|
if (size >= FOLDER_BATCH_SIZE_BYTES) {
|
||||||
|
if (currentBatch.length > 0) {
|
||||||
|
batches.push(currentBatch);
|
||||||
|
currentBatch = [];
|
||||||
|
currentSize = 0;
|
||||||
|
}
|
||||||
|
batches.push([entry]);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (
|
||||||
|
currentBatch.length >= FOLDER_BATCH_MAX_FILES ||
|
||||||
|
currentSize + size > FOLDER_BATCH_SIZE_BYTES
|
||||||
|
) {
|
||||||
|
batches.push(currentBatch);
|
||||||
|
currentBatch = [];
|
||||||
|
currentSize = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
currentBatch.push(entry);
|
||||||
|
currentSize += size;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (currentBatch.length > 0) {
|
||||||
|
batches.push(currentBatch);
|
||||||
|
}
|
||||||
|
|
||||||
|
let rootFolderId: number | null = null;
|
||||||
|
let uploaded = 0;
|
||||||
|
|
||||||
|
for (const batch of batches) {
|
||||||
|
const result = await documentsApiService.folderUploadFiles(
|
||||||
|
batch.map((e) => e.file),
|
||||||
|
{
|
||||||
|
folder_name: folderUpload.folderName,
|
||||||
|
search_space_id: Number(searchSpaceId),
|
||||||
|
relative_paths: batch.map((e) => e.relativePath),
|
||||||
|
root_folder_id: rootFolderId,
|
||||||
|
enable_summary: shouldSummarize,
|
||||||
|
use_vision_llm: useVisionLlm,
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
if (result.root_folder_id && !rootFolderId) {
|
||||||
|
rootFolderId = result.root_folder_id;
|
||||||
|
}
|
||||||
|
|
||||||
|
uploaded += batch.length;
|
||||||
|
setUploadProgress(Math.round((uploaded / total) * 100));
|
||||||
|
}
|
||||||
|
|
||||||
|
trackDocumentUploadSuccess(Number(searchSpaceId), total);
|
||||||
|
toast(t("upload_initiated"), { description: t("upload_initiated_desc") });
|
||||||
|
setFolderUpload(null);
|
||||||
|
onSuccess?.();
|
||||||
|
} catch (error) {
|
||||||
|
const message = error instanceof Error ? error.message : "Upload failed";
|
||||||
|
trackDocumentUploadFailure(Number(searchSpaceId), message);
|
||||||
|
toast(t("upload_error"), {
|
||||||
|
description: `${t("upload_error_desc")}: ${message}`,
|
||||||
|
});
|
||||||
|
} finally {
|
||||||
|
setIsFolderUploading(false);
|
||||||
|
setUploadProgress(0);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
const handleUpload = async () => {
|
const handleUpload = async () => {
|
||||||
|
if (folderUpload) {
|
||||||
|
await handleFolderUpload();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
setUploadProgress(0);
|
setUploadProgress(0);
|
||||||
trackDocumentUploadStarted(Number(searchSpaceId), files.length, totalFileSize);
|
trackDocumentUploadStarted(Number(searchSpaceId), files.length, totalFileSize);
|
||||||
|
|
||||||
|
|
@ -218,6 +409,7 @@ export function DocumentUploadTab({
|
||||||
files: rawFiles,
|
files: rawFiles,
|
||||||
search_space_id: Number(searchSpaceId),
|
search_space_id: Number(searchSpaceId),
|
||||||
should_summarize: shouldSummarize,
|
should_summarize: shouldSummarize,
|
||||||
|
use_vision_llm: useVisionLlm,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
onSuccess: () => {
|
onSuccess: () => {
|
||||||
|
|
@ -341,28 +533,35 @@ export function DocumentUploadTab({
|
||||||
</button>
|
</button>
|
||||||
)
|
)
|
||||||
) : (
|
) : (
|
||||||
<button
|
<div
|
||||||
type="button"
|
role="button"
|
||||||
className="flex flex-col items-center gap-4 py-12 px-4 cursor-pointer w-full bg-transparent border-none"
|
tabIndex={0}
|
||||||
onClick={() => {
|
className="flex flex-col items-center gap-4 py-12 px-4 cursor-pointer w-full bg-transparent outline-none select-none"
|
||||||
|
onClick={() => {
|
||||||
|
if (!isElectron) fileInputRef.current?.click();
|
||||||
|
}}
|
||||||
|
onKeyDown={(e) => {
|
||||||
|
if (e.key === "Enter" || e.key === " ") {
|
||||||
|
e.preventDefault();
|
||||||
if (!isElectron) fileInputRef.current?.click();
|
if (!isElectron) fileInputRef.current?.click();
|
||||||
}}
|
}
|
||||||
|
}}
|
||||||
|
>
|
||||||
|
<Upload className="h-10 w-10 text-muted-foreground" />
|
||||||
|
<div className="text-center space-y-1.5">
|
||||||
|
<p className="text-base font-medium">
|
||||||
|
{isElectron ? t("select_files_or_folder") : t("tap_select_files_or_folder")}
|
||||||
|
</p>
|
||||||
|
<p className="text-sm text-muted-foreground">{t("file_size_limit")}</p>
|
||||||
|
</div>
|
||||||
|
<fieldset
|
||||||
|
className="w-full mt-1 border-none p-0 m-0"
|
||||||
|
onClick={(e) => e.stopPropagation()}
|
||||||
|
onKeyDown={(e) => e.stopPropagation()}
|
||||||
>
|
>
|
||||||
<Upload className="h-10 w-10 text-muted-foreground" />
|
{renderBrowseButton({ fullWidth: true })}
|
||||||
<div className="text-center space-y-1.5">
|
</fieldset>
|
||||||
<p className="text-base font-medium">
|
</div>
|
||||||
{isElectron ? "Select files or folder" : "Tap to select files or folder"}
|
|
||||||
</p>
|
|
||||||
<p className="text-sm text-muted-foreground">{t("file_size_limit")}</p>
|
|
||||||
</div>
|
|
||||||
<fieldset
|
|
||||||
className="w-full mt-1 border-none p-0 m-0"
|
|
||||||
onClick={(e) => e.stopPropagation()}
|
|
||||||
onKeyDown={(e) => e.stopPropagation()}
|
|
||||||
>
|
|
||||||
{renderBrowseButton({ fullWidth: true })}
|
|
||||||
</fieldset>
|
|
||||||
</button>
|
|
||||||
)}
|
)}
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
|
@ -398,55 +597,92 @@ export function DocumentUploadTab({
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
{/* FILES SELECTED */}
|
{/* FILES SELECTED */}
|
||||||
{files.length > 0 && (
|
{hasContent && (
|
||||||
<div className="rounded-lg border border-border p-3 space-y-2">
|
<div className="rounded-lg border border-border p-3 space-y-2">
|
||||||
<div className="flex items-center justify-between">
|
<div className="flex items-center justify-between">
|
||||||
<p className="text-sm font-medium">
|
<p className="text-sm font-medium">
|
||||||
{t("selected_files", { count: files.length })}
|
{folderUpload ? (
|
||||||
<Dot className="inline h-4 w-4" />
|
<>
|
||||||
{formatFileSize(totalFileSize)}
|
<FolderOpen className="inline h-4 w-4 mr-1 -mt-0.5" />
|
||||||
|
{folderUpload.folderName}
|
||||||
|
<Dot className="inline h-4 w-4" />
|
||||||
|
{folderUpload.entries.length}{" "}
|
||||||
|
{folderUpload.entries.length === 1 ? "file" : "files"}
|
||||||
|
<Dot className="inline h-4 w-4" />
|
||||||
|
{formatFileSize(totalFileSize)}
|
||||||
|
</>
|
||||||
|
) : (
|
||||||
|
<>
|
||||||
|
{t("selected_files", { count: files.length })}
|
||||||
|
<Dot className="inline h-4 w-4" />
|
||||||
|
{formatFileSize(totalFileSize)}
|
||||||
|
</>
|
||||||
|
)}
|
||||||
</p>
|
</p>
|
||||||
<Button
|
<Button
|
||||||
variant="ghost"
|
variant="ghost"
|
||||||
size="sm"
|
size="sm"
|
||||||
className="h-7 text-xs text-muted-foreground hover:text-foreground"
|
className="h-7 text-xs text-muted-foreground hover:text-foreground"
|
||||||
onClick={() => setFiles([])}
|
onClick={() => {
|
||||||
disabled={isUploading}
|
setFiles([]);
|
||||||
|
setFolderUpload(null);
|
||||||
|
}}
|
||||||
|
disabled={isAnyUploading}
|
||||||
>
|
>
|
||||||
{t("clear_all")}
|
{t("clear_all")}
|
||||||
</Button>
|
</Button>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div className="max-h-[160px] sm:max-h-[200px] overflow-y-auto -mx-1">
|
<div className="max-h-[160px] sm:max-h-[200px] overflow-y-auto -mx-1">
|
||||||
{files.map((entry) => (
|
{folderUpload
|
||||||
<div
|
? folderTreeItems.map((item, i) => (
|
||||||
key={entry.id}
|
<div
|
||||||
className="flex items-center gap-2 py-1.5 px-2 rounded-md hover:bg-slate-400/5 dark:hover:bg-white/5 group"
|
key={`${item.depth}-${i}-${item.name}`}
|
||||||
>
|
className="flex items-center gap-1.5 py-0.5 px-2"
|
||||||
<span className="text-[10px] font-medium uppercase leading-none bg-muted px-1.5 py-0.5 rounded text-muted-foreground shrink-0">
|
style={{ paddingLeft: `${item.depth * 16 + 8}px` }}
|
||||||
{entry.file.name.split(".").pop() || "?"}
|
>
|
||||||
</span>
|
{item.isFolder ? (
|
||||||
<span className="text-sm truncate flex-1 min-w-0">{entry.file.name}</span>
|
<FolderOpen className="h-3.5 w-3.5 text-blue-400 shrink-0" />
|
||||||
<span className="text-xs text-muted-foreground shrink-0">
|
) : (
|
||||||
{formatFileSize(entry.file.size)}
|
<FileIcon className="h-3.5 w-3.5 text-muted-foreground shrink-0" />
|
||||||
</span>
|
)}
|
||||||
<Button
|
<span className="text-sm truncate flex-1 min-w-0">{item.name}</span>
|
||||||
variant="ghost"
|
{!item.isFolder && item.size != null && (
|
||||||
size="icon"
|
<span className="text-xs text-muted-foreground shrink-0">
|
||||||
className="h-6 w-6 shrink-0"
|
{formatFileSize(item.size)}
|
||||||
onClick={() => setFiles((prev) => prev.filter((e) => e.id !== entry.id))}
|
</span>
|
||||||
disabled={isUploading}
|
)}
|
||||||
>
|
</div>
|
||||||
<X className="h-3 w-3" />
|
))
|
||||||
</Button>
|
: files.map((entry) => (
|
||||||
</div>
|
<div
|
||||||
))}
|
key={entry.id}
|
||||||
|
className="flex items-center gap-2 py-1.5 px-2 rounded-md hover:bg-slate-400/5 dark:hover:bg-white/5 group"
|
||||||
|
>
|
||||||
|
<span className="text-[10px] font-medium uppercase leading-none bg-muted px-1.5 py-0.5 rounded text-muted-foreground shrink-0">
|
||||||
|
{entry.file.name.split(".").pop() || "?"}
|
||||||
|
</span>
|
||||||
|
<span className="text-sm truncate flex-1 min-w-0">{entry.file.name}</span>
|
||||||
|
<span className="text-xs text-muted-foreground shrink-0">
|
||||||
|
{formatFileSize(entry.file.size)}
|
||||||
|
</span>
|
||||||
|
<Button
|
||||||
|
variant="ghost"
|
||||||
|
size="icon"
|
||||||
|
className="h-6 w-6 shrink-0"
|
||||||
|
onClick={() => setFiles((prev) => prev.filter((e) => e.id !== entry.id))}
|
||||||
|
disabled={isAnyUploading}
|
||||||
|
>
|
||||||
|
<X className="h-3 w-3" />
|
||||||
|
</Button>
|
||||||
|
</div>
|
||||||
|
))}
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
{isUploading && (
|
{isAnyUploading && (
|
||||||
<div className="space-y-1">
|
<div className="space-y-1">
|
||||||
<div className="flex items-center justify-between text-xs">
|
<div className="flex items-center justify-between text-xs">
|
||||||
<span>{t("uploading_files")}</span>
|
<span>{folderUpload ? t("uploading_folder") : t("uploading_files")}</span>
|
||||||
<span>{Math.round(uploadProgress)}%</span>
|
<span>{Math.round(uploadProgress)}%</span>
|
||||||
</div>
|
</div>
|
||||||
<Progress value={uploadProgress} className="h-1.5" />
|
<Progress value={uploadProgress} className="h-1.5" />
|
||||||
|
|
@ -463,19 +699,31 @@ export function DocumentUploadTab({
|
||||||
<Switch checked={shouldSummarize} onCheckedChange={setShouldSummarize} />
|
<Switch checked={shouldSummarize} onCheckedChange={setShouldSummarize} />
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<div className={toggleRowClass}>
|
||||||
|
<div className="space-y-0.5">
|
||||||
|
<p className="font-medium text-sm">Enable Vision LLM</p>
|
||||||
|
<p className="text-xs text-muted-foreground">
|
||||||
|
Describes images using AI vision (costly, slower)
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
<Switch checked={useVisionLlm} onCheckedChange={setUseVisionLlm} />
|
||||||
|
</div>
|
||||||
|
|
||||||
<Button
|
<Button
|
||||||
className="w-full"
|
className="w-full"
|
||||||
onClick={handleUpload}
|
onClick={handleUpload}
|
||||||
disabled={isUploading || files.length === 0}
|
disabled={isAnyUploading || fileCount === 0}
|
||||||
>
|
>
|
||||||
{isUploading ? (
|
{isAnyUploading ? (
|
||||||
<span className="flex items-center gap-2">
|
<span className="flex items-center gap-2">
|
||||||
<Spinner size="sm" />
|
<Spinner size="sm" />
|
||||||
{t("uploading")}
|
{t("uploading")}
|
||||||
</span>
|
</span>
|
||||||
) : (
|
) : (
|
||||||
<span className="flex items-center gap-2">
|
<span className="flex items-center gap-2">
|
||||||
{t("upload_button", { count: files.length })}
|
{folderUpload
|
||||||
|
? t("upload_folder_button", { count: fileCount })
|
||||||
|
: t("upload_button", { count: fileCount })}
|
||||||
</span>
|
</span>
|
||||||
)}
|
)}
|
||||||
</Button>
|
</Button>
|
||||||
|
|
|
||||||
|
|
@ -44,6 +44,7 @@ export const searchSourceConnector = z.object({
|
||||||
last_indexed_at: z.string().nullable(),
|
last_indexed_at: z.string().nullable(),
|
||||||
config: z.record(z.string(), z.any()),
|
config: z.record(z.string(), z.any()),
|
||||||
enable_summary: z.boolean().default(false),
|
enable_summary: z.boolean().default(false),
|
||||||
|
enable_vision_llm: z.boolean().default(false),
|
||||||
periodic_indexing_enabled: z.boolean(),
|
periodic_indexing_enabled: z.boolean(),
|
||||||
indexing_frequency_minutes: z.number().nullable(),
|
indexing_frequency_minutes: z.number().nullable(),
|
||||||
next_scheduled_at: z.string().nullable(),
|
next_scheduled_at: z.string().nullable(),
|
||||||
|
|
@ -98,6 +99,7 @@ export const createConnectorRequest = z.object({
|
||||||
last_indexed_at: true,
|
last_indexed_at: true,
|
||||||
config: true,
|
config: true,
|
||||||
enable_summary: true,
|
enable_summary: true,
|
||||||
|
enable_vision_llm: true,
|
||||||
periodic_indexing_enabled: true,
|
periodic_indexing_enabled: true,
|
||||||
indexing_frequency_minutes: true,
|
indexing_frequency_minutes: true,
|
||||||
next_scheduled_at: true,
|
next_scheduled_at: true,
|
||||||
|
|
@ -123,6 +125,7 @@ export const updateConnectorRequest = z.object({
|
||||||
last_indexed_at: true,
|
last_indexed_at: true,
|
||||||
config: true,
|
config: true,
|
||||||
enable_summary: true,
|
enable_summary: true,
|
||||||
|
enable_vision_llm: true,
|
||||||
periodic_indexing_enabled: true,
|
periodic_indexing_enabled: true,
|
||||||
indexing_frequency_minutes: true,
|
indexing_frequency_minutes: true,
|
||||||
next_scheduled_at: true,
|
next_scheduled_at: true,
|
||||||
|
|
|
||||||
|
|
@ -148,6 +148,7 @@ export const uploadDocumentRequest = z.object({
|
||||||
files: z.array(z.instanceof(File)),
|
files: z.array(z.instanceof(File)),
|
||||||
search_space_id: z.number(),
|
search_space_id: z.number(),
|
||||||
should_summarize: z.boolean().default(false),
|
should_summarize: z.boolean().default(false),
|
||||||
|
use_vision_llm: z.boolean().default(false),
|
||||||
});
|
});
|
||||||
|
|
||||||
export const uploadDocumentResponse = z.object({
|
export const uploadDocumentResponse = z.object({
|
||||||
|
|
|
||||||
|
|
@ -127,7 +127,7 @@ class DocumentsApiService {
|
||||||
throw new ValidationError(`Invalid request: ${errorMessage}`);
|
throw new ValidationError(`Invalid request: ${errorMessage}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
const { files, search_space_id, should_summarize } = parsedRequest.data;
|
const { files, search_space_id, should_summarize, use_vision_llm } = parsedRequest.data;
|
||||||
const UPLOAD_BATCH_SIZE = 5;
|
const UPLOAD_BATCH_SIZE = 5;
|
||||||
|
|
||||||
const batches: File[][] = [];
|
const batches: File[][] = [];
|
||||||
|
|
@ -146,6 +146,7 @@ class DocumentsApiService {
|
||||||
for (const file of batch) formData.append("files", file);
|
for (const file of batch) formData.append("files", file);
|
||||||
formData.append("search_space_id", String(search_space_id));
|
formData.append("search_space_id", String(search_space_id));
|
||||||
formData.append("should_summarize", String(should_summarize));
|
formData.append("should_summarize", String(should_summarize));
|
||||||
|
formData.append("use_vision_llm", String(use_vision_llm));
|
||||||
|
|
||||||
const controller = new AbortController();
|
const controller = new AbortController();
|
||||||
const timeoutId = setTimeout(() => controller.abort(), 120_000);
|
const timeoutId = setTimeout(() => controller.abort(), 120_000);
|
||||||
|
|
@ -442,6 +443,7 @@ class DocumentsApiService {
|
||||||
relative_paths: string[];
|
relative_paths: string[];
|
||||||
root_folder_id?: number | null;
|
root_folder_id?: number | null;
|
||||||
enable_summary?: boolean;
|
enable_summary?: boolean;
|
||||||
|
use_vision_llm?: boolean;
|
||||||
},
|
},
|
||||||
signal?: AbortSignal
|
signal?: AbortSignal
|
||||||
): Promise<{ message: string; status: string; root_folder_id: number; file_count: number }> => {
|
): Promise<{ message: string; status: string; root_folder_id: number; file_count: number }> => {
|
||||||
|
|
@ -456,6 +458,7 @@ class DocumentsApiService {
|
||||||
formData.append("root_folder_id", String(metadata.root_folder_id));
|
formData.append("root_folder_id", String(metadata.root_folder_id));
|
||||||
}
|
}
|
||||||
formData.append("enable_summary", String(metadata.enable_summary ?? false));
|
formData.append("enable_summary", String(metadata.enable_summary ?? false));
|
||||||
|
formData.append("use_vision_llm", String(metadata.use_vision_llm ?? false));
|
||||||
|
|
||||||
const totalSize = files.reduce((acc, f) => acc + f.size, 0);
|
const totalSize = files.reduce((acc, f) => acc + f.size, 0);
|
||||||
const timeoutMs = Math.min(Math.max((totalSize / (1024 * 1024)) * 5000, 30_000), 600_000);
|
const timeoutMs = Math.min(Math.max((totalSize / (1024 * 1024)) * 5000, 30_000), 600_000);
|
||||||
|
|
|
||||||
|
|
@ -396,7 +396,11 @@
|
||||||
"supported_file_types": "Supported File Types",
|
"supported_file_types": "Supported File Types",
|
||||||
"file_too_large": "File Too Large",
|
"file_too_large": "File Too Large",
|
||||||
"file_too_large_desc": "\"{name}\" exceeds the {maxMB}MB per-file limit.",
|
"file_too_large_desc": "\"{name}\" exceeds the {maxMB}MB per-file limit.",
|
||||||
"no_supported_files_in_folder": "No supported file types found in the selected folder."
|
"no_supported_files_in_folder": "No supported file types found in the selected folder.",
|
||||||
|
"uploading_folder": "Uploading folder…",
|
||||||
|
"upload_folder_button": "Upload Folder ({count} {count, plural, one {file} other {files}})",
|
||||||
|
"select_files_or_folder": "Select files or folder",
|
||||||
|
"tap_select_files_or_folder": "Tap to select files or folder"
|
||||||
},
|
},
|
||||||
"add_webpage": {
|
"add_webpage": {
|
||||||
"title": "Add Webpages for Crawling",
|
"title": "Add Webpages for Crawling",
|
||||||
|
|
|
||||||
|
|
@ -396,7 +396,11 @@
|
||||||
"supported_file_types": "Tipos de archivo soportados",
|
"supported_file_types": "Tipos de archivo soportados",
|
||||||
"file_too_large": "Archivo demasiado grande",
|
"file_too_large": "Archivo demasiado grande",
|
||||||
"file_too_large_desc": "\"{name}\" excede el límite de {maxMB} MB por archivo.",
|
"file_too_large_desc": "\"{name}\" excede el límite de {maxMB} MB por archivo.",
|
||||||
"no_supported_files_in_folder": "No se encontraron tipos de archivo compatibles en la carpeta seleccionada."
|
"no_supported_files_in_folder": "No se encontraron tipos de archivo compatibles en la carpeta seleccionada.",
|
||||||
|
"uploading_folder": "Subiendo carpeta…",
|
||||||
|
"upload_folder_button": "Subir carpeta ({count} {count, plural, one {archivo} other {archivos}})",
|
||||||
|
"select_files_or_folder": "Seleccionar archivos o carpeta",
|
||||||
|
"tap_select_files_or_folder": "Toca para seleccionar archivos o carpeta"
|
||||||
},
|
},
|
||||||
"add_webpage": {
|
"add_webpage": {
|
||||||
"title": "Agregar páginas web para rastreo",
|
"title": "Agregar páginas web para rastreo",
|
||||||
|
|
|
||||||
|
|
@ -396,7 +396,11 @@
|
||||||
"supported_file_types": "समर्थित फ़ाइल प्रकार",
|
"supported_file_types": "समर्थित फ़ाइल प्रकार",
|
||||||
"file_too_large": "फ़ाइल बहुत बड़ी है",
|
"file_too_large": "फ़ाइल बहुत बड़ी है",
|
||||||
"file_too_large_desc": "\"{name}\" प्रति फ़ाइल {maxMB}MB की सीमा से अधिक है।",
|
"file_too_large_desc": "\"{name}\" प्रति फ़ाइल {maxMB}MB की सीमा से अधिक है।",
|
||||||
"no_supported_files_in_folder": "चयनित फ़ोल्डर में कोई समर्थित फ़ाइल प्रकार नहीं मिला।"
|
"no_supported_files_in_folder": "चयनित फ़ोल्डर में कोई समर्थित फ़ाइल प्रकार नहीं मिला।",
|
||||||
|
"uploading_folder": "फ़ोल्डर अपलोड हो रहा है…",
|
||||||
|
"upload_folder_button": "फ़ोल्डर अपलोड करें ({count} {count, plural, one {फ़ाइल} other {फ़ाइलें}})",
|
||||||
|
"select_files_or_folder": "फ़ाइलें या फ़ोल्डर चुनें",
|
||||||
|
"tap_select_files_or_folder": "फ़ाइलें या फ़ोल्डर चुनने के लिए टैप करें"
|
||||||
},
|
},
|
||||||
"add_webpage": {
|
"add_webpage": {
|
||||||
"title": "क्रॉलिंग के लिए वेबपेज जोड़ें",
|
"title": "क्रॉलिंग के लिए वेबपेज जोड़ें",
|
||||||
|
|
|
||||||
|
|
@ -396,7 +396,11 @@
|
||||||
"supported_file_types": "Tipos de arquivo suportados",
|
"supported_file_types": "Tipos de arquivo suportados",
|
||||||
"file_too_large": "Arquivo muito grande",
|
"file_too_large": "Arquivo muito grande",
|
||||||
"file_too_large_desc": "\"{name}\" excede o limite de {maxMB} MB por arquivo.",
|
"file_too_large_desc": "\"{name}\" excede o limite de {maxMB} MB por arquivo.",
|
||||||
"no_supported_files_in_folder": "Nenhum tipo de arquivo suportado encontrado na pasta selecionada."
|
"no_supported_files_in_folder": "Nenhum tipo de arquivo suportado encontrado na pasta selecionada.",
|
||||||
|
"uploading_folder": "Enviando pasta…",
|
||||||
|
"upload_folder_button": "Enviar pasta ({count} {count, plural, one {arquivo} other {arquivos}})",
|
||||||
|
"select_files_or_folder": "Selecionar arquivos ou pasta",
|
||||||
|
"tap_select_files_or_folder": "Toque para selecionar arquivos ou pasta"
|
||||||
},
|
},
|
||||||
"add_webpage": {
|
"add_webpage": {
|
||||||
"title": "Adicionar páginas web para rastreamento",
|
"title": "Adicionar páginas web para rastreamento",
|
||||||
|
|
|
||||||
|
|
@ -380,7 +380,11 @@
|
||||||
"supported_file_types": "支持的文件类型",
|
"supported_file_types": "支持的文件类型",
|
||||||
"file_too_large": "文件过大",
|
"file_too_large": "文件过大",
|
||||||
"file_too_large_desc": "\"{name}\" 超过了每个文件 {maxMB}MB 的限制。",
|
"file_too_large_desc": "\"{name}\" 超过了每个文件 {maxMB}MB 的限制。",
|
||||||
"no_supported_files_in_folder": "所选文件夹中没有找到支持的文件类型。"
|
"no_supported_files_in_folder": "所选文件夹中没有找到支持的文件类型。",
|
||||||
|
"uploading_folder": "正在上传文件夹…",
|
||||||
|
"upload_folder_button": "上传文件夹({count}个文件)",
|
||||||
|
"select_files_or_folder": "选择文件或文件夹",
|
||||||
|
"tap_select_files_or_folder": "点击选择文件或文件夹"
|
||||||
},
|
},
|
||||||
"add_webpage": {
|
"add_webpage": {
|
||||||
"title": "添加网页爬取",
|
"title": "添加网页爬取",
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue