From 57ae2bd5afe085274b5fd6993b9190b992fc7a2e Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 11:18:56 +0200 Subject: [PATCH 01/29] feat: preserve folder structure on web folder upload --- .../components/sources/DocumentUploadTab.tsx | 312 +++++++++++++++--- 1 file changed, 270 insertions(+), 42 deletions(-) diff --git a/surfsense_web/components/sources/DocumentUploadTab.tsx b/surfsense_web/components/sources/DocumentUploadTab.tsx index 124354a49..5fc8e3fd3 100644 --- a/surfsense_web/components/sources/DocumentUploadTab.tsx +++ b/surfsense_web/components/sources/DocumentUploadTab.tsx @@ -26,6 +26,7 @@ import { Progress } from "@/components/ui/progress"; import { Spinner } from "@/components/ui/spinner"; import { Switch } from "@/components/ui/switch"; import { useElectronAPI } from "@/hooks/use-platform"; +import { documentsApiService } from "@/lib/apis/documents-api.service"; import { trackDocumentUploadFailure, trackDocumentUploadStarted, @@ -48,6 +49,77 @@ interface FileWithId { file: File; } +interface FolderEntry { + id: string; + file: File; + relativePath: string; +} + +interface FolderUploadData { + folderName: string; + entries: FolderEntry[]; +} + +interface FolderTreeNode { + name: string; + isFolder: boolean; + size?: number; + children: FolderTreeNode[]; +} + +function buildFolderTree(entries: FolderEntry[]): FolderTreeNode[] { + const root: FolderTreeNode = { name: "", isFolder: true, children: [] }; + + for (const entry of entries) { + const parts = entry.relativePath.split("/"); + let current = root; + + for (let i = 0; i < parts.length - 1; i++) { + let child = current.children.find((c) => c.name === parts[i] && c.isFolder); + if (!child) { + child = { name: parts[i], isFolder: true, children: [] }; + current.children.push(child); + } + current = child; + } + + current.children.push({ + name: parts[parts.length - 1], + isFolder: false, + size: entry.file.size, + children: [], + }); + } + + function sortNodes(node: FolderTreeNode) { + 
node.children.sort((a, b) => { + if (a.isFolder !== b.isFolder) return a.isFolder ? -1 : 1; + return a.name.localeCompare(b.name); + }); + for (const child of node.children) sortNodes(child); + } + sortNodes(root); + + return root.children; +} + +function flattenTree( + nodes: FolderTreeNode[], + depth = 0 +): { name: string; isFolder: boolean; depth: number; size?: number }[] { + const items: { name: string; isFolder: boolean; depth: number; size?: number }[] = []; + for (const node of nodes) { + items.push({ name: node.name, isFolder: node.isFolder, depth, size: node.size }); + if (node.isFolder && node.children.length > 0) { + items.push(...flattenTree(node.children, depth + 1)); + } + } + return items; +} + +const FOLDER_BATCH_SIZE_BYTES = 20 * 1024 * 1024; +const FOLDER_BATCH_MAX_FILES = 10; + const MAX_FILE_SIZE_MB = 500; const MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024; @@ -69,6 +141,8 @@ export function DocumentUploadTab({ const fileInputRef = useRef(null); const folderInputRef = useRef(null); const progressIntervalRef = useRef | null>(null); + const [folderUpload, setFolderUpload] = useState(null); + const [isFolderUploading, setIsFolderUploading] = useState(false); useEffect(() => { return () => { @@ -105,6 +179,7 @@ export function DocumentUploadTab({ const valid = incoming.filter((f) => f.size <= MAX_FILE_SIZE_BYTES); if (valid.length === 0) return; + setFolderUpload(null); setFiles((prev) => { const newEntries = valid.map((f) => ({ id: crypto.randomUUID?.() ?? 
`file-${Date.now()}-${Math.random().toString(36)}`, @@ -159,6 +234,7 @@ export function DocumentUploadTab({ file: new File([fd.data], fd.name, { type: fd.mimeType }), }) ); + setFolderUpload(null); setFiles((prev) => [...prev, ...newFiles]); }, [electronAPI, supportedExtensionsSet, t]); @@ -167,18 +243,35 @@ export function DocumentUploadTab({ const fileList = e.target.files; if (!fileList || fileList.length === 0) return; - const folderFiles = Array.from(fileList).filter((f) => { - const ext = f.name.includes(".") ? `.${f.name.split(".").pop()?.toLowerCase()}` : ""; - return ext !== "" && supportedExtensionsSet.has(ext); - }); + const allFiles = Array.from(fileList); + const firstPath = allFiles[0]?.webkitRelativePath || ""; + const folderName = firstPath.split("/")[0]; - if (folderFiles.length === 0) { + if (!folderName) { + addFiles(allFiles); + e.target.value = ""; + return; + } + + const entries: FolderEntry[] = allFiles + .filter((f) => { + const ext = f.name.includes(".") ? `.${f.name.split(".").pop()?.toLowerCase()}` : ""; + return ext !== "" && supportedExtensionsSet.has(ext); + }) + .map((f) => ({ + id: crypto.randomUUID?.() ?? `file-${Date.now()}-${Math.random().toString(36)}`, + file: f, + relativePath: f.webkitRelativePath.substring(folderName.length + 1), + })); + + if (entries.length === 0) { toast.error(t("no_supported_files_in_folder")); e.target.value = ""; return; } - addFiles(folderFiles); + setFiles([]); + setFolderUpload({ folderName, entries }); e.target.value = ""; }, [addFiles, supportedExtensionsSet, t] @@ -192,9 +285,18 @@ export function DocumentUploadTab({ return `${parseFloat((bytes / k ** i).toFixed(2))} ${sizes[i]}`; }; - const totalFileSize = files.reduce((total, entry) => total + entry.file.size, 0); + const totalFileSize = folderUpload + ? 
folderUpload.entries.reduce((total, entry) => total + entry.file.size, 0) + : files.reduce((total, entry) => total + entry.file.size, 0); - const hasContent = files.length > 0; + const fileCount = folderUpload ? folderUpload.entries.length : files.length; + const hasContent = files.length > 0 || folderUpload !== null; + const isAnyUploading = isUploading || isFolderUploading; + + const folderTreeItems = useMemo(() => { + if (!folderUpload) return []; + return flattenTree(buildFolderTree(folderUpload.entries)); + }, [folderUpload]); const handleAccordionChange = useCallback( (value: string) => { @@ -204,7 +306,94 @@ export function DocumentUploadTab({ [onAccordionStateChange] ); + const handleFolderUpload = async () => { + if (!folderUpload) return; + + setUploadProgress(0); + setIsFolderUploading(true); + const total = folderUpload.entries.length; + trackDocumentUploadStarted(Number(searchSpaceId), total, totalFileSize); + + try { + const batches: FolderEntry[][] = []; + let currentBatch: FolderEntry[] = []; + let currentSize = 0; + + for (const entry of folderUpload.entries) { + const size = entry.file.size; + + if (size >= FOLDER_BATCH_SIZE_BYTES) { + if (currentBatch.length > 0) { + batches.push(currentBatch); + currentBatch = []; + currentSize = 0; + } + batches.push([entry]); + continue; + } + + if ( + currentBatch.length >= FOLDER_BATCH_MAX_FILES || + currentSize + size > FOLDER_BATCH_SIZE_BYTES + ) { + batches.push(currentBatch); + currentBatch = []; + currentSize = 0; + } + + currentBatch.push(entry); + currentSize += size; + } + + if (currentBatch.length > 0) { + batches.push(currentBatch); + } + + let rootFolderId: number | null = null; + let uploaded = 0; + + for (const batch of batches) { + const result = await documentsApiService.folderUploadFiles( + batch.map((e) => e.file), + { + folder_name: folderUpload.folderName, + search_space_id: Number(searchSpaceId), + relative_paths: batch.map((e) => e.relativePath), + root_folder_id: rootFolderId, + 
enable_summary: shouldSummarize, + } + ); + + if (result.root_folder_id && !rootFolderId) { + rootFolderId = result.root_folder_id; + } + + uploaded += batch.length; + setUploadProgress(Math.round((uploaded / total) * 100)); + } + + trackDocumentUploadSuccess(Number(searchSpaceId), total); + toast(t("upload_initiated"), { description: t("upload_initiated_desc") }); + setFolderUpload(null); + onSuccess?.(); + } catch (error) { + const message = error instanceof Error ? error.message : "Upload failed"; + trackDocumentUploadFailure(Number(searchSpaceId), message); + toast(t("upload_error"), { + description: `${t("upload_error_desc")}: ${message}`, + }); + } finally { + setIsFolderUploading(false); + setUploadProgress(0); + } + }; + const handleUpload = async () => { + if (folderUpload) { + await handleFolderUpload(); + return; + } + setUploadProgress(0); trackDocumentUploadStarted(Number(searchSpaceId), files.length, totalFileSize); @@ -398,55 +587,92 @@ export function DocumentUploadTab({ {/* FILES SELECTED */} - {files.length > 0 && ( + {hasContent && (

- {t("selected_files", { count: files.length })} - - {formatFileSize(totalFileSize)} + {folderUpload ? ( + <> + + {folderUpload.folderName} + + {folderUpload.entries.length}{" "} + {folderUpload.entries.length === 1 ? "file" : "files"} + + {formatFileSize(totalFileSize)} + + ) : ( + <> + {t("selected_files", { count: files.length })} + + {formatFileSize(totalFileSize)} + + )}

- {files.map((entry) => ( -
- - {entry.file.name.split(".").pop() || "?"} - - {entry.file.name} - - {formatFileSize(entry.file.size)} - - -
- ))} + {folderUpload + ? folderTreeItems.map((item, i) => ( +
+ {item.isFolder ? ( + + ) : ( + + )} + {item.name} + {!item.isFolder && item.size != null && ( + + {formatFileSize(item.size)} + + )} +
+ )) + : files.map((entry) => ( +
+ + {entry.file.name.split(".").pop() || "?"} + + {entry.file.name} + + {formatFileSize(entry.file.size)} + + +
+ ))}
- {isUploading && ( + {isAnyUploading && (
- {t("uploading_files")} + {folderUpload ? "Uploading folder…" : t("uploading_files")} {Math.round(uploadProgress)}%
@@ -466,16 +692,18 @@ export function DocumentUploadTab({ From b1fa1279b19886e2a012195fe9195b12c6745cc4 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 12:09:34 +0200 Subject: [PATCH 02/29] feat: add export KB button in documents toolbar --- .../components/documents/DocumentsFilters.tsx | 22 ++++++++++++++++++- .../layout/ui/sidebar/DocumentsSidebar.tsx | 1 + 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/surfsense_web/components/documents/DocumentsFilters.tsx b/surfsense_web/components/documents/DocumentsFilters.tsx index a795b61c7..abd65637c 100644 --- a/surfsense_web/components/documents/DocumentsFilters.tsx +++ b/surfsense_web/components/documents/DocumentsFilters.tsx @@ -1,6 +1,6 @@ "use client"; -import { FolderPlus, ListFilter, Search, Upload, X } from "lucide-react"; +import { Download, FolderPlus, ListFilter, Search, Upload, X } from "lucide-react"; import { useTranslations } from "next-intl"; import React, { useCallback, useMemo, useRef, useState } from "react"; import { useDocumentUploadDialog } from "@/components/assistant-ui/document-upload-popup"; @@ -20,6 +20,7 @@ export function DocumentsFilters({ onToggleType, activeTypes, onCreateFolder, + onExportKB, }: { typeCounts: Partial>; onSearch: (v: string) => void; @@ -27,6 +28,7 @@ export function DocumentsFilters({ onToggleType: (type: DocumentTypeEnum, checked: boolean) => void; activeTypes: DocumentTypeEnum[]; onCreateFolder?: () => void; + onExportKB?: () => void; }) { const t = useTranslations("documents"); const id = React.useId(); @@ -84,6 +86,24 @@ export function DocumentsFilters({ )} + {onExportKB && ( + + + { + e.preventDefault(); + onExportKB(); + }} + > + + + + Export knowledge base + + )} + diff --git a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx index 8b3a119ae..db80c8d8d 100644 --- a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx +++ 
b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx @@ -800,6 +800,7 @@ export function DocumentsSidebar({ onToggleType={onToggleType} activeTypes={activeTypes} onCreateFolder={() => handleCreateFolder(null)} + onExportKB={() => toast("Export KB clicked (placeholder)")} />
From 473eece89680bc740fcafc1c9d491402a5f5cfb9 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 12:10:37 +0200 Subject: [PATCH 03/29] feat: add export route skeleton --- surfsense_backend/app/routes/__init__.py | 2 + surfsense_backend/app/routes/export_routes.py | 38 +++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 surfsense_backend/app/routes/export_routes.py diff --git a/surfsense_backend/app/routes/__init__.py b/surfsense_backend/app/routes/__init__.py index 02367606b..443b8cc93 100644 --- a/surfsense_backend/app/routes/__init__.py +++ b/surfsense_backend/app/routes/__init__.py @@ -13,6 +13,7 @@ from .discord_add_connector_route import router as discord_add_connector_router from .documents_routes import router as documents_router from .dropbox_add_connector_route import router as dropbox_add_connector_router from .editor_routes import router as editor_router +from .export_routes import router as export_router from .folders_routes import router as folders_router from .google_calendar_add_connector_route import ( router as google_calendar_add_connector_router, @@ -57,6 +58,7 @@ router = APIRouter() router.include_router(search_spaces_router) router.include_router(rbac_router) # RBAC routes for roles, members, invites router.include_router(editor_router) +router.include_router(export_router) router.include_router(documents_router) router.include_router(folders_router) router.include_router(notes_router) diff --git a/surfsense_backend/app/routes/export_routes.py b/surfsense_backend/app/routes/export_routes.py new file mode 100644 index 000000000..0bc5b4d1c --- /dev/null +++ b/surfsense_backend/app/routes/export_routes.py @@ -0,0 +1,38 @@ +"""Routes for exporting knowledge base content as ZIP.""" + +import logging + +from fastapi import APIRouter, Depends, Query +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db import Permission, User, get_async_session +from app.users import current_active_user +from app.utils.rbac 
import check_permission + +logger = logging.getLogger(__name__) + +router = APIRouter() + + +@router.get("/search-spaces/{search_space_id}/export") +async def export_knowledge_base( + search_space_id: int, + folder_id: int | None = Query(None, description="Export only this folder's subtree"), + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + """Export documents as a ZIP of markdown files preserving folder structure. + + If folder_id is provided, only that folder's subtree is exported. + Otherwise, the entire search space is exported. + """ + await check_permission( + session, + user, + search_space_id, + Permission.DOCUMENTS_READ.value, + "You don't have permission to export documents in this search space", + ) + + # TODO: implement export logic + return {"message": "Export endpoint placeholder"} From 47f1d7e37359d437c12e55843211888dfb6c2327 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 12:17:43 +0200 Subject: [PATCH 04/29] feat: implement KB export as ZIP with service layer --- surfsense_backend/app/routes/export_routes.py | 39 +++- .../app/services/export_service.py | 168 ++++++++++++++++++ 2 files changed, 199 insertions(+), 8 deletions(-) create mode 100644 surfsense_backend/app/services/export_service.py diff --git a/surfsense_backend/app/routes/export_routes.py b/surfsense_backend/app/routes/export_routes.py index 0bc5b4d1c..641c7fedb 100644 --- a/surfsense_backend/app/routes/export_routes.py +++ b/surfsense_backend/app/routes/export_routes.py @@ -1,11 +1,14 @@ """Routes for exporting knowledge base content as ZIP.""" import logging +import os -from fastapi import APIRouter, Depends, Query +from fastapi import APIRouter, Depends, HTTPException, Query +from fastapi.responses import StreamingResponse from sqlalchemy.ext.asyncio import AsyncSession from app.db import Permission, User, get_async_session +from app.services.export_service import build_export_zip from app.users import 
current_active_user from app.utils.rbac import check_permission @@ -21,11 +24,7 @@ async def export_knowledge_base( session: AsyncSession = Depends(get_async_session), user: User = Depends(current_active_user), ): - """Export documents as a ZIP of markdown files preserving folder structure. - - If folder_id is provided, only that folder's subtree is exported. - Otherwise, the entire search space is exported. - """ + """Export documents as a ZIP of markdown files preserving folder structure.""" await check_permission( session, user, @@ -34,5 +33,29 @@ async def export_knowledge_base( "You don't have permission to export documents in this search space", ) - # TODO: implement export logic - return {"message": "Export endpoint placeholder"} + try: + result = await build_export_zip(session, search_space_id, folder_id) + except ValueError as e: + raise HTTPException(status_code=404, detail=str(e)) from None + + def stream_and_cleanup(): + try: + with open(result.zip_path, "rb") as f: + while chunk := f.read(8192): + yield chunk + finally: + os.unlink(result.zip_path) + + headers = { + "Content-Disposition": f'attachment; filename="{result.export_name}.zip"', + "Content-Length": str(result.zip_size), + } + + if result.skipped_docs: + headers["X-Skipped-Documents"] = str(len(result.skipped_docs)) + + return StreamingResponse( + stream_and_cleanup(), + media_type="application/zip", + headers=headers, + ) diff --git a/surfsense_backend/app/services/export_service.py b/surfsense_backend/app/services/export_service.py new file mode 100644 index 000000000..49f1a127a --- /dev/null +++ b/surfsense_backend/app/services/export_service.py @@ -0,0 +1,168 @@ +"""Service for exporting knowledge base content as a ZIP archive.""" + +import logging +import os +import tempfile +import zipfile +from dataclasses import dataclass, field + +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.future import select + +from app.db import Chunk, Document, Folder +from 
app.services.folder_service import get_folder_subtree_ids + +logger = logging.getLogger(__name__) + + +def _sanitize_filename(title: str) -> str: + safe = "".join(c if c.isalnum() or c in " -_." else "_" for c in title).strip() + return safe[:80] or "document" + + +def _build_folder_path_map(folders: list[Folder]) -> dict[int, str]: + """Build a mapping of folder_id -> full path string (e.g. 'Research/AI').""" + id_to_folder = {f.id: f for f in folders} + cache: dict[int, str] = {} + + def resolve(folder_id: int) -> str: + if folder_id in cache: + return cache[folder_id] + folder = id_to_folder[folder_id] + if folder.parent_id is None or folder.parent_id not in id_to_folder: + cache[folder_id] = folder.name + else: + cache[folder_id] = f"{resolve(folder.parent_id)}/{folder.name}" + return cache[folder_id] + + for f in folders: + resolve(f.id) + + return cache + + +async def _get_document_markdown( + session: AsyncSession, document: Document +) -> str | None: + """Resolve markdown content using the 3-tier fallback: + 1. source_markdown 2. blocknote_document conversion 3. chunk concatenation + """ + if document.source_markdown is not None: + return document.source_markdown + + if document.blocknote_document: + from app.utils.blocknote_to_markdown import blocknote_to_markdown + + md = blocknote_to_markdown(document.blocknote_document) + if md: + return md + + chunk_result = await session.execute( + select(Chunk.content) + .filter(Chunk.document_id == document.id) + .order_by(Chunk.id) + ) + chunks = chunk_result.scalars().all() + if chunks: + return "\n\n".join(chunks) + + return None + + +@dataclass +class ExportResult: + zip_path: str + export_name: str + zip_size: int + skipped_docs: list[str] = field(default_factory=list) + + +async def build_export_zip( + session: AsyncSession, + search_space_id: int, + folder_id: int | None = None, +) -> ExportResult: + """Build a ZIP archive of markdown documents preserving folder structure. 
+ + Returns an ExportResult with the path to the temp ZIP file. + The caller is responsible for streaming and cleaning up the file. + + Raises ValueError if folder_id is provided but not found. + """ + if folder_id is not None: + folder = await session.get(Folder, folder_id) + if not folder or folder.search_space_id != search_space_id: + raise ValueError("Folder not found") + target_folder_ids = set(await get_folder_subtree_ids(session, folder_id)) + else: + target_folder_ids = None + + folder_query = select(Folder).where(Folder.search_space_id == search_space_id) + if target_folder_ids is not None: + folder_query = folder_query.where(Folder.id.in_(target_folder_ids)) + folder_result = await session.execute(folder_query) + folders = list(folder_result.scalars().all()) + + folder_path_map = _build_folder_path_map(folders) + + doc_query = select(Document).where(Document.search_space_id == search_space_id) + if target_folder_ids is not None: + doc_query = doc_query.where(Document.folder_id.in_(target_folder_ids)) + doc_result = await session.execute(doc_query) + documents = list(doc_result.scalars().all()) + + fd, tmp_path = tempfile.mkstemp(suffix=".zip") + os.close(fd) + + try: + used_paths: dict[str, int] = {} + skipped_docs: list[str] = [] + + with zipfile.ZipFile(tmp_path, "w", zipfile.ZIP_DEFLATED) as zf: + for doc in documents: + status = doc.status or {} + state = status.get("state", "ready") if isinstance(status, dict) else "ready" + if state in ("pending", "processing"): + skipped_docs.append(doc.title or "Untitled") + continue + + markdown = await _get_document_markdown(session, doc) + if not markdown or not markdown.strip(): + continue + + if doc.folder_id and doc.folder_id in folder_path_map: + dir_path = folder_path_map[doc.folder_id] + else: + dir_path = "" + + base_name = _sanitize_filename(doc.title or "Untitled") + file_path = f"{dir_path}/{base_name}.md" if dir_path else f"{base_name}.md" + + if file_path in used_paths: + used_paths[file_path] += 1 
+ suffix = used_paths[file_path] + file_path = ( + f"{dir_path}/{base_name}_{suffix}.md" + if dir_path + else f"{base_name}_{suffix}.md" + ) + else: + used_paths[file_path] = 1 + + zf.writestr(file_path, markdown) + + export_name = "knowledge-base" + if folder_id is not None and folder_id in folder_path_map: + export_name = _sanitize_filename(folder_path_map[folder_id].split("/")[0]) + + return ExportResult( + zip_path=tmp_path, + export_name=export_name, + zip_size=os.path.getsize(tmp_path), + skipped_docs=skipped_docs, + ) + + except Exception: + if os.path.exists(tmp_path): + os.unlink(tmp_path) + raise From c38239a9953375b867da70dc01f3dbd7676edd6b Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 12:19:04 +0200 Subject: [PATCH 05/29] feat: wire KB export button in sidebar --- .../layout/ui/sidebar/DocumentsSidebar.tsx | 41 ++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx index db80c8d8d..0f925af33 100644 --- a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx +++ b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx @@ -406,6 +406,45 @@ export function DocumentsSidebar({ setFolderPickerOpen(true); }, []); + const [isExportingKB, setIsExportingKB] = useState(false); + + const handleExportKB = useCallback(async () => { + if (isExportingKB) return; + setIsExportingKB(true); + try { + const response = await authenticatedFetch( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/export`, + { method: "GET" } + ); + if (!response.ok) { + const errorData = await response.json().catch(() => ({ detail: "Export failed" })); + throw new Error(errorData.detail || "Export failed"); + } + + const skipped = response.headers.get("X-Skipped-Documents"); + if (skipped && Number(skipped) > 0) { + toast.warning(`${skipped} document(s) were skipped 
(still processing)`); + } + + const blob = await response.blob(); + const url = URL.createObjectURL(blob); + const a = document.createElement("a"); + a.href = url; + a.download = "knowledge-base.zip"; + document.body.appendChild(a); + a.click(); + document.body.removeChild(a); + URL.revokeObjectURL(url); + + toast.success("Knowledge base exported"); + } catch (err) { + console.error("KB export failed:", err); + toast.error(err instanceof Error ? err.message : "Export failed"); + } finally { + setIsExportingKB(false); + } + }, [searchSpaceId, isExportingKB]); + const handleExportDocument = useCallback( async (doc: DocumentNodeDoc, format: string) => { const safeTitle = @@ -800,7 +839,7 @@ export function DocumentsSidebar({ onToggleType={onToggleType} activeTypes={activeTypes} onCreateFolder={() => handleCreateFolder(null)} - onExportKB={() => toast("Export KB clicked (placeholder)")} + onExportKB={handleExportKB} />
From 89f210bf7e99820319aa24053bbc142638f10a96 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 12:20:49 +0200 Subject: [PATCH 06/29] feat: add folder-level export to context menu --- .../components/documents/FolderNode.tsx | 20 +++++++++ .../components/documents/FolderTreeView.tsx | 3 ++ .../layout/ui/sidebar/DocumentsSidebar.tsx | 42 +++++++++++++++++++ 3 files changed, 65 insertions(+) diff --git a/surfsense_web/components/documents/FolderNode.tsx b/surfsense_web/components/documents/FolderNode.tsx index 7f75f8abf..a1b437983 100644 --- a/surfsense_web/components/documents/FolderNode.tsx +++ b/surfsense_web/components/documents/FolderNode.tsx @@ -4,6 +4,7 @@ import { AlertCircle, ChevronDown, ChevronRight, + Download, Eye, EyeOff, Folder, @@ -80,6 +81,7 @@ interface FolderNodeProps { isWatched?: boolean; onRescan?: (folder: FolderDisplay) => void | Promise; onStopWatching?: (folder: FolderDisplay) => void; + onExportFolder?: (folder: FolderDisplay) => void; } function getDropZone( @@ -120,6 +122,7 @@ export const FolderNode = React.memo(function FolderNode({ isWatched, onRescan, onStopWatching, + onExportFolder, }: FolderNodeProps) { const [renameValue, setRenameValue] = useState(folder.name); const inputRef = useRef(null); @@ -408,6 +411,17 @@ export const FolderNode = React.memo(function FolderNode({ Move to... + {onExportFolder && ( + { + e.stopPropagation(); + onExportFolder(folder); + }} + > + + Export folder + + )} { e.stopPropagation(); @@ -449,6 +463,12 @@ export const FolderNode = React.memo(function FolderNode({ Move to... 
+ {onExportFolder && ( + onExportFolder(folder)}> + + Export folder + + )} onDelete(folder)}> Delete diff --git a/surfsense_web/components/documents/FolderTreeView.tsx b/surfsense_web/components/documents/FolderTreeView.tsx index 6eb53da50..4988e87e7 100644 --- a/surfsense_web/components/documents/FolderTreeView.tsx +++ b/surfsense_web/components/documents/FolderTreeView.tsx @@ -44,6 +44,7 @@ interface FolderTreeViewProps { watchedFolderIds?: Set; onRescanFolder?: (folder: FolderDisplay) => void; onStopWatchingFolder?: (folder: FolderDisplay) => void; + onExportFolder?: (folder: FolderDisplay) => void; } function groupBy(items: T[], keyFn: (item: T) => string | number): Record { @@ -81,6 +82,7 @@ export function FolderTreeView({ watchedFolderIds, onRescanFolder, onStopWatchingFolder, + onExportFolder, }: FolderTreeViewProps) { const foldersByParent = useMemo(() => groupBy(folders, (f) => f.parentId ?? "root"), [folders]); @@ -259,6 +261,7 @@ export function FolderTreeView({ isWatched={watchedFolderIds?.has(f.id)} onRescan={onRescanFolder} onStopWatching={onStopWatchingFolder} + onExportFolder={onExportFolder} /> ); diff --git a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx index 0f925af33..853aea641 100644 --- a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx +++ b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx @@ -445,6 +445,47 @@ export function DocumentsSidebar({ } }, [searchSpaceId, isExportingKB]); + const handleExportFolder = useCallback( + async (folder: FolderDisplay) => { + try { + const response = await authenticatedFetch( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/export?folder_id=${folder.id}`, + { method: "GET" } + ); + if (!response.ok) { + const errorData = await response.json().catch(() => ({ detail: "Export failed" })); + throw new Error(errorData.detail || "Export failed"); + } + + const 
skipped = response.headers.get("X-Skipped-Documents"); + if (skipped && Number(skipped) > 0) { + toast.warning(`${skipped} document(s) were skipped (still processing)`); + } + + const blob = await response.blob(); + const safeName = + folder.name + .replace(/[^a-zA-Z0-9 _-]/g, "_") + .trim() + .slice(0, 80) || "folder"; + const url = URL.createObjectURL(blob); + const a = document.createElement("a"); + a.href = url; + a.download = `${safeName}.zip`; + document.body.appendChild(a); + a.click(); + document.body.removeChild(a); + URL.revokeObjectURL(url); + + toast.success(`Folder "${folder.name}" exported`); + } catch (err) { + console.error("Folder export failed:", err); + toast.error(err instanceof Error ? err.message : "Export failed"); + } + }, + [searchSpaceId] + ); + const handleExportDocument = useCallback( async (doc: DocumentNodeDoc, format: string) => { const safeTitle = @@ -895,6 +936,7 @@ export function DocumentsSidebar({ watchedFolderIds={watchedFolderIds} onRescanFolder={handleRescanFolder} onStopWatchingFolder={handleStopWatching} + onExportFolder={handleExportFolder} /> From e7107b751dd562eb64dc0f5535219210e662f4ff Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 13:01:23 +0200 Subject: [PATCH 07/29] fix: strip folder prefix from filename in folder upload --- surfsense_backend/app/routes/documents_routes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py index 53312c647..8093084f0 100644 --- a/surfsense_backend/app/routes/documents_routes.py +++ b/surfsense_backend/app/routes/documents_routes.py @@ -1565,7 +1565,8 @@ async def folder_upload( async def _read_and_save(file: UploadFile, idx: int) -> dict: content = await file.read() - filename = file.filename or rel_paths[idx].split("/")[-1] + raw_name = file.filename or rel_paths[idx] + filename = raw_name.split("/")[-1] def _write_temp() -> str: with 
tempfile.NamedTemporaryFile( From 7a7792fc799756b3555aaa943c5462b1629c3fe0 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 13:11:43 +0200 Subject: [PATCH 08/29] feat: warn before export when documents are processing --- .../layout/ui/sidebar/DocumentsSidebar.tsx | 171 +++++++++++++----- 1 file changed, 123 insertions(+), 48 deletions(-) diff --git a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx index 853aea641..ef25d3056 100644 --- a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx +++ b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx @@ -407,35 +407,54 @@ export function DocumentsSidebar({ }, []); const [isExportingKB, setIsExportingKB] = useState(false); + const [exportWarningOpen, setExportWarningOpen] = useState(false); + const [exportWarningContext, setExportWarningContext] = useState<{ + type: "kb" | "folder"; + folder?: FolderDisplay; + pendingCount: number; + } | null>(null); + + const pendingDocuments = useMemo( + () => + treeDocuments.filter( + (d) => d.status?.state === "pending" || d.status?.state === "processing" + ), + [treeDocuments] + ); + + const doExport = useCallback(async (url: string, downloadName: string) => { + const response = await authenticatedFetch(url, { method: "GET" }); + if (!response.ok) { + const errorData = await response.json().catch(() => ({ detail: "Export failed" })); + throw new Error(errorData.detail || "Export failed"); + } + + const blob = await response.blob(); + const blobUrl = URL.createObjectURL(blob); + const a = document.createElement("a"); + a.href = blobUrl; + a.download = downloadName; + document.body.appendChild(a); + a.click(); + document.body.removeChild(a); + URL.revokeObjectURL(blobUrl); + }, []); const handleExportKB = useCallback(async () => { if (isExportingKB) return; + + if (pendingDocuments.length > 0) { + setExportWarningContext({ type: "kb", pendingCount: 
pendingDocuments.length }); + setExportWarningOpen(true); + return; + } + setIsExportingKB(true); try { - const response = await authenticatedFetch( + await doExport( `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/export`, - { method: "GET" } + "knowledge-base.zip" ); - if (!response.ok) { - const errorData = await response.json().catch(() => ({ detail: "Export failed" })); - throw new Error(errorData.detail || "Export failed"); - } - - const skipped = response.headers.get("X-Skipped-Documents"); - if (skipped && Number(skipped) > 0) { - toast.warning(`${skipped} document(s) were skipped (still processing)`); - } - - const blob = await response.blob(); - const url = URL.createObjectURL(blob); - const a = document.createElement("a"); - a.href = url; - a.download = "knowledge-base.zip"; - document.body.appendChild(a); - a.click(); - document.body.removeChild(a); - URL.revokeObjectURL(url); - toast.success("Knowledge base exported"); } catch (err) { console.error("KB export failed:", err); @@ -443,47 +462,76 @@ export function DocumentsSidebar({ } finally { setIsExportingKB(false); } - }, [searchSpaceId, isExportingKB]); + }, [searchSpaceId, isExportingKB, pendingDocuments.length, doExport]); + + const handleExportWarningConfirm = useCallback(async () => { + setExportWarningOpen(false); + const ctx = exportWarningContext; + if (!ctx) return; + + if (ctx.type === "kb") { + setIsExportingKB(true); + try { + await doExport( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/export`, + "knowledge-base.zip" + ); + toast.success("Knowledge base exported"); + } catch (err) { + console.error("KB export failed:", err); + toast.error(err instanceof Error ? 
err.message : "Export failed"); + } finally { + setIsExportingKB(false); + } + } else if (ctx.type === "folder" && ctx.folder) { + try { + const safeName = + ctx.folder.name + .replace(/[^a-zA-Z0-9 _-]/g, "_") + .trim() + .slice(0, 80) || "folder"; + await doExport( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/export?folder_id=${ctx.folder.id}`, + `${safeName}.zip` + ); + toast.success(`Folder "${ctx.folder.name}" exported`); + } catch (err) { + console.error("Folder export failed:", err); + toast.error(err instanceof Error ? err.message : "Export failed"); + } + } + setExportWarningContext(null); + }, [exportWarningContext, searchSpaceId, doExport]); const handleExportFolder = useCallback( async (folder: FolderDisplay) => { + if (pendingDocuments.length > 0) { + setExportWarningContext({ + type: "folder", + folder, + pendingCount: pendingDocuments.length, + }); + setExportWarningOpen(true); + return; + } + try { - const response = await authenticatedFetch( - `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/export?folder_id=${folder.id}`, - { method: "GET" } - ); - if (!response.ok) { - const errorData = await response.json().catch(() => ({ detail: "Export failed" })); - throw new Error(errorData.detail || "Export failed"); - } - - const skipped = response.headers.get("X-Skipped-Documents"); - if (skipped && Number(skipped) > 0) { - toast.warning(`${skipped} document(s) were skipped (still processing)`); - } - - const blob = await response.blob(); const safeName = folder.name .replace(/[^a-zA-Z0-9 _-]/g, "_") .trim() .slice(0, 80) || "folder"; - const url = URL.createObjectURL(blob); - const a = document.createElement("a"); - a.href = url; - a.download = `${safeName}.zip`; - document.body.appendChild(a); - a.click(); - document.body.removeChild(a); - URL.revokeObjectURL(url); - + await doExport( + 
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/export?folder_id=${folder.id}`, + `${safeName}.zip` + ); toast.success(`Folder "${folder.name}" exported`); } catch (err) { console.error("Folder export failed:", err); toast.error(err instanceof Error ? err.message : "Export failed"); } }, - [searchSpaceId] + [searchSpaceId, pendingDocuments.length, doExport] ); const handleExportDocument = useCallback( @@ -1015,6 +1063,33 @@ export function DocumentsSidebar({ + + { + if (!open) { + setExportWarningOpen(false); + setExportWarningContext(null); + } + }} + > + + + Some documents are still processing + + {exportWarningContext?.pendingCount} document + {exportWarningContext?.pendingCount !== 1 ? "s are" : " is"} currently being processed + and will be excluded from the export. Do you want to continue? + + + + Cancel + + Export anyway + + + + ); From b5f6e44fc3a0e34782f51d12b4a9e41b4270e878 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 13:39:36 +0200 Subject: [PATCH 09/29] security: sanitize folder names in ZIP export paths --- surfsense_backend/app/services/export_service.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/surfsense_backend/app/services/export_service.py b/surfsense_backend/app/services/export_service.py index 49f1a127a..0804e6042 100644 --- a/surfsense_backend/app/services/export_service.py +++ b/surfsense_backend/app/services/export_service.py @@ -29,10 +29,11 @@ def _build_folder_path_map(folders: list[Folder]) -> dict[int, str]: if folder_id in cache: return cache[folder_id] folder = id_to_folder[folder_id] + safe_name = _sanitize_filename(folder.name) if folder.parent_id is None or folder.parent_id not in id_to_folder: - cache[folder_id] = folder.name + cache[folder_id] = safe_name else: - cache[folder_id] = f"{resolve(folder.parent_id)}/{folder.name}" + cache[folder_id] = f"{resolve(folder.parent_id)}/{safe_name}" return cache[folder_id] for f in folders: From 
a81fff299ac57e924ee9d6b40fdc87858a3e2a42 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 13:40:43 +0200 Subject: [PATCH 10/29] fix: scope pending doc warning to folder subtree on folder export --- .../layout/ui/sidebar/DocumentsSidebar.tsx | 26 ++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx index ef25d3056..041f03ca7 100644 --- a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx +++ b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx @@ -503,13 +503,33 @@ export function DocumentsSidebar({ setExportWarningContext(null); }, [exportWarningContext, searchSpaceId, doExport]); + const getPendingCountInSubtree = useCallback( + (folderId: number): number => { + const subtreeIds = new Set(); + function collect(id: number) { + subtreeIds.add(id); + for (const child of foldersByParent[String(id)] ?? []) { + collect(child.id); + } + } + collect(folderId); + return treeDocuments.filter( + (d) => + subtreeIds.has(d.folderId ?? -1) && + (d.status?.state === "pending" || d.status?.state === "processing") + ).length; + }, + [foldersByParent, treeDocuments] + ); + const handleExportFolder = useCallback( async (folder: FolderDisplay) => { - if (pendingDocuments.length > 0) { + const folderPendingCount = getPendingCountInSubtree(folder.id); + if (folderPendingCount > 0) { setExportWarningContext({ type: "folder", folder, - pendingCount: pendingDocuments.length, + pendingCount: folderPendingCount, }); setExportWarningOpen(true); return; @@ -531,7 +551,7 @@ export function DocumentsSidebar({ toast.error(err instanceof Error ? 
err.message : "Export failed"); } }, - [searchSpaceId, pendingDocuments.length, doExport] + [searchSpaceId, getPendingCountInSubtree, doExport] ); const handleExportDocument = useCallback( From 7851db792810aebe597a0f1d588f66a98a278753 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 13:42:57 +0200 Subject: [PATCH 11/29] fix: add i18n keys for folder upload strings --- surfsense_web/components/sources/DocumentUploadTab.tsx | 6 +++--- surfsense_web/messages/en.json | 6 +++++- surfsense_web/messages/es.json | 6 +++++- surfsense_web/messages/hi.json | 6 +++++- surfsense_web/messages/pt.json | 6 +++++- surfsense_web/messages/zh.json | 6 +++++- 6 files changed, 28 insertions(+), 8 deletions(-) diff --git a/surfsense_web/components/sources/DocumentUploadTab.tsx b/surfsense_web/components/sources/DocumentUploadTab.tsx index 5fc8e3fd3..0f8ac298d 100644 --- a/surfsense_web/components/sources/DocumentUploadTab.tsx +++ b/surfsense_web/components/sources/DocumentUploadTab.tsx @@ -540,7 +540,7 @@ export function DocumentUploadTab({

- {isElectron ? "Select files or folder" : "Tap to select files or folder"} + {isElectron ? t("select_files_or_folder") : t("tap_select_files_or_folder")}

{t("file_size_limit")}

@@ -672,7 +672,7 @@ export function DocumentUploadTab({ {isAnyUploading && (
- {folderUpload ? "Uploading folder…" : t("uploading_files")} + {folderUpload ? t("uploading_folder") : t("uploading_files")} {Math.round(uploadProgress)}%
@@ -702,7 +702,7 @@ export function DocumentUploadTab({ ) : ( {folderUpload - ? `Upload Folder (${fileCount} files)` + ? t("upload_folder_button", { count: fileCount }) : t("upload_button", { count: fileCount })} )} diff --git a/surfsense_web/messages/en.json b/surfsense_web/messages/en.json index a3a4e8853..cef48663f 100644 --- a/surfsense_web/messages/en.json +++ b/surfsense_web/messages/en.json @@ -396,7 +396,11 @@ "supported_file_types": "Supported File Types", "file_too_large": "File Too Large", "file_too_large_desc": "\"{name}\" exceeds the {maxMB}MB per-file limit.", - "no_supported_files_in_folder": "No supported file types found in the selected folder." + "no_supported_files_in_folder": "No supported file types found in the selected folder.", + "uploading_folder": "Uploading folder…", + "upload_folder_button": "Upload Folder ({count} {count, plural, one {file} other {files}})", + "select_files_or_folder": "Select files or folder", + "tap_select_files_or_folder": "Tap to select files or folder" }, "add_webpage": { "title": "Add Webpages for Crawling", diff --git a/surfsense_web/messages/es.json b/surfsense_web/messages/es.json index fa620e271..88154d1fc 100644 --- a/surfsense_web/messages/es.json +++ b/surfsense_web/messages/es.json @@ -396,7 +396,11 @@ "supported_file_types": "Tipos de archivo soportados", "file_too_large": "Archivo demasiado grande", "file_too_large_desc": "\"{name}\" excede el límite de {maxMB} MB por archivo.", - "no_supported_files_in_folder": "No se encontraron tipos de archivo compatibles en la carpeta seleccionada." 
+ "no_supported_files_in_folder": "No se encontraron tipos de archivo compatibles en la carpeta seleccionada.", + "uploading_folder": "Subiendo carpeta…", + "upload_folder_button": "Subir carpeta ({count} {count, plural, one {archivo} other {archivos}})", + "select_files_or_folder": "Seleccionar archivos o carpeta", + "tap_select_files_or_folder": "Toca para seleccionar archivos o carpeta" }, "add_webpage": { "title": "Agregar páginas web para rastreo", diff --git a/surfsense_web/messages/hi.json b/surfsense_web/messages/hi.json index faeb4cb94..988894714 100644 --- a/surfsense_web/messages/hi.json +++ b/surfsense_web/messages/hi.json @@ -396,7 +396,11 @@ "supported_file_types": "समर्थित फ़ाइल प्रकार", "file_too_large": "फ़ाइल बहुत बड़ी है", "file_too_large_desc": "\"{name}\" प्रति फ़ाइल {maxMB}MB की सीमा से अधिक है।", - "no_supported_files_in_folder": "चयनित फ़ोल्डर में कोई समर्थित फ़ाइल प्रकार नहीं मिला।" + "no_supported_files_in_folder": "चयनित फ़ोल्डर में कोई समर्थित फ़ाइल प्रकार नहीं मिला।", + "uploading_folder": "फ़ोल्डर अपलोड हो रहा है…", + "upload_folder_button": "फ़ोल्डर अपलोड करें ({count} {count, plural, one {फ़ाइल} other {फ़ाइलें}})", + "select_files_or_folder": "फ़ाइलें या फ़ोल्डर चुनें", + "tap_select_files_or_folder": "फ़ाइलें या फ़ोल्डर चुनने के लिए टैप करें" }, "add_webpage": { "title": "क्रॉलिंग के लिए वेबपेज जोड़ें", diff --git a/surfsense_web/messages/pt.json b/surfsense_web/messages/pt.json index 0bed7c6cc..b34546da9 100644 --- a/surfsense_web/messages/pt.json +++ b/surfsense_web/messages/pt.json @@ -396,7 +396,11 @@ "supported_file_types": "Tipos de arquivo suportados", "file_too_large": "Arquivo muito grande", "file_too_large_desc": "\"{name}\" excede o limite de {maxMB} MB por arquivo.", - "no_supported_files_in_folder": "Nenhum tipo de arquivo suportado encontrado na pasta selecionada." 
+ "no_supported_files_in_folder": "Nenhum tipo de arquivo suportado encontrado na pasta selecionada.", + "uploading_folder": "Enviando pasta…", + "upload_folder_button": "Enviar pasta ({count} {count, plural, one {arquivo} other {arquivos}})", + "select_files_or_folder": "Selecionar arquivos ou pasta", + "tap_select_files_or_folder": "Toque para selecionar arquivos ou pasta" }, "add_webpage": { "title": "Adicionar páginas web para rastreamento", diff --git a/surfsense_web/messages/zh.json b/surfsense_web/messages/zh.json index 0d4f7e1c9..a42e59f6f 100644 --- a/surfsense_web/messages/zh.json +++ b/surfsense_web/messages/zh.json @@ -380,7 +380,11 @@ "supported_file_types": "支持的文件类型", "file_too_large": "文件过大", "file_too_large_desc": "\"{name}\" 超过了每个文件 {maxMB}MB 的限制。", - "no_supported_files_in_folder": "所选文件夹中没有找到支持的文件类型。" + "no_supported_files_in_folder": "所选文件夹中没有找到支持的文件类型。", + "uploading_folder": "正在上传文件夹…", + "upload_folder_button": "上传文件夹({count}个文件)", + "select_files_or_folder": "选择文件或文件夹", + "tap_select_files_or_folder": "点击选择文件或文件夹" }, "add_webpage": { "title": "添加网页爬取", From 1af5725bd1759416cbff20a91d9fd98bc775b017 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 13:51:32 +0200 Subject: [PATCH 12/29] fix: track dedup'd filename in used_paths to prevent collisions --- surfsense_backend/app/services/export_service.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/surfsense_backend/app/services/export_service.py b/surfsense_backend/app/services/export_service.py index 0804e6042..2d36bfaab 100644 --- a/surfsense_backend/app/services/export_service.py +++ b/surfsense_backend/app/services/export_service.py @@ -147,8 +147,7 @@ async def build_export_zip( if dir_path else f"{base_name}_{suffix}.md" ) - else: - used_paths[file_path] = 1 + used_paths[file_path] = used_paths.get(file_path, 0) + 1 zf.writestr(file_path, markdown) From 78fa2d926a94cefdf5bfe25163ba861cf2914074 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 
14:00:25 +0200 Subject: [PATCH 13/29] feat: show spinner on export button during export --- .../components/documents/DocumentsFilters.tsx | 15 ++++++++++++--- .../layout/ui/sidebar/DocumentsSidebar.tsx | 7 +++++++ 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/surfsense_web/components/documents/DocumentsFilters.tsx b/surfsense_web/components/documents/DocumentsFilters.tsx index abd65637c..703c9c3b4 100644 --- a/surfsense_web/components/documents/DocumentsFilters.tsx +++ b/surfsense_web/components/documents/DocumentsFilters.tsx @@ -1,6 +1,6 @@ "use client"; -import { Download, FolderPlus, ListFilter, Search, Upload, X } from "lucide-react"; +import { Download, FolderPlus, ListFilter, Loader2, Search, Upload, X } from "lucide-react"; import { useTranslations } from "next-intl"; import React, { useCallback, useMemo, useRef, useState } from "react"; import { useDocumentUploadDialog } from "@/components/assistant-ui/document-upload-popup"; @@ -21,6 +21,7 @@ export function DocumentsFilters({ activeTypes, onCreateFolder, onExportKB, + isExporting, }: { typeCounts: Partial>; onSearch: (v: string) => void; @@ -29,6 +30,7 @@ export function DocumentsFilters({ activeTypes: DocumentTypeEnum[]; onCreateFolder?: () => void; onExportKB?: () => void; + isExporting?: boolean; }) { const t = useTranslations("documents"); const id = React.useId(); @@ -91,16 +93,23 @@ export function DocumentsFilters({ { e.preventDefault(); onExportKB(); }} > - + {isExporting ? ( + + ) : ( + + )} - Export knowledge base + + {isExporting ? 
"Exporting…" : "Export knowledge base"} + )} diff --git a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx index 041f03ca7..20b25a2d2 100644 --- a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx +++ b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx @@ -484,6 +484,7 @@ export function DocumentsSidebar({ setIsExportingKB(false); } } else if (ctx.type === "folder" && ctx.folder) { + setIsExportingKB(true); try { const safeName = ctx.folder.name @@ -498,6 +499,8 @@ export function DocumentsSidebar({ } catch (err) { console.error("Folder export failed:", err); toast.error(err instanceof Error ? err.message : "Export failed"); + } finally { + setIsExportingKB(false); } } setExportWarningContext(null); @@ -535,6 +538,7 @@ export function DocumentsSidebar({ return; } + setIsExportingKB(true); try { const safeName = folder.name @@ -549,6 +553,8 @@ export function DocumentsSidebar({ } catch (err) { console.error("Folder export failed:", err); toast.error(err instanceof Error ? err.message : "Export failed"); + } finally { + setIsExportingKB(false); } }, [searchSpaceId, getPendingCountInSubtree, doExport] @@ -949,6 +955,7 @@ export function DocumentsSidebar({ activeTypes={activeTypes} onCreateFolder={() => handleCreateFolder(null)} onExportKB={handleExportKB} + isExporting={isExportingKB} />
From 7e90a8ed3c598de0fe27ef0c007bbd5fa9e38bf0 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 14:33:33 +0200 Subject: [PATCH 14/29] Route uploaded images to vision LLM with document-parser fallback --- .../app/etl_pipeline/etl_pipeline_service.py | 25 ++++++++ .../app/etl_pipeline/file_classifier.py | 10 ++- .../app/etl_pipeline/parsers/vision_llm.py | 37 +++++++++++ .../document_processors/file_processors.py | 11 +++- .../app/utils/file_extensions.py | 23 +++++++ .../etl_pipeline/test_etl_pipeline_service.py | 61 ++++++++++++++++++- .../tests/unit/utils/test_file_extensions.py | 37 +++++++++++ 7 files changed, 199 insertions(+), 5 deletions(-) create mode 100644 surfsense_backend/app/etl_pipeline/parsers/vision_llm.py diff --git a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py index fbd2e4e73..5f1495cdb 100644 --- a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py +++ b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py @@ -15,6 +15,9 @@ from app.etl_pipeline.parsers.plaintext import read_plaintext class EtlPipelineService: """Single pipeline for extracting markdown from files. 
All callers use this.""" + def __init__(self, *, vision_llm=None): + self._vision_llm = vision_llm + async def extract(self, request: EtlRequest) -> EtlResult: category = classify_file(request.filename) @@ -47,6 +50,28 @@ class EtlPipelineService: content_type="audio", ) + if category == FileCategory.IMAGE: + return await self._extract_image(request) + + return await self._extract_document(request) + + async def _extract_image(self, request: EtlRequest) -> EtlResult: + if self._vision_llm: + from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm + + content = await parse_with_vision_llm( + request.file_path, request.filename, self._vision_llm + ) + return EtlResult( + markdown_content=content, + etl_service="VISION_LLM", + content_type="image", + ) + + logging.info( + "No vision LLM provided, falling back to document parser for %s", + request.filename, + ) return await self._extract_document(request) async def _extract_document(self, request: EtlRequest) -> EtlResult: diff --git a/surfsense_backend/app/etl_pipeline/file_classifier.py b/surfsense_backend/app/etl_pipeline/file_classifier.py index 4e690bcdc..120369a27 100644 --- a/surfsense_backend/app/etl_pipeline/file_classifier.py +++ b/surfsense_backend/app/etl_pipeline/file_classifier.py @@ -3,6 +3,7 @@ from pathlib import PurePosixPath from app.utils.file_extensions import ( DOCUMENT_EXTENSIONS, + IMAGE_EXTENSIONS, get_document_extensions_for_service, ) @@ -105,6 +106,7 @@ class FileCategory(Enum): PLAINTEXT = "plaintext" AUDIO = "audio" DIRECT_CONVERT = "direct_convert" + IMAGE = "image" UNSUPPORTED = "unsupported" DOCUMENT = "document" @@ -117,6 +119,8 @@ def classify_file(filename: str) -> FileCategory: return FileCategory.AUDIO if suffix in DIRECT_CONVERT_EXTENSIONS: return FileCategory.DIRECT_CONVERT + if suffix in IMAGE_EXTENSIONS: + return FileCategory.IMAGE if suffix in DOCUMENT_EXTENSIONS: return FileCategory.DOCUMENT return FileCategory.UNSUPPORTED @@ -126,12 +130,14 @@ def 
should_skip_for_service(filename: str, etl_service: str | None) -> bool: """Return True if *filename* cannot be processed by *etl_service*. Plaintext, audio, and direct-convert files are parser-agnostic and never - skipped. Document files are checked against the per-parser extension set. + skipped. Image and document files are checked against the per-parser + extension set (images fall back to the document parser when no vision LLM + is available, so the same service constraint applies). """ category = classify_file(filename) if category == FileCategory.UNSUPPORTED: return True - if category == FileCategory.DOCUMENT: + if category in (FileCategory.DOCUMENT, FileCategory.IMAGE): suffix = PurePosixPath(filename).suffix.lower() return suffix not in get_document_extensions_for_service(etl_service) return False diff --git a/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py b/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py new file mode 100644 index 000000000..e75f81c4b --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py @@ -0,0 +1,37 @@ +import base64 +import mimetypes + +from langchain_core.messages import HumanMessage + +_PROMPT = ( + "Analyze this image thoroughly and produce a detailed markdown description.\n\n" + "Include:\n" + "- All visible text, transcribed verbatim\n" + "- Description of diagrams, charts, tables, or visual structures\n" + "- Key subjects, objects, or scenes depicted\n\n" + "Output only the markdown content, no preamble." 
+)
+
+
+def _image_to_data_url(file_path: str) -> str:
+    mime_type, _ = mimetypes.guess_type(file_path)
+    if not mime_type or not mime_type.startswith("image/"):
+        mime_type = "image/png"
+    with open(file_path, "rb") as f:
+        encoded = base64.b64encode(f.read()).decode("ascii")
+    return f"data:{mime_type};base64,{encoded}"
+
+
+async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
+    data_url = _image_to_data_url(file_path)
+    message = HumanMessage(
+        content=[
+            {"type": "text", "text": _PROMPT},
+            {"type": "image_url", "image_url": {"url": data_url}},
+        ]
+    )
+    response = await llm.ainvoke([message])
+    text = response.content if hasattr(response, "content") else str(response)
+    if not text or not text.strip():
+        raise ValueError(f"Vision LLM returned empty content for {filename}")
+    return text.strip()
diff --git a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py
index c765dbd87..9992231e0 100644
--- a/surfsense_backend/app/tasks/document_processors/file_processors.py
+++ b/surfsense_backend/app/tasks/document_processors/file_processors.py
@@ -333,6 +333,7 @@ async def process_file_in_background(
 async def _extract_file_content(
     file_path: str,
     filename: str,
+    search_space_id: int,
     session: AsyncSession,
     user_id: str,
     task_logger: TaskLoggingService,
@@ -360,6 +361,7 @@ async def _extract_file_content(
         FileCategory.PLAINTEXT: "Reading file",
         FileCategory.DIRECT_CONVERT: "Converting file",
         FileCategory.AUDIO: "Transcribing audio",
+        FileCategory.IMAGE: "Analyzing image",
         FileCategory.UNSUPPORTED: "Unsupported file type",
         FileCategory.DOCUMENT: "Extracting content",
     }
@@ -383,7 +385,13 @@ async def _extract_file_content(
         estimated_pages = _estimate_pages_safe(page_limit_service, file_path)
         await page_limit_service.check_page_limit(user_id, estimated_pages)
 
-    result = await EtlPipelineService().extract(
+    vision_llm = None
+    if category == FileCategory.IMAGE:
+ from app.services.llm_service import get_vision_llm + + vision_llm = await get_vision_llm(session, search_space_id) + + result = await EtlPipelineService(vision_llm=vision_llm).extract( EtlRequest( file_path=file_path, filename=filename, @@ -439,6 +447,7 @@ async def process_file_in_background_with_document( markdown_content, etl_service = await _extract_file_content( file_path, filename, + search_space_id, session, user_id, task_logger, diff --git a/surfsense_backend/app/utils/file_extensions.py b/surfsense_backend/app/utils/file_extensions.py index 16ac585b7..e8be1b83a 100644 --- a/surfsense_backend/app/utils/file_extensions.py +++ b/surfsense_backend/app/utils/file_extensions.py @@ -7,10 +7,33 @@ Extensions already covered by PLAINTEXT_EXTENSIONS, AUDIO_EXTENSIONS, or DIRECT_CONVERT_EXTENSIONS in file_classifier are NOT repeated here -- these sets are exclusively for the "document" ETL path (Docling / LlamaParse / Unstructured). + +Image extensions intentionally remain in the per-parser sets for fallback +compatibility. IMAGE_EXTENSIONS is used only for routing classification. 
""" from pathlib import PurePosixPath +# --------------------------------------------------------------------------- +# Image extensions (used by file_classifier for routing to vision LLM) +# --------------------------------------------------------------------------- + +IMAGE_EXTENSIONS: frozenset[str] = frozenset( + { + ".png", + ".jpg", + ".jpeg", + ".gif", + ".bmp", + ".tiff", + ".tif", + ".webp", + ".svg", + ".heic", + ".heif", + } +) + # --------------------------------------------------------------------------- # Per-parser document extension sets (from official documentation) # --------------------------------------------------------------------------- diff --git a/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py b/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py index 9608b011d..4e1d603a3 100644 --- a/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py +++ b/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py @@ -549,8 +549,11 @@ def test_unsupported_extensions_classified_correctly(filename): ("doc.docx", "document"), ("slides.pptx", "document"), ("sheet.xlsx", "document"), - ("photo.png", "document"), - ("photo.jpg", "document"), + ("photo.png", "image"), + ("photo.jpg", "image"), + ("photo.webp", "image"), + ("photo.gif", "image"), + ("photo.heic", "image"), ("book.epub", "document"), ("letter.odt", "document"), ("readme.md", "plaintext"), @@ -680,3 +683,57 @@ async def test_extract_eml_with_docling_raises_unsupported(tmp_path, mocker): await EtlPipelineService().extract( EtlRequest(file_path=str(eml_file), filename="mail.eml") ) + + +# --------------------------------------------------------------------------- +# Image extraction via vision LLM +# --------------------------------------------------------------------------- + + +async def test_extract_image_with_vision_llm(tmp_path): + """An image file is analyzed by the vision LLM when provided.""" + from unittest.mock 
import AsyncMock, MagicMock + + img_file = tmp_path / "photo.png" + img_file.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 50) + + fake_response = MagicMock() + fake_response.content = "# A photo of a sunset over the ocean" + fake_llm = AsyncMock() + fake_llm.ainvoke.return_value = fake_response + + service = EtlPipelineService(vision_llm=fake_llm) + result = await service.extract( + EtlRequest(file_path=str(img_file), filename="photo.png") + ) + + assert result.markdown_content == "# A photo of a sunset over the ocean" + assert result.etl_service == "VISION_LLM" + assert result.content_type == "image" + fake_llm.ainvoke.assert_called_once() + + +async def test_extract_image_falls_back_to_document_without_vision_llm( + tmp_path, mocker +): + """Without a vision LLM, image files fall back to the document parser.""" + mocker.patch("app.config.config.ETL_SERVICE", "DOCLING") + + fake_docling = mocker.AsyncMock() + fake_docling.process_document.return_value = {"content": "# OCR text from image"} + mocker.patch( + "app.services.docling_service.create_docling_service", + return_value=fake_docling, + ) + + img_file = tmp_path / "scan.png" + img_file.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 50) + + service = EtlPipelineService() + result = await service.extract( + EtlRequest(file_path=str(img_file), filename="scan.png") + ) + + assert result.markdown_content == "# OCR text from image" + assert result.etl_service == "DOCLING" + assert result.content_type == "document" diff --git a/surfsense_backend/tests/unit/utils/test_file_extensions.py b/surfsense_backend/tests/unit/utils/test_file_extensions.py index 43dfef5f0..ccf5eb70f 100644 --- a/surfsense_backend/tests/unit/utils/test_file_extensions.py +++ b/surfsense_backend/tests/unit/utils/test_file_extensions.py @@ -154,3 +154,40 @@ def test_get_extensions_for_none_returns_union(): ) assert get_document_extensions_for_service(None) == DOCUMENT_EXTENSIONS + + +# 
--------------------------------------------------------------------------- +# IMAGE_EXTENSIONS +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "ext", + [ + ".png", + ".jpg", + ".jpeg", + ".gif", + ".bmp", + ".tiff", + ".tif", + ".webp", + ".svg", + ".heic", + ".heif", + ], +) +def test_image_extensions_contains_expected(ext): + from app.utils.file_extensions import IMAGE_EXTENSIONS + + assert ext in IMAGE_EXTENSIONS + + +def test_image_extensions_are_subset_of_document_extensions(): + """Image extensions used for routing should also be in DOCUMENT_EXTENSIONS for fallback.""" + from app.utils.file_extensions import DOCUMENT_EXTENSIONS, IMAGE_EXTENSIONS + + missing = IMAGE_EXTENSIONS - DOCUMENT_EXTENSIONS + assert not missing, ( + f"Image extensions missing from document sets (breaks fallback): {missing}" + ) From afd3c2cde20106085df0c2f5b8ff168620fd503d Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 14:50:24 +0200 Subject: [PATCH 15/29] Pass vision LLM through local folder indexer call chain --- .../local_folder_indexer.py | 38 ++++++++++++++----- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py index f503ff864..f88d313da 100644 --- a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py @@ -153,16 +153,16 @@ def scan_folder( return files -async def _read_file_content(file_path: str, filename: str) -> str: +async def _read_file_content(file_path: str, filename: str, *, vision_llm=None) -> str: """Read file content via the unified ETL pipeline. - All file types (plaintext, audio, direct-convert, document) are handled - by ``EtlPipelineService``. 
+ All file types (plaintext, audio, direct-convert, document, image) are + handled by ``EtlPipelineService``. """ from app.etl_pipeline.etl_document import EtlRequest from app.etl_pipeline.etl_pipeline_service import EtlPipelineService - result = await EtlPipelineService().extract( + result = await EtlPipelineService(vision_llm=vision_llm).extract( EtlRequest(file_path=file_path, filename=filename) ) return result.markdown_content @@ -199,12 +199,14 @@ async def _compute_file_content_hash( file_path: str, filename: str, search_space_id: int, + *, + vision_llm=None, ) -> tuple[str, str]: """Read a file (via ETL if needed) and compute its content hash. Returns (content_text, content_hash). """ - content = await _read_file_content(file_path, filename) + content = await _read_file_content(file_path, filename, vision_llm=vision_llm) return content, _content_hash(content, search_space_id) @@ -635,6 +637,10 @@ async def index_local_folder( page_limit_service = PageLimitService(session) + from app.services.llm_service import get_vision_llm + + vision_llm = await get_vision_llm(session, search_space_id) + # ================================================================ # PHASE 1: Pre-filter files (mtime / content-hash), version changed # ================================================================ @@ -704,7 +710,10 @@ async def index_local_folder( try: content, content_hash = await _compute_file_content_hash( - file_path_abs, file_info["relative_path"], search_space_id + file_path_abs, + file_info["relative_path"], + search_space_id, + vision_llm=vision_llm, ) except Exception as read_err: logger.warning(f"Could not read {file_path_abs}: {read_err}") @@ -738,7 +747,10 @@ async def index_local_folder( try: content, content_hash = await _compute_file_content_hash( - file_path_abs, file_info["relative_path"], search_space_id + file_path_abs, + file_info["relative_path"], + search_space_id, + vision_llm=vision_llm, ) except Exception as read_err: logger.warning(f"Could 
not read {file_path_abs}: {read_err}") @@ -1080,9 +1092,13 @@ async def _index_single_file( except PageLimitExceededError as e: return 0, 1, f"Page limit exceeded: {e}" + from app.services.llm_service import get_vision_llm + + vision_llm = await get_vision_llm(session, search_space_id) + try: content, content_hash = await _compute_file_content_hash( - str(full_path), full_path.name, search_space_id + str(full_path), full_path.name, search_space_id, vision_llm=vision_llm ) except Exception as e: return 0, 1, f"Could not read file: {e}" @@ -1300,6 +1316,10 @@ async def index_uploaded_files( pipeline = IndexingPipelineService(session) llm = await get_user_long_context_llm(session, user_id, search_space_id) + from app.services.llm_service import get_vision_llm + + vision_llm = await get_vision_llm(session, search_space_id) + indexed_count = 0 failed_count = 0 errors: list[str] = [] @@ -1347,7 +1367,7 @@ async def index_uploaded_files( try: content, content_hash = await _compute_file_content_hash( - temp_path, filename, search_space_id + temp_path, filename, search_space_id, vision_llm=vision_llm ) except Exception as e: logger.warning(f"Could not read {relative_path}: {e}") From caaec2e0a798a500b45a737b1bf7924ed12546db Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 14:56:18 +0200 Subject: [PATCH 16/29] Simplify vision LLM image description prompt --- surfsense_backend/app/etl_pipeline/parsers/vision_llm.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py b/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py index e75f81c4b..fb12a1e75 100644 --- a/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py +++ b/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py @@ -4,12 +4,9 @@ import mimetypes from langchain_core.messages import HumanMessage _PROMPT = ( - "Analyze this image thoroughly and produce a detailed markdown description.\n\n" - "Include:\n" - "- All 
visible text, transcribed verbatim\n" - "- Description of diagrams, charts, tables, or visual structures\n" - "- Key subjects, objects, or scenes depicted\n\n" - "Output only the markdown content, no preamble." + "Describe this image in markdown. " + "Transcribe any visible text verbatim. " + "Be concise but complete — let the image content guide the level of detail." ) From d6c4fb8938927c6376d196aad6be2bf0d501650f Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 15:11:24 +0200 Subject: [PATCH 17/29] Add try/except fallback in _extract_image for vision LLM failures --- .../app/etl_pipeline/etl_pipeline_service.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py index 5f1495cdb..56ade32fb 100644 --- a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py +++ b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py @@ -57,16 +57,23 @@ class EtlPipelineService: async def _extract_image(self, request: EtlRequest) -> EtlResult: if self._vision_llm: - from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm + try: + from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm - content = await parse_with_vision_llm( - request.file_path, request.filename, self._vision_llm - ) - return EtlResult( - markdown_content=content, - etl_service="VISION_LLM", - content_type="image", - ) + content = await parse_with_vision_llm( + request.file_path, request.filename, self._vision_llm + ) + return EtlResult( + markdown_content=content, + etl_service="VISION_LLM", + content_type="image", + ) + except Exception: + logging.warning( + "Vision LLM failed for %s, falling back to document parser", + request.filename, + exc_info=True, + ) logging.info( "No vision LLM provided, falling back to document parser for %s", From 71db53fc553a50e664d4ec5493960d86d9a52446 Mon Sep 17 00:00:00 2001 From: 
CREDO23 Date: Thu, 9 Apr 2026 15:17:08 +0200 Subject: [PATCH 18/29] Add 5MB file size guard before base64 encoding for vision LLM --- surfsense_backend/app/etl_pipeline/parsers/vision_llm.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py b/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py index fb12a1e75..bd39de71d 100644 --- a/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py +++ b/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py @@ -1,5 +1,6 @@ import base64 import mimetypes +import os from langchain_core.messages import HumanMessage @@ -9,8 +10,16 @@ _PROMPT = ( "Be concise but complete — let the image content guide the level of detail." ) +_MAX_IMAGE_BYTES = 5 * 1024 * 1024 # 5 MB (Anthropic Claude's limit, the most restrictive) + def _image_to_data_url(file_path: str) -> str: + file_size = os.path.getsize(file_path) + if file_size > _MAX_IMAGE_BYTES: + raise ValueError( + f"Image too large for vision LLM ({file_size / (1024 * 1024):.1f} MB, " + f"limit {_MAX_IMAGE_BYTES // (1024 * 1024)} MB): {file_path}" + ) mime_type, _ = mimetypes.guess_type(file_path) if not mime_type or not mime_type.startswith("image/"): mime_type = "image/png" From 55661bcde68b5b15b034b647ee414fde03aa29ef Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 15:21:32 +0200 Subject: [PATCH 19/29] Replace mimetypes fallback with explicit extension-to-MIME mapping --- .../app/etl_pipeline/parsers/vision_llm.py | 26 +++++++++++++++---- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py b/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py index bd39de71d..d3b778801 100644 --- a/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py +++ b/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py @@ -1,5 +1,4 @@ import base64 -import mimetypes import os from langchain_core.messages import HumanMessage @@ -10,7 +9,23 @@ 
_PROMPT = ( "Be concise but complete — let the image content guide the level of detail." ) -_MAX_IMAGE_BYTES = 5 * 1024 * 1024 # 5 MB (Anthropic Claude's limit, the most restrictive) +_MAX_IMAGE_BYTES = ( + 5 * 1024 * 1024 +) # 5 MB (Anthropic Claude's limit, the most restrictive) + +_EXT_TO_MIME: dict[str, str] = { + ".png": "image/png", + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + ".gif": "image/gif", + ".bmp": "image/bmp", + ".tiff": "image/tiff", + ".tif": "image/tiff", + ".webp": "image/webp", + ".svg": "image/svg+xml", + ".heic": "image/heic", + ".heif": "image/heif", +} def _image_to_data_url(file_path: str) -> str: @@ -20,9 +35,10 @@ def _image_to_data_url(file_path: str) -> str: f"Image too large for vision LLM ({file_size / (1024 * 1024):.1f} MB, " f"limit {_MAX_IMAGE_BYTES // (1024 * 1024)} MB): {file_path}" ) - mime_type, _ = mimetypes.guess_type(file_path) - if not mime_type or not mime_type.startswith("image/"): - mime_type = "image/png" + ext = os.path.splitext(file_path)[1].lower() + mime_type = _EXT_TO_MIME.get(ext) + if not mime_type: + raise ValueError(f"Unsupported image extension {ext!r}: {file_path}") with open(file_path, "rb") as f: encoded = base64.b64encode(f.read()).decode("ascii") return f"data:{mime_type};base64,{encoded}" From ff2a9c77f9eb6c52384c44ddfd344a9645abc49f Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 15:28:21 +0200 Subject: [PATCH 20/29] Pass vision_llm in legacy process_file_in_background path --- .../tasks/document_processors/file_processors.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py index 9992231e0..cd06657dc 100644 --- a/surfsense_backend/app/tasks/document_processors/file_processors.py +++ b/surfsense_backend/app/tasks/document_processors/file_processors.py @@ -118,9 +118,13 @@ async def _log_page_divergence( async def 
_process_non_document_upload(ctx: _ProcessingContext) -> Document | None: - """Extract content from a non-document file (plaintext/direct_convert/audio) via the unified ETL pipeline.""" + """Extract content from a non-document file (plaintext/direct_convert/audio/image) via the unified ETL pipeline.""" from app.etl_pipeline.etl_document import EtlRequest from app.etl_pipeline.etl_pipeline_service import EtlPipelineService + from app.etl_pipeline.file_classifier import ( + FileCategory, + classify_file as etl_classify, + ) await _notify(ctx, "parsing", "Processing file") await ctx.task_logger.log_task_progress( @@ -129,7 +133,13 @@ async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | No {"processing_stage": "extracting"}, ) - etl_result = await EtlPipelineService().extract( + vision_llm = None + if etl_classify(ctx.filename) == FileCategory.IMAGE: + from app.services.llm_service import get_vision_llm + + vision_llm = await get_vision_llm(ctx.session, ctx.search_space_id) + + etl_result = await EtlPipelineService(vision_llm=vision_llm).extract( EtlRequest(file_path=ctx.file_path, filename=ctx.filename) ) From e164fe061249c78c41d0505f6594df60f75d12c6 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 15:29:39 +0200 Subject: [PATCH 21/29] Fix misleading log when vision LLM fails vs not provided --- .../app/etl_pipeline/etl_pipeline_service.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py index 56ade32fb..2e1a803d8 100644 --- a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py +++ b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py @@ -74,11 +74,12 @@ class EtlPipelineService: request.filename, exc_info=True, ) + else: + logging.info( + "No vision LLM provided, falling back to document parser for %s", + request.filename, + ) - logging.info( - "No vision LLM provided, 
falling back to document parser for %s", - request.filename, - ) return await self._extract_document(request) async def _extract_document(self, request: EtlRequest) -> EtlResult: From 4ccdd80e264b4adb30c142c9d888f8d46c2d3639 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 16:14:53 +0200 Subject: [PATCH 22/29] Harden vision LLM fallback, folder upload validation, and export memory --- .../app/etl_pipeline/etl_pipeline_service.py | 9 +++- .../app/etl_pipeline/parsers/vision_llm.py | 7 ++- .../app/routes/documents_routes.py | 24 ++++++++- .../app/services/export_service.py | 54 +++++++++++++++---- 4 files changed, 79 insertions(+), 15 deletions(-) diff --git a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py index 2e1a803d8..b4438ce4d 100644 --- a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py +++ b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py @@ -80,7 +80,14 @@ class EtlPipelineService: request.filename, ) - return await self._extract_document(request) + try: + return await self._extract_document(request) + except (EtlUnsupportedFileError, EtlServiceUnavailableError): + raise EtlUnsupportedFileError( + f"Cannot process image {request.filename}: vision LLM " + f"{'failed' if self._vision_llm else 'not configured'} and " + f"document parser does not support this format" + ) from None async def _extract_document(self, request: EtlRequest) -> EtlResult: from pathlib import PurePosixPath diff --git a/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py b/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py index d3b778801..c80fbca0a 100644 --- a/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py +++ b/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py @@ -1,3 +1,4 @@ +import asyncio import base64 import os @@ -13,6 +14,8 @@ _MAX_IMAGE_BYTES = ( 5 * 1024 * 1024 ) # 5 MB (Anthropic Claude's limit, the most restrictive) +_INVOKE_TIMEOUT_SECONDS = 
120
+
 _EXT_TO_MIME: dict[str, str] = {
     ".png": "image/png",
     ".jpg": "image/jpeg",
@@ -52,7 +55,9 @@ async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
             {"type": "image_url", "image_url": {"url": data_url}},
         ]
     )
-    response = await llm.ainvoke([message])
+    response = await asyncio.wait_for(
+        llm.ainvoke([message]), timeout=_INVOKE_TIMEOUT_SECONDS
+    )
     text = response.content if hasattr(response, "content") else str(response)
     if not text or not text.strip():
         raise ValueError(f"Vision LLM returned empty content for {filename}")
diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py
index 8093084f0..25841a107 100644
--- a/surfsense_backend/app/routes/documents_routes.py
+++ b/surfsense_backend/app/routes/documents_routes.py
@@ -2,7 +2,7 @@ import asyncio
 
 from fastapi import APIRouter, Depends, Form, HTTPException, Query, UploadFile
-from pydantic import BaseModel as PydanticBaseModel
+from pydantic import BaseModel as PydanticBaseModel, Field
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.future import select
 from sqlalchemy.orm import selectinload
@@ -1395,10 +1395,13 @@ class FolderMtimeCheckFile(PydanticBaseModel):
     mtime: float
 
 
+_MAX_MTIME_CHECK_FILES = 10_000
+
+
 class FolderMtimeCheckRequest(PydanticBaseModel):
     folder_name: str
     search_space_id: int
-    files: list[FolderMtimeCheckFile]
+    files: list[FolderMtimeCheckFile] = Field(max_length=_MAX_MTIME_CHECK_FILES)
 
 
 class FolderUnlinkRequest(PydanticBaseModel):
@@ -1531,6 +1534,23 @@ async def folder_upload(
             f"exceeds the {MAX_FILE_SIZE_BYTES // (1024 * 1024)} MB per-file limit.",
         )
 
+    from app.services.folder_service import MAX_FOLDER_DEPTH
+
+    max_subfolder_depth = max((p.count("/") for p in rel_paths if "/" in p), default=0)
+    if 1 + max_subfolder_depth > MAX_FOLDER_DEPTH:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Folder structure too deep: {1 + max_subfolder_depth} levels "
+            f"exceeds the maximum of 
{MAX_FOLDER_DEPTH}.", + ) + + if root_folder_id: + root_folder = await session.get(Folder, root_folder_id) + if not root_folder or root_folder.search_space_id != search_space_id: + raise HTTPException( + status_code=404, detail="Root folder not found in this search space" + ) + if not root_folder_id: watched_metadata = { "watched": True, diff --git a/surfsense_backend/app/services/export_service.py b/surfsense_backend/app/services/export_service.py index 2d36bfaab..97f952223 100644 --- a/surfsense_backend/app/services/export_service.py +++ b/surfsense_backend/app/services/export_service.py @@ -1,5 +1,6 @@ """Service for exporting knowledge base content as a ZIP archive.""" +import asyncio import logging import os import tempfile @@ -106,23 +107,38 @@ async def build_export_zip( folder_path_map = _build_folder_path_map(folders) - doc_query = select(Document).where(Document.search_space_id == search_space_id) + batch_size = 100 + + base_doc_query = select(Document).where(Document.search_space_id == search_space_id) if target_folder_ids is not None: - doc_query = doc_query.where(Document.folder_id.in_(target_folder_ids)) - doc_result = await session.execute(doc_query) - documents = list(doc_result.scalars().all()) + base_doc_query = base_doc_query.where(Document.folder_id.in_(target_folder_ids)) + base_doc_query = base_doc_query.order_by(Document.id) fd, tmp_path = tempfile.mkstemp(suffix=".zip") os.close(fd) - try: - used_paths: dict[str, int] = {} - skipped_docs: list[str] = [] + used_paths: dict[str, int] = {} + skipped_docs: list[str] = [] + is_first_batch = True + + try: + offset = 0 + while True: + batch_query = base_doc_query.limit(batch_size).offset(offset) + batch_result = await session.execute(batch_query) + documents = list(batch_result.scalars().all()) + if not documents: + break + + entries: list[tuple[str, str]] = [] - with zipfile.ZipFile(tmp_path, "w", zipfile.ZIP_DEFLATED) as zf: for doc in documents: status = doc.status or {} - state = 
status.get("state", "ready") if isinstance(status, dict) else "ready" + state = ( + status.get("state", "ready") + if isinstance(status, dict) + else "ready" + ) if state in ("pending", "processing"): skipped_docs.append(doc.title or "Untitled") continue @@ -137,7 +153,9 @@ async def build_export_zip( dir_path = "" base_name = _sanitize_filename(doc.title or "Untitled") - file_path = f"{dir_path}/{base_name}.md" if dir_path else f"{base_name}.md" + file_path = ( + f"{dir_path}/{base_name}.md" if dir_path else f"{base_name}.md" + ) if file_path in used_paths: used_paths[file_path] += 1 @@ -149,7 +167,21 @@ async def build_export_zip( ) used_paths[file_path] = used_paths.get(file_path, 0) + 1 - zf.writestr(file_path, markdown) + entries.append((file_path, markdown)) + + if entries: + mode = "w" if is_first_batch else "a" + batch_entries = entries + + def _write_batch(m: str = mode, e: list = batch_entries) -> None: + with zipfile.ZipFile(tmp_path, m, zipfile.ZIP_DEFLATED) as zf: + for path, content in e: + zf.writestr(path, content) + + await asyncio.to_thread(_write_batch) + is_first_batch = False + + offset += batch_size export_name = "knowledge-base" if folder_id is not None and folder_id in folder_path_map: From 7e14df6012481b43a0f849910d27120bf60e5998 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 16:40:55 +0200 Subject: [PATCH 23/29] Fix button-in-button hydration error in mobile upload drop zone --- .../components/sources/DocumentUploadTab.tsx | 47 +++++++++++-------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/surfsense_web/components/sources/DocumentUploadTab.tsx b/surfsense_web/components/sources/DocumentUploadTab.tsx index 0f8ac298d..117d376ec 100644 --- a/surfsense_web/components/sources/DocumentUploadTab.tsx +++ b/surfsense_web/components/sources/DocumentUploadTab.tsx @@ -530,28 +530,35 @@ export function DocumentUploadTab({ ) ) : ( - + {renderBrowseButton({ fullWidth: true })} + + )} From 
0aefcbd504c3a4a3539969ea2a69c4522635f23f Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 22:06:06 +0200 Subject: [PATCH 24/29] Remove vision LLM from desktop folder watcher --- .../local_folder_indexer.py | 26 ++++--------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py index f88d313da..a531916e1 100644 --- a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py @@ -153,7 +153,7 @@ def scan_folder( return files -async def _read_file_content(file_path: str, filename: str, *, vision_llm=None) -> str: +async def _read_file_content(file_path: str, filename: str) -> str: """Read file content via the unified ETL pipeline. All file types (plaintext, audio, direct-convert, document, image) are @@ -162,7 +162,7 @@ async def _read_file_content(file_path: str, filename: str, *, vision_llm=None) from app.etl_pipeline.etl_document import EtlRequest from app.etl_pipeline.etl_pipeline_service import EtlPipelineService - result = await EtlPipelineService(vision_llm=vision_llm).extract( + result = await EtlPipelineService().extract( EtlRequest(file_path=file_path, filename=filename) ) return result.markdown_content @@ -199,14 +199,12 @@ async def _compute_file_content_hash( file_path: str, filename: str, search_space_id: int, - *, - vision_llm=None, ) -> tuple[str, str]: """Read a file (via ETL if needed) and compute its content hash. Returns (content_text, content_hash). 
""" - content = await _read_file_content(file_path, filename, vision_llm=vision_llm) + content = await _read_file_content(file_path, filename) return content, _content_hash(content, search_space_id) @@ -637,10 +635,6 @@ async def index_local_folder( page_limit_service = PageLimitService(session) - from app.services.llm_service import get_vision_llm - - vision_llm = await get_vision_llm(session, search_space_id) - # ================================================================ # PHASE 1: Pre-filter files (mtime / content-hash), version changed # ================================================================ @@ -713,7 +707,6 @@ async def index_local_folder( file_path_abs, file_info["relative_path"], search_space_id, - vision_llm=vision_llm, ) except Exception as read_err: logger.warning(f"Could not read {file_path_abs}: {read_err}") @@ -750,7 +743,6 @@ async def index_local_folder( file_path_abs, file_info["relative_path"], search_space_id, - vision_llm=vision_llm, ) except Exception as read_err: logger.warning(f"Could not read {file_path_abs}: {read_err}") @@ -1092,13 +1084,9 @@ async def _index_single_file( except PageLimitExceededError as e: return 0, 1, f"Page limit exceeded: {e}" - from app.services.llm_service import get_vision_llm - - vision_llm = await get_vision_llm(session, search_space_id) - try: content, content_hash = await _compute_file_content_hash( - str(full_path), full_path.name, search_space_id, vision_llm=vision_llm + str(full_path), full_path.name, search_space_id ) except Exception as e: return 0, 1, f"Could not read file: {e}" @@ -1316,10 +1304,6 @@ async def index_uploaded_files( pipeline = IndexingPipelineService(session) llm = await get_user_long_context_llm(session, user_id, search_space_id) - from app.services.llm_service import get_vision_llm - - vision_llm = await get_vision_llm(session, search_space_id) - indexed_count = 0 failed_count = 0 errors: list[str] = [] @@ -1367,7 +1351,7 @@ async def index_uploaded_files( try: content, 
content_hash = await _compute_file_content_hash( - temp_path, filename, search_space_id, vision_llm=vision_llm + temp_path, filename, search_space_id ) except Exception as e: logger.warning(f"Could not read {relative_path}: {e}") From a95bf58c8f4e5ae593ca0eb49354c9668b8c3a51 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 10 Apr 2026 16:45:51 +0200 Subject: [PATCH 25/29] Make Vision LLM opt-in for uploads and connectors --- ...121_add_enable_vision_llm_to_connectors.py | 45 +++++++++++++++++++ .../connectors/dropbox/content_extractor.py | 4 +- .../google_drive/content_extractor.py | 12 +++-- .../connectors/onedrive/content_extractor.py | 12 +++-- surfsense_backend/app/db.py | 7 +++ .../app/routes/documents_routes.py | 4 ++ .../app/schemas/search_source_connector.py | 2 + .../app/services/task_dispatcher.py | 3 ++ .../app/tasks/celery_tasks/document_tasks.py | 8 ++++ .../connector_indexers/dropbox_indexer.py | 21 ++++++++- .../google_drive_indexer.py | 34 +++++++++++++- .../local_folder_indexer.py | 18 ++++++-- .../connector_indexers/onedrive_indexer.py | 21 ++++++++- .../document_processors/file_processors.py | 10 ++++- .../integration/document_upload/conftest.py | 2 + .../assistant-ui/connector-popup.tsx | 6 +++ .../components/vision-llm-config.tsx | 25 +++++++++++ .../views/connector-edit-view.tsx | 13 ++++++ .../views/indexing-configuration-view.tsx | 13 ++++++ .../hooks/use-connector-dialog.ts | 14 +++++- .../components/sources/DocumentUploadTab.tsx | 13 ++++++ .../contracts/types/connector.types.ts | 3 ++ .../contracts/types/document.types.ts | 1 + .../lib/apis/documents-api.service.ts | 5 ++- 24 files changed, 276 insertions(+), 20 deletions(-) create mode 100644 surfsense_backend/alembic/versions/121_add_enable_vision_llm_to_connectors.py create mode 100644 surfsense_web/components/assistant-ui/connector-popup/components/vision-llm-config.tsx diff --git a/surfsense_backend/alembic/versions/121_add_enable_vision_llm_to_connectors.py 
b/surfsense_backend/alembic/versions/121_add_enable_vision_llm_to_connectors.py new file mode 100644 index 000000000..659545645 --- /dev/null +++ b/surfsense_backend/alembic/versions/121_add_enable_vision_llm_to_connectors.py @@ -0,0 +1,45 @@ +"""121_add_enable_vision_llm_to_connectors + +Revision ID: 121 +Revises: 120 +Create Date: 2026-04-09 + +Adds enable_vision_llm boolean column to search_source_connectors. +Defaults to False so vision LLM image processing is opt-in. +""" + +from __future__ import annotations + +from collections.abc import Sequence + +import sqlalchemy as sa + +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = "121" +down_revision: str | None = "120" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def upgrade() -> None: + conn = op.get_bind() + existing_columns = [ + col["name"] for col in sa.inspect(conn).get_columns("search_source_connectors") + ] + + if "enable_vision_llm" not in existing_columns: + op.add_column( + "search_source_connectors", + sa.Column( + "enable_vision_llm", + sa.Boolean(), + nullable=False, + server_default=sa.text("false"), + ), + ) + + +def downgrade() -> None: + op.drop_column("search_source_connectors", "enable_vision_llm") diff --git a/surfsense_backend/app/connectors/dropbox/content_extractor.py b/surfsense_backend/app/connectors/dropbox/content_extractor.py index 8cbc3e417..372d2fc82 100644 --- a/surfsense_backend/app/connectors/dropbox/content_extractor.py +++ b/surfsense_backend/app/connectors/dropbox/content_extractor.py @@ -44,6 +44,8 @@ async def _export_paper_content( async def download_and_extract_content( client: DropboxClient, file: dict[str, Any], + *, + vision_llm=None, ) -> tuple[str | None, dict[str, Any], str | None]: """Download a Dropbox file and extract its content as markdown. 
@@ -91,7 +93,7 @@ async def download_and_extract_content( from app.etl_pipeline.etl_document import EtlRequest from app.etl_pipeline.etl_pipeline_service import EtlPipelineService - result = await EtlPipelineService().extract( + result = await EtlPipelineService(vision_llm=vision_llm).extract( EtlRequest(file_path=temp_file_path, filename=file_name) ) markdown = result.markdown_content diff --git a/surfsense_backend/app/connectors/google_drive/content_extractor.py b/surfsense_backend/app/connectors/google_drive/content_extractor.py index 83ff32e82..86c789b97 100644 --- a/surfsense_backend/app/connectors/google_drive/content_extractor.py +++ b/surfsense_backend/app/connectors/google_drive/content_extractor.py @@ -27,6 +27,8 @@ logger = logging.getLogger(__name__) async def download_and_extract_content( client: GoogleDriveClient, file: dict[str, Any], + *, + vision_llm=None, ) -> tuple[str | None, dict[str, Any], str | None]: """Download a Google Drive file and extract its content as markdown. 
@@ -103,7 +105,9 @@ async def download_and_extract_content( etl_filename = ( file_name + extension if is_google_workspace_file(mime_type) else file_name ) - markdown = await _parse_file_to_markdown(temp_file_path, etl_filename) + markdown = await _parse_file_to_markdown( + temp_file_path, etl_filename, vision_llm=vision_llm + ) return markdown, drive_metadata, None except Exception as e: @@ -115,12 +119,14 @@ async def download_and_extract_content( os.unlink(temp_file_path) -async def _parse_file_to_markdown(file_path: str, filename: str) -> str: +async def _parse_file_to_markdown( + file_path: str, filename: str, *, vision_llm=None +) -> str: """Parse a local file to markdown using the unified ETL pipeline.""" from app.etl_pipeline.etl_document import EtlRequest from app.etl_pipeline.etl_pipeline_service import EtlPipelineService - result = await EtlPipelineService().extract( + result = await EtlPipelineService(vision_llm=vision_llm).extract( EtlRequest(file_path=file_path, filename=filename) ) return result.markdown_content diff --git a/surfsense_backend/app/connectors/onedrive/content_extractor.py b/surfsense_backend/app/connectors/onedrive/content_extractor.py index 2238b8603..3154f2eca 100644 --- a/surfsense_backend/app/connectors/onedrive/content_extractor.py +++ b/surfsense_backend/app/connectors/onedrive/content_extractor.py @@ -16,6 +16,8 @@ logger = logging.getLogger(__name__) async def download_and_extract_content( client: OneDriveClient, file: dict[str, Any], + *, + vision_llm=None, ) -> tuple[str | None, dict[str, Any], str | None]: """Download a OneDrive file and extract its content as markdown. 
@@ -65,7 +67,9 @@ async def download_and_extract_content( if error: return None, metadata, error - markdown = await _parse_file_to_markdown(temp_file_path, file_name) + markdown = await _parse_file_to_markdown( + temp_file_path, file_name, vision_llm=vision_llm + ) return markdown, metadata, None except Exception as e: @@ -77,12 +81,14 @@ async def download_and_extract_content( os.unlink(temp_file_path) -async def _parse_file_to_markdown(file_path: str, filename: str) -> str: +async def _parse_file_to_markdown( + file_path: str, filename: str, *, vision_llm=None +) -> str: """Parse a local file to markdown using the unified ETL pipeline.""" from app.etl_pipeline.etl_document import EtlRequest from app.etl_pipeline.etl_pipeline_service import EtlPipelineService - result = await EtlPipelineService().extract( + result = await EtlPipelineService(vision_llm=vision_llm).extract( EtlRequest(file_path=file_path, filename=filename) ) return result.markdown_content diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py index 01a6bbda0..cbcb5efa5 100644 --- a/surfsense_backend/app/db.py +++ b/surfsense_backend/app/db.py @@ -1555,6 +1555,13 @@ class SearchSourceConnector(BaseModel, TimestampMixin): Boolean, nullable=False, default=False, server_default="false" ) + # Vision LLM for image files - disabled by default to save cost/time. + # When enabled, images are described via a vision language model instead + # of falling back to the document parser. 
+ enable_vision_llm = Column( + Boolean, nullable=False, default=False, server_default="false" + ) + # Periodic indexing fields periodic_indexing_enabled = Column(Boolean, nullable=False, default=False) indexing_frequency_minutes = Column(Integer, nullable=True) diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py index 25841a107..aa7f98294 100644 --- a/surfsense_backend/app/routes/documents_routes.py +++ b/surfsense_backend/app/routes/documents_routes.py @@ -123,6 +123,7 @@ async def create_documents_file_upload( files: list[UploadFile], search_space_id: int = Form(...), should_summarize: bool = Form(False), + use_vision_llm: bool = Form(False), session: AsyncSession = Depends(get_async_session), user: User = Depends(current_active_user), dispatcher: TaskDispatcher = Depends(get_task_dispatcher), @@ -272,6 +273,7 @@ async def create_documents_file_upload( search_space_id=search_space_id, user_id=str(user.id), should_summarize=should_summarize, + use_vision_llm=use_vision_llm, ) return { @@ -1490,6 +1492,7 @@ async def folder_upload( relative_paths: str = Form(...), root_folder_id: int | None = Form(None), enable_summary: bool = Form(False), + use_vision_llm: bool = Form(False), session: AsyncSession = Depends(get_async_session), user: User = Depends(current_active_user), ): @@ -1616,6 +1619,7 @@ async def folder_upload( folder_name=folder_name, root_folder_id=root_folder_id, enable_summary=enable_summary, + use_vision_llm=use_vision_llm, file_mappings=list(file_mappings), ) diff --git a/surfsense_backend/app/schemas/search_source_connector.py b/surfsense_backend/app/schemas/search_source_connector.py index 1b0ed0b13..aac7b92d5 100644 --- a/surfsense_backend/app/schemas/search_source_connector.py +++ b/surfsense_backend/app/schemas/search_source_connector.py @@ -17,6 +17,7 @@ class SearchSourceConnectorBase(BaseModel): last_indexed_at: datetime | None = None config: dict[str, Any] enable_summary: bool = 
False + enable_vision_llm: bool = False periodic_indexing_enabled: bool = False indexing_frequency_minutes: int | None = None next_scheduled_at: datetime | None = None @@ -67,6 +68,7 @@ class SearchSourceConnectorUpdate(BaseModel): last_indexed_at: datetime | None = None config: dict[str, Any] | None = None enable_summary: bool | None = None + enable_vision_llm: bool | None = None periodic_indexing_enabled: bool | None = None indexing_frequency_minutes: int | None = None next_scheduled_at: datetime | None = None diff --git a/surfsense_backend/app/services/task_dispatcher.py b/surfsense_backend/app/services/task_dispatcher.py index 9a6fc7d63..7bb70b406 100644 --- a/surfsense_backend/app/services/task_dispatcher.py +++ b/surfsense_backend/app/services/task_dispatcher.py @@ -19,6 +19,7 @@ class TaskDispatcher(Protocol): search_space_id: int, user_id: str, should_summarize: bool = False, + use_vision_llm: bool = False, ) -> None: ... @@ -34,6 +35,7 @@ class CeleryTaskDispatcher: search_space_id: int, user_id: str, should_summarize: bool = False, + use_vision_llm: bool = False, ) -> None: from app.tasks.celery_tasks.document_tasks import ( process_file_upload_with_document_task, @@ -46,6 +48,7 @@ class CeleryTaskDispatcher: search_space_id=search_space_id, user_id=user_id, should_summarize=should_summarize, + use_vision_llm=use_vision_llm, ) diff --git a/surfsense_backend/app/tasks/celery_tasks/document_tasks.py b/surfsense_backend/app/tasks/celery_tasks/document_tasks.py index 62720826f..fc946b4bc 100644 --- a/surfsense_backend/app/tasks/celery_tasks/document_tasks.py +++ b/surfsense_backend/app/tasks/celery_tasks/document_tasks.py @@ -778,6 +778,7 @@ def process_file_upload_with_document_task( search_space_id: int, user_id: str, should_summarize: bool = False, + use_vision_llm: bool = False, ): """ Celery task to process uploaded file with existing pending document. 
@@ -833,6 +834,7 @@ def process_file_upload_with_document_task( search_space_id, user_id, should_summarize=should_summarize, + use_vision_llm=use_vision_llm, ) ) logger.info( @@ -869,6 +871,7 @@ async def _process_file_with_document( search_space_id: int, user_id: str, should_summarize: bool = False, + use_vision_llm: bool = False, ): """ Process file and update existing pending document status. @@ -971,6 +974,7 @@ async def _process_file_with_document( log_entry=log_entry, notification=notification, should_summarize=should_summarize, + use_vision_llm=use_vision_llm, ) # Update notification on success @@ -1428,6 +1432,7 @@ def index_uploaded_folder_files_task( root_folder_id: int, enable_summary: bool, file_mappings: list[dict], + use_vision_llm: bool = False, ): """Celery task to index files uploaded from the desktop app.""" loop = asyncio.new_event_loop() @@ -1441,6 +1446,7 @@ def index_uploaded_folder_files_task( root_folder_id=root_folder_id, enable_summary=enable_summary, file_mappings=file_mappings, + use_vision_llm=use_vision_llm, ) ) finally: @@ -1454,6 +1460,7 @@ async def _index_uploaded_folder_files_async( root_folder_id: int, enable_summary: bool, file_mappings: list[dict], + use_vision_llm: bool = False, ): """Run upload-based folder indexing with notification + heartbeat.""" file_count = len(file_mappings) @@ -1503,6 +1510,7 @@ async def _index_uploaded_folder_files_async( enable_summary=enable_summary, file_mappings=file_mappings, on_heartbeat_callback=_heartbeat_progress, + use_vision_llm=use_vision_llm, ) if notification: diff --git a/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py b/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py index 4a49944c2..9f8c1a33a 100644 --- a/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py @@ -164,6 +164,7 @@ async def _download_files_parallel( enable_summary: bool, max_concurrency: int = 3, 
on_heartbeat: HeartbeatCallbackType | None = None, + vision_llm=None, ) -> tuple[list[ConnectorDocument], int]: """Download and ETL files in parallel. Returns (docs, failed_count).""" results: list[ConnectorDocument] = [] @@ -176,7 +177,7 @@ async def _download_files_parallel( nonlocal last_heartbeat, completed_count async with sem: markdown, db_metadata, error = await download_and_extract_content( - dropbox_client, file + dropbox_client, file, vision_llm=vision_llm ) if error or not markdown: file_name = file.get("name", "Unknown") @@ -224,6 +225,7 @@ async def _download_and_index( user_id: str, enable_summary: bool, on_heartbeat: HeartbeatCallbackType | None = None, + vision_llm=None, ) -> tuple[int, int]: """Parallel download then parallel indexing. Returns (batch_indexed, total_failed).""" connector_docs, download_failed = await _download_files_parallel( @@ -234,6 +236,7 @@ async def _download_and_index( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat, + vision_llm=vision_llm, ) batch_indexed = 0 @@ -287,6 +290,7 @@ async def _index_with_delta_sync( max_files: int, on_heartbeat_callback: HeartbeatCallbackType | None = None, enable_summary: bool = True, + vision_llm=None, ) -> tuple[int, int, int, str]: """Delta sync using Dropbox cursor-based change tracking. @@ -359,6 +363,7 @@ async def _index_with_delta_sync( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat_callback, + vision_llm=vision_llm, ) indexed = renamed_count + batch_indexed @@ -384,6 +389,7 @@ async def _index_full_scan( incremental_sync: bool = True, on_heartbeat_callback: HeartbeatCallbackType | None = None, enable_summary: bool = True, + vision_llm=None, ) -> tuple[int, int, int]: """Full scan indexing of a folder. 
@@ -469,6 +475,7 @@ async def _index_full_scan( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat_callback, + vision_llm=vision_llm, ) if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0: @@ -498,6 +505,7 @@ async def _index_selected_files( enable_summary: bool, incremental_sync: bool = True, on_heartbeat: HeartbeatCallbackType | None = None, + vision_llm=None, ) -> tuple[int, int, int, list[str]]: """Index user-selected files using the parallel pipeline.""" page_limit_service = PageLimitService(session) @@ -557,6 +565,7 @@ async def _index_selected_files( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat, + vision_llm=vision_llm, ) if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0: @@ -621,6 +630,13 @@ async def index_dropbox_files( return 0, 0, error_msg, 0 connector_enable_summary = getattr(connector, "enable_summary", True) + connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False) + vision_llm = None + if connector_enable_vision_llm: + from app.services.llm_service import get_vision_llm + + vision_llm = await get_vision_llm(session, search_space_id) + dropbox_client = DropboxClient(session, connector_id) indexing_options = items_dict.get("indexing_options", {}) @@ -650,6 +666,7 @@ async def index_dropbox_files( user_id=user_id, enable_summary=connector_enable_summary, incremental_sync=incremental_sync, + vision_llm=vision_llm, ) total_indexed += indexed total_skipped += skipped @@ -684,6 +701,7 @@ async def index_dropbox_files( log_entry, max_files, enable_summary=connector_enable_summary, + vision_llm=vision_llm, ) folder_cursors[folder_path] = new_cursor total_unsupported += unsup @@ -703,6 +721,7 @@ async def index_dropbox_files( include_subfolders, incremental_sync=incremental_sync, enable_summary=connector_enable_summary, + vision_llm=vision_llm, ) total_unsupported += unsup diff --git 
a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py index b11087fe6..d8f95da63 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py @@ -261,6 +261,7 @@ async def _download_files_parallel( enable_summary: bool, max_concurrency: int = 3, on_heartbeat: HeartbeatCallbackType | None = None, + vision_llm=None, ) -> tuple[list[ConnectorDocument], int]: """Download and ETL files in parallel, returning ConnectorDocuments. @@ -276,7 +277,7 @@ async def _download_files_parallel( nonlocal last_heartbeat, completed_count async with sem: markdown, drive_metadata, error = await download_and_extract_content( - drive_client, file + drive_client, file, vision_llm=vision_llm ) if error or not markdown: file_name = file.get("name", "Unknown") @@ -322,6 +323,7 @@ async def _process_single_file( search_space_id: int, user_id: str, enable_summary: bool = True, + vision_llm=None, ) -> tuple[int, int, int]: """Download, extract, and index a single Drive file via the pipeline. @@ -343,7 +345,7 @@ async def _process_single_file( await page_limit_service.check_page_limit(user_id, estimated_pages) markdown, drive_metadata, error = await download_and_extract_content( - drive_client, file + drive_client, file, vision_llm=vision_llm ) if error or not markdown: logger.warning(f"ETL failed for {file_name}: {error}") @@ -433,6 +435,7 @@ async def _download_and_index( user_id: str, enable_summary: bool, on_heartbeat: HeartbeatCallbackType | None = None, + vision_llm=None, ) -> tuple[int, int]: """Phase 2+3: parallel download then parallel indexing. 
@@ -446,6 +449,7 @@ async def _download_and_index( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat, + vision_llm=vision_llm, ) batch_indexed = 0 @@ -476,6 +480,7 @@ async def _index_selected_files( user_id: str, enable_summary: bool, on_heartbeat: HeartbeatCallbackType | None = None, + vision_llm=None, ) -> tuple[int, int, int, list[str]]: """Index user-selected files using the parallel pipeline. @@ -540,6 +545,7 @@ async def _index_selected_files( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat, + vision_llm=vision_llm, ) if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0: @@ -573,6 +579,7 @@ async def _index_full_scan( include_subfolders: bool = False, on_heartbeat_callback: HeartbeatCallbackType | None = None, enable_summary: bool = True, + vision_llm=None, ) -> tuple[int, int, int]: """Full scan indexing of a folder. @@ -703,6 +710,7 @@ async def _index_full_scan( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat_callback, + vision_llm=vision_llm, ) if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0: @@ -736,6 +744,7 @@ async def _index_with_delta_sync( include_subfolders: bool = False, on_heartbeat_callback: HeartbeatCallbackType | None = None, enable_summary: bool = True, + vision_llm=None, ) -> tuple[int, int, int]: """Delta sync using change tracking. 
@@ -844,6 +853,7 @@ async def _index_with_delta_sync( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat_callback, + vision_llm=vision_llm, ) if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0: @@ -947,6 +957,11 @@ async def index_google_drive_files( ) connector_enable_summary = getattr(connector, "enable_summary", True) + connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False) + vision_llm = None + if connector_enable_vision_llm: + from app.services.llm_service import get_vision_llm + vision_llm = await get_vision_llm(session, search_space_id) drive_client = GoogleDriveClient( session, connector_id, credentials=pre_built_credentials ) @@ -986,6 +1001,7 @@ async def index_google_drive_files( include_subfolders, on_heartbeat_callback, connector_enable_summary, + vision_llm=vision_llm, ) documents_unsupported += du logger.info("Running reconciliation scan after delta sync") @@ -1004,6 +1020,7 @@ async def index_google_drive_files( include_subfolders, on_heartbeat_callback, connector_enable_summary, + vision_llm=vision_llm, ) documents_indexed += ri documents_skipped += rs @@ -1029,6 +1046,7 @@ async def index_google_drive_files( include_subfolders, on_heartbeat_callback, connector_enable_summary, + vision_llm=vision_llm, ) if documents_indexed > 0 or can_use_delta: @@ -1146,6 +1164,11 @@ async def index_google_drive_single_file( ) connector_enable_summary = getattr(connector, "enable_summary", True) + connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False) + vision_llm = None + if connector_enable_vision_llm: + from app.services.llm_service import get_vision_llm + vision_llm = await get_vision_llm(session, search_space_id) drive_client = GoogleDriveClient( session, connector_id, credentials=pre_built_credentials ) @@ -1168,6 +1191,7 @@ async def index_google_drive_single_file( search_space_id, user_id, connector_enable_summary, + vision_llm=vision_llm, ) await session.commit() @@ 
-1278,6 +1302,11 @@ async def index_google_drive_selected_files( return 0, 0, [error_msg] connector_enable_summary = getattr(connector, "enable_summary", True) + connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False) + vision_llm = None + if connector_enable_vision_llm: + from app.services.llm_service import get_vision_llm + vision_llm = await get_vision_llm(session, search_space_id) drive_client = GoogleDriveClient( session, connector_id, credentials=pre_built_credentials ) @@ -1291,6 +1320,7 @@ async def index_google_drive_selected_files( user_id=user_id, enable_summary=connector_enable_summary, on_heartbeat=on_heartbeat_callback, + vision_llm=vision_llm, ) if unsupported > 0: diff --git a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py index a531916e1..2d5f9648d 100644 --- a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py @@ -153,7 +153,7 @@ def scan_folder( return files -async def _read_file_content(file_path: str, filename: str) -> str: +async def _read_file_content(file_path: str, filename: str, *, vision_llm=None) -> str: """Read file content via the unified ETL pipeline. 
All file types (plaintext, audio, direct-convert, document, image) are @@ -162,7 +162,7 @@ async def _read_file_content(file_path: str, filename: str) -> str: from app.etl_pipeline.etl_document import EtlRequest from app.etl_pipeline.etl_pipeline_service import EtlPipelineService - result = await EtlPipelineService().extract( + result = await EtlPipelineService(vision_llm=vision_llm).extract( EtlRequest(file_path=file_path, filename=filename) ) return result.markdown_content @@ -199,12 +199,14 @@ async def _compute_file_content_hash( file_path: str, filename: str, search_space_id: int, + *, + vision_llm=None, ) -> tuple[str, str]: """Read a file (via ETL if needed) and compute its content hash. Returns (content_text, content_hash). """ - content = await _read_file_content(file_path, filename) + content = await _read_file_content(file_path, filename, vision_llm=vision_llm) return content, _content_hash(content, search_space_id) @@ -1268,6 +1270,7 @@ async def index_uploaded_files( enable_summary: bool, file_mappings: list[dict], on_heartbeat_callback: HeartbeatCallbackType | None = None, + use_vision_llm: bool = False, ) -> tuple[int, int, str | None]: """Index files uploaded from the desktop app via temp paths. 
@@ -1304,6 +1307,12 @@ async def index_uploaded_files( pipeline = IndexingPipelineService(session) llm = await get_user_long_context_llm(session, user_id, search_space_id) + vision_llm_instance = None + if use_vision_llm: + from app.services.llm_service import get_vision_llm + + vision_llm_instance = await get_vision_llm(session, search_space_id) + indexed_count = 0 failed_count = 0 errors: list[str] = [] @@ -1351,7 +1360,8 @@ async def index_uploaded_files( try: content, content_hash = await _compute_file_content_hash( - temp_path, filename, search_space_id + temp_path, filename, search_space_id, + vision_llm=vision_llm_instance, ) except Exception as e: logger.warning(f"Could not read {relative_path}: {e}") diff --git a/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py index 06517f542..aa654a9a9 100644 --- a/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py @@ -171,6 +171,7 @@ async def _download_files_parallel( enable_summary: bool, max_concurrency: int = 3, on_heartbeat: HeartbeatCallbackType | None = None, + vision_llm=None, ) -> tuple[list[ConnectorDocument], int]: """Download and ETL files in parallel. Returns (docs, failed_count).""" results: list[ConnectorDocument] = [] @@ -183,7 +184,7 @@ async def _download_files_parallel( nonlocal last_heartbeat, completed_count async with sem: markdown, od_metadata, error = await download_and_extract_content( - onedrive_client, file + onedrive_client, file, vision_llm=vision_llm ) if error or not markdown: file_name = file.get("name", "Unknown") @@ -231,6 +232,7 @@ async def _download_and_index( user_id: str, enable_summary: bool, on_heartbeat: HeartbeatCallbackType | None = None, + vision_llm=None, ) -> tuple[int, int]: """Parallel download then parallel indexing. 
Returns (batch_indexed, total_failed).""" connector_docs, download_failed = await _download_files_parallel( @@ -241,6 +243,7 @@ async def _download_and_index( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat, + vision_llm=vision_llm, ) batch_indexed = 0 @@ -293,6 +296,7 @@ async def _index_selected_files( user_id: str, enable_summary: bool, on_heartbeat: HeartbeatCallbackType | None = None, + vision_llm=None, ) -> tuple[int, int, int, list[str]]: """Index user-selected files using the parallel pipeline.""" page_limit_service = PageLimitService(session) @@ -343,6 +347,7 @@ async def _index_selected_files( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat, + vision_llm=vision_llm, ) if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0: @@ -375,6 +380,7 @@ async def _index_full_scan( include_subfolders: bool = True, on_heartbeat_callback: HeartbeatCallbackType | None = None, enable_summary: bool = True, + vision_llm=None, ) -> tuple[int, int, int]: """Full scan indexing of a folder. @@ -450,6 +456,7 @@ async def _index_full_scan( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat_callback, + vision_llm=vision_llm, ) if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0: @@ -481,6 +488,7 @@ async def _index_with_delta_sync( max_files: int, on_heartbeat_callback: HeartbeatCallbackType | None = None, enable_summary: bool = True, + vision_llm=None, ) -> tuple[int, int, int, str | None]: """Delta sync using OneDrive change tracking. 
@@ -573,6 +581,7 @@ async def _index_with_delta_sync( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat_callback, + vision_llm=vision_llm, ) if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0: @@ -643,6 +652,12 @@ async def index_onedrive_files( return 0, 0, error_msg, 0 connector_enable_summary = getattr(connector, "enable_summary", True) + connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False) + vision_llm = None + if connector_enable_vision_llm: + from app.services.llm_service import get_vision_llm + vision_llm = await get_vision_llm(session, search_space_id) + onedrive_client = OneDriveClient(session, connector_id) indexing_options = items_dict.get("indexing_options", {}) @@ -666,6 +681,7 @@ async def index_onedrive_files( search_space_id=search_space_id, user_id=user_id, enable_summary=connector_enable_summary, + vision_llm=vision_llm, ) total_indexed += indexed total_skipped += skipped @@ -695,6 +711,7 @@ async def index_onedrive_files( log_entry, max_files, enable_summary=connector_enable_summary, + vision_llm=vision_llm, ) total_indexed += indexed total_skipped += skipped @@ -721,6 +738,7 @@ async def index_onedrive_files( max_files, include_subfolders, enable_summary=connector_enable_summary, + vision_llm=vision_llm, ) total_indexed += ri total_skipped += rs @@ -740,6 +758,7 @@ async def index_onedrive_files( max_files, include_subfolders, enable_summary=connector_enable_summary, + vision_llm=vision_llm, ) total_indexed += indexed total_skipped += skipped diff --git a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py index cd06657dc..9364fa1cb 100644 --- a/surfsense_backend/app/tasks/document_processors/file_processors.py +++ b/surfsense_backend/app/tasks/document_processors/file_processors.py @@ -46,6 +46,7 @@ class _ProcessingContext: log_entry: Log connector: dict | None = None notification: 
Notification | None = None + use_vision_llm: bool = False enable_summary: bool = field(init=False) def __post_init__(self) -> None: @@ -134,7 +135,7 @@ async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | No ) vision_llm = None - if etl_classify(ctx.filename) == FileCategory.IMAGE: + if ctx.use_vision_llm and etl_classify(ctx.filename) == FileCategory.IMAGE: from app.services.llm_service import get_vision_llm vision_llm = await get_vision_llm(ctx.session, ctx.search_space_id) @@ -288,6 +289,7 @@ async def process_file_in_background( log_entry: Log, connector: dict | None = None, notification: Notification | None = None, + use_vision_llm: bool = False, ) -> Document | None: ctx = _ProcessingContext( session=session, @@ -299,6 +301,7 @@ async def process_file_in_background( log_entry=log_entry, connector=connector, notification=notification, + use_vision_llm=use_vision_llm, ) try: @@ -349,6 +352,7 @@ async def _extract_file_content( task_logger: TaskLoggingService, log_entry: Log, notification: Notification | None, + use_vision_llm: bool = False, ) -> tuple[str, str]: """ Extract markdown content from a file regardless of type. @@ -396,7 +400,7 @@ async def _extract_file_content( await page_limit_service.check_page_limit(user_id, estimated_pages) vision_llm = None - if category == FileCategory.IMAGE: + if use_vision_llm and category == FileCategory.IMAGE: from app.services.llm_service import get_vision_llm vision_llm = await get_vision_llm(session, search_space_id) @@ -435,6 +439,7 @@ async def process_file_in_background_with_document( connector: dict | None = None, notification: Notification | None = None, should_summarize: bool = False, + use_vision_llm: bool = False, ) -> Document | None: """ Process file and update existing pending document (2-phase pattern). 
@@ -463,6 +468,7 @@ async def process_file_in_background_with_document( task_logger, log_entry, notification, + use_vision_llm=use_vision_llm, ) if not markdown_content: diff --git a/surfsense_backend/tests/integration/document_upload/conftest.py b/surfsense_backend/tests/integration/document_upload/conftest.py index 62f4f6b47..f35d2e605 100644 --- a/surfsense_backend/tests/integration/document_upload/conftest.py +++ b/surfsense_backend/tests/integration/document_upload/conftest.py @@ -69,6 +69,7 @@ class InlineTaskDispatcher: search_space_id: int, user_id: str, should_summarize: bool = False, + use_vision_llm: bool = False, ) -> None: from app.tasks.celery_tasks.document_tasks import ( _process_file_with_document, @@ -82,6 +83,7 @@ class InlineTaskDispatcher: search_space_id, user_id, should_summarize=should_summarize, + use_vision_llm=use_vision_llm, ) diff --git a/surfsense_web/components/assistant-ui/connector-popup.tsx b/surfsense_web/components/assistant-ui/connector-popup.tsx index c41e986d4..84361e25b 100644 --- a/surfsense_web/components/assistant-ui/connector-popup.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup.tsx @@ -98,6 +98,7 @@ export const ConnectorIndicator = forwardRef { startIndexing(editingConnector.id); handleSaveConnector(() => refreshConnectors()); @@ -336,6 +340,7 @@ export const ConnectorIndicator = forwardRef { if (indexingConfig.connectorId) { diff --git a/surfsense_web/components/assistant-ui/connector-popup/components/vision-llm-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/components/vision-llm-config.tsx new file mode 100644 index 000000000..e5ebdbd06 --- /dev/null +++ b/surfsense_web/components/assistant-ui/connector-popup/components/vision-llm-config.tsx @@ -0,0 +1,25 @@ +"use client"; + +import type { FC } from "react"; +import { Switch } from "@/components/ui/switch"; + +interface VisionLLMConfigProps { + enabled: boolean; + onEnabledChange: (enabled: boolean) => void; +} + +export const 
VisionLLMConfig: FC = ({ enabled, onEnabledChange }) => { + return ( +
+
+
+

Enable Vision LLM

+

+ Describes images using AI vision (costly, slower) +

+
+ +
+
+ ); +}; diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx index 7308e1e26..bea5d12e8 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx @@ -15,6 +15,7 @@ import { cn } from "@/lib/utils"; import { DateRangeSelector } from "../../components/date-range-selector"; import { PeriodicSyncConfig } from "../../components/periodic-sync-config"; import { SummaryConfig } from "../../components/summary-config"; +import { VisionLLMConfig } from "../../components/vision-llm-config"; import { getConnectorDisplayName } from "../../tabs/all-connectors-tab"; import { getConnectorConfigComponent } from "../index"; @@ -38,6 +39,7 @@ interface ConnectorEditViewProps { periodicEnabled: boolean; frequencyMinutes: string; enableSummary: boolean; + enableVisionLlm: boolean; isSaving: boolean; isDisconnecting: boolean; isIndexing?: boolean; @@ -47,6 +49,7 @@ interface ConnectorEditViewProps { onPeriodicEnabledChange: (enabled: boolean) => void; onFrequencyChange: (frequency: string) => void; onEnableSummaryChange: (enabled: boolean) => void; + onEnableVisionLlmChange: (enabled: boolean) => void; onSave: () => void; onDisconnect: () => void; onBack: () => void; @@ -62,6 +65,7 @@ export const ConnectorEditView: FC = ({ periodicEnabled, frequencyMinutes, enableSummary, + enableVisionLlm, isSaving, isDisconnecting, isIndexing = false, @@ -71,6 +75,7 @@ export const ConnectorEditView: FC = ({ onPeriodicEnabledChange, onFrequencyChange, onEnableSummaryChange, + onEnableVisionLlmChange, onSave, onDisconnect, onBack, @@ -272,6 +277,14 @@ export const ConnectorEditView: FC = ({ {/* AI Summary toggle */} + {/* Vision LLM toggle - only for file-based connectors */} 
+ {(connector.connector_type === "GOOGLE_DRIVE_CONNECTOR" || + connector.connector_type === "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" || + connector.connector_type === "DROPBOX_CONNECTOR" || + connector.connector_type === "ONEDRIVE_CONNECTOR") && ( + + )} + {/* Date range selector - not shown for file-based connectors (Drive, Dropbox, OneDrive), Webcrawler, GitHub, or Local Folder */} {connector.connector_type !== "GOOGLE_DRIVE_CONNECTOR" && connector.connector_type !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" && diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx index e583cbe17..cb7438cde 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx @@ -10,6 +10,7 @@ import { cn } from "@/lib/utils"; import { DateRangeSelector } from "../../components/date-range-selector"; import { PeriodicSyncConfig } from "../../components/periodic-sync-config"; import { SummaryConfig } from "../../components/summary-config"; +import { VisionLLMConfig } from "../../components/vision-llm-config"; import type { IndexingConfigState } from "../../constants/connector-constants"; import { getConnectorDisplayName } from "../../tabs/all-connectors-tab"; import { getConnectorConfigComponent } from "../index"; @@ -22,6 +23,7 @@ interface IndexingConfigurationViewProps { periodicEnabled: boolean; frequencyMinutes: string; enableSummary: boolean; + enableVisionLlm: boolean; isStartingIndexing: boolean; isFromOAuth?: boolean; onStartDateChange: (date: Date | undefined) => void; @@ -29,6 +31,7 @@ interface IndexingConfigurationViewProps { onPeriodicEnabledChange: (enabled: boolean) => void; onFrequencyChange: (frequency: string) => void; 
onEnableSummaryChange: (enabled: boolean) => void; + onEnableVisionLlmChange: (enabled: boolean) => void; onConfigChange?: (config: Record) => void; onStartIndexing: () => void; onSkip: () => void; @@ -42,6 +45,7 @@ export const IndexingConfigurationView: FC = ({ periodicEnabled, frequencyMinutes, enableSummary, + enableVisionLlm, isStartingIndexing, isFromOAuth = false, onStartDateChange, @@ -49,6 +53,7 @@ export const IndexingConfigurationView: FC = ({ onPeriodicEnabledChange, onFrequencyChange, onEnableSummaryChange, + onEnableVisionLlmChange, onConfigChange, onStartIndexing, onSkip, @@ -158,6 +163,14 @@ export const IndexingConfigurationView: FC = ({ {/* AI Summary toggle */} + {/* Vision LLM toggle - only for file-based connectors */} + {(config.connectorType === "GOOGLE_DRIVE_CONNECTOR" || + config.connectorType === "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" || + config.connectorType === "DROPBOX_CONNECTOR" || + config.connectorType === "ONEDRIVE_CONNECTOR") && ( + + )} + {/* Date range selector - not shown for file-based connectors (Drive, Dropbox, OneDrive), Webcrawler, GitHub, or Local Folder */} {config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" && config.connectorType !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" && diff --git a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts index 6543bbd72..7331549b5 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts +++ b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts @@ -80,6 +80,7 @@ export const useConnectorDialog = () => { const [periodicEnabled, setPeriodicEnabled] = useState(false); const [frequencyMinutes, setFrequencyMinutes] = useState("1440"); const [enableSummary, setEnableSummary] = useState(false); + const [enableVisionLlm, setEnableVisionLlm] = useState(false); // Edit mode state const [editingConnector, 
setEditingConnector] = useState(null); @@ -621,6 +622,7 @@ export const useConnectorDialog = () => { setPeriodicEnabled(false); setFrequencyMinutes("1440"); setEnableSummary(connector.enable_summary ?? false); + setEnableVisionLlm(connector.enable_vision_llm ?? false); setStartDate(undefined); setEndDate(undefined); @@ -763,12 +765,13 @@ export const useConnectorDialog = () => { const endDateStr = endDate ? format(endDate, "yyyy-MM-dd") : undefined; // Update connector with summary, periodic sync settings, and config changes - if (enableSummary || periodicEnabled || indexingConnectorConfig) { - const frequency = periodicEnabled ? parseInt(frequencyMinutes, 10) : undefined; + if (enableSummary || enableVisionLlm || periodicEnabled || indexingConnectorConfig) { + const frequency = periodicEnabled ? parseInt(frequencyMinutes, 10) : undefined; await updateConnector({ id: indexingConfig.connectorId, data: { enable_summary: enableSummary, + enable_vision_llm: enableVisionLlm, ...(periodicEnabled && { periodic_indexing_enabled: true, indexing_frequency_minutes: frequency, @@ -896,6 +899,7 @@ export const useConnectorDialog = () => { periodicEnabled, frequencyMinutes, enableSummary, + enableVisionLlm, indexingConnectorConfig, setIsOpen, ] @@ -960,6 +964,7 @@ export const useConnectorDialog = () => { setPeriodicEnabled(!connector.is_indexable ? false : connector.periodic_indexing_enabled); setFrequencyMinutes(connector.indexing_frequency_minutes?.toString() || "1440"); setEnableSummary(connector.enable_summary ?? false); + setEnableVisionLlm(connector.enable_vision_llm ?? false); setStartDate(undefined); setEndDate(undefined); }, @@ -1038,6 +1043,7 @@ export const useConnectorDialog = () => { data: { name: connectorName || editingConnector.name, enable_summary: enableSummary, + enable_vision_llm: enableVisionLlm, periodic_indexing_enabled: !editingConnector.is_indexable ? false : periodicEnabled, indexing_frequency_minutes: !editingConnector.is_indexable ? 
null : frequency, config: connectorConfig || editingConnector.config, @@ -1172,6 +1178,7 @@ export const useConnectorDialog = () => { periodicEnabled, frequencyMinutes, enableSummary, + enableVisionLlm, getFrequencyLabel, connectorConfig, connectorName, @@ -1332,6 +1339,7 @@ export const useConnectorDialog = () => { setPeriodicEnabled(false); setFrequencyMinutes("1440"); setEnableSummary(false); + setEnableVisionLlm(false); } } }, @@ -1368,6 +1376,7 @@ export const useConnectorDialog = () => { periodicEnabled, frequencyMinutes, enableSummary, + enableVisionLlm, searchSpaceId, allConnectors, viewingAccountsType, @@ -1382,6 +1391,7 @@ export const useConnectorDialog = () => { setPeriodicEnabled, setFrequencyMinutes, setEnableSummary, + setEnableVisionLlm, setConnectorName, // Handlers diff --git a/surfsense_web/components/sources/DocumentUploadTab.tsx b/surfsense_web/components/sources/DocumentUploadTab.tsx index 117d376ec..e7f4451b8 100644 --- a/surfsense_web/components/sources/DocumentUploadTab.tsx +++ b/surfsense_web/components/sources/DocumentUploadTab.tsx @@ -136,6 +136,7 @@ export function DocumentUploadTab({ const [uploadProgress, setUploadProgress] = useState(0); const [accordionValue, setAccordionValue] = useState(""); const [shouldSummarize, setShouldSummarize] = useState(false); + const [useVisionLlm, setUseVisionLlm] = useState(false); const [uploadDocumentMutation] = useAtom(uploadDocumentMutationAtom); const { mutate: uploadDocuments, isPending: isUploading } = uploadDocumentMutation; const fileInputRef = useRef(null); @@ -361,6 +362,7 @@ export function DocumentUploadTab({ relative_paths: batch.map((e) => e.relativePath), root_folder_id: rootFolderId, enable_summary: shouldSummarize, + use_vision_llm: useVisionLlm, } ); @@ -407,6 +409,7 @@ export function DocumentUploadTab({ files: rawFiles, search_space_id: Number(searchSpaceId), should_summarize: shouldSummarize, + use_vision_llm: useVisionLlm, }, { onSuccess: () => { @@ -696,6 +699,16 @@ export 
function DocumentUploadTab({ +
+
+

Enable Vision LLM

+

+ Describes images using AI vision (costly, slower) +

+
+ +
+