From 57ae2bd5afe085274b5fd6993b9190b992fc7a2e Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 11:18:56 +0200 Subject: [PATCH 01/29] feat: preserve folder structure on web folder upload --- .../components/sources/DocumentUploadTab.tsx | 312 +++++++++++++++--- 1 file changed, 270 insertions(+), 42 deletions(-) diff --git a/surfsense_web/components/sources/DocumentUploadTab.tsx b/surfsense_web/components/sources/DocumentUploadTab.tsx index 124354a49..5fc8e3fd3 100644 --- a/surfsense_web/components/sources/DocumentUploadTab.tsx +++ b/surfsense_web/components/sources/DocumentUploadTab.tsx @@ -26,6 +26,7 @@ import { Progress } from "@/components/ui/progress"; import { Spinner } from "@/components/ui/spinner"; import { Switch } from "@/components/ui/switch"; import { useElectronAPI } from "@/hooks/use-platform"; +import { documentsApiService } from "@/lib/apis/documents-api.service"; import { trackDocumentUploadFailure, trackDocumentUploadStarted, @@ -48,6 +49,77 @@ interface FileWithId { file: File; } +interface FolderEntry { + id: string; + file: File; + relativePath: string; +} + +interface FolderUploadData { + folderName: string; + entries: FolderEntry[]; +} + +interface FolderTreeNode { + name: string; + isFolder: boolean; + size?: number; + children: FolderTreeNode[]; +} + +function buildFolderTree(entries: FolderEntry[]): FolderTreeNode[] { + const root: FolderTreeNode = { name: "", isFolder: true, children: [] }; + + for (const entry of entries) { + const parts = entry.relativePath.split("/"); + let current = root; + + for (let i = 0; i < parts.length - 1; i++) { + let child = current.children.find((c) => c.name === parts[i] && c.isFolder); + if (!child) { + child = { name: parts[i], isFolder: true, children: [] }; + current.children.push(child); + } + current = child; + } + + current.children.push({ + name: parts[parts.length - 1], + isFolder: false, + size: entry.file.size, + children: [], + }); + } + + function sortNodes(node: FolderTreeNode) { + 
node.children.sort((a, b) => { + if (a.isFolder !== b.isFolder) return a.isFolder ? -1 : 1; + return a.name.localeCompare(b.name); + }); + for (const child of node.children) sortNodes(child); + } + sortNodes(root); + + return root.children; +} + +function flattenTree( + nodes: FolderTreeNode[], + depth = 0 +): { name: string; isFolder: boolean; depth: number; size?: number }[] { + const items: { name: string; isFolder: boolean; depth: number; size?: number }[] = []; + for (const node of nodes) { + items.push({ name: node.name, isFolder: node.isFolder, depth, size: node.size }); + if (node.isFolder && node.children.length > 0) { + items.push(...flattenTree(node.children, depth + 1)); + } + } + return items; +} + +const FOLDER_BATCH_SIZE_BYTES = 20 * 1024 * 1024; +const FOLDER_BATCH_MAX_FILES = 10; + const MAX_FILE_SIZE_MB = 500; const MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024; @@ -69,6 +141,8 @@ export function DocumentUploadTab({ const fileInputRef = useRef(null); const folderInputRef = useRef(null); const progressIntervalRef = useRef | null>(null); + const [folderUpload, setFolderUpload] = useState(null); + const [isFolderUploading, setIsFolderUploading] = useState(false); useEffect(() => { return () => { @@ -105,6 +179,7 @@ export function DocumentUploadTab({ const valid = incoming.filter((f) => f.size <= MAX_FILE_SIZE_BYTES); if (valid.length === 0) return; + setFolderUpload(null); setFiles((prev) => { const newEntries = valid.map((f) => ({ id: crypto.randomUUID?.() ?? 
`file-${Date.now()}-${Math.random().toString(36)}`, @@ -159,6 +234,7 @@ export function DocumentUploadTab({ file: new File([fd.data], fd.name, { type: fd.mimeType }), }) ); + setFolderUpload(null); setFiles((prev) => [...prev, ...newFiles]); }, [electronAPI, supportedExtensionsSet, t]); @@ -167,18 +243,35 @@ export function DocumentUploadTab({ const fileList = e.target.files; if (!fileList || fileList.length === 0) return; - const folderFiles = Array.from(fileList).filter((f) => { - const ext = f.name.includes(".") ? `.${f.name.split(".").pop()?.toLowerCase()}` : ""; - return ext !== "" && supportedExtensionsSet.has(ext); - }); + const allFiles = Array.from(fileList); + const firstPath = allFiles[0]?.webkitRelativePath || ""; + const folderName = firstPath.split("/")[0]; - if (folderFiles.length === 0) { + if (!folderName) { + addFiles(allFiles); + e.target.value = ""; + return; + } + + const entries: FolderEntry[] = allFiles + .filter((f) => { + const ext = f.name.includes(".") ? `.${f.name.split(".").pop()?.toLowerCase()}` : ""; + return ext !== "" && supportedExtensionsSet.has(ext); + }) + .map((f) => ({ + id: crypto.randomUUID?.() ?? `file-${Date.now()}-${Math.random().toString(36)}`, + file: f, + relativePath: f.webkitRelativePath.substring(folderName.length + 1), + })); + + if (entries.length === 0) { toast.error(t("no_supported_files_in_folder")); e.target.value = ""; return; } - addFiles(folderFiles); + setFiles([]); + setFolderUpload({ folderName, entries }); e.target.value = ""; }, [addFiles, supportedExtensionsSet, t] @@ -192,9 +285,18 @@ export function DocumentUploadTab({ return `${parseFloat((bytes / k ** i).toFixed(2))} ${sizes[i]}`; }; - const totalFileSize = files.reduce((total, entry) => total + entry.file.size, 0); + const totalFileSize = folderUpload + ? 
folderUpload.entries.reduce((total, entry) => total + entry.file.size, 0) + : files.reduce((total, entry) => total + entry.file.size, 0); - const hasContent = files.length > 0; + const fileCount = folderUpload ? folderUpload.entries.length : files.length; + const hasContent = files.length > 0 || folderUpload !== null; + const isAnyUploading = isUploading || isFolderUploading; + + const folderTreeItems = useMemo(() => { + if (!folderUpload) return []; + return flattenTree(buildFolderTree(folderUpload.entries)); + }, [folderUpload]); const handleAccordionChange = useCallback( (value: string) => { @@ -204,7 +306,94 @@ export function DocumentUploadTab({ [onAccordionStateChange] ); + const handleFolderUpload = async () => { + if (!folderUpload) return; + + setUploadProgress(0); + setIsFolderUploading(true); + const total = folderUpload.entries.length; + trackDocumentUploadStarted(Number(searchSpaceId), total, totalFileSize); + + try { + const batches: FolderEntry[][] = []; + let currentBatch: FolderEntry[] = []; + let currentSize = 0; + + for (const entry of folderUpload.entries) { + const size = entry.file.size; + + if (size >= FOLDER_BATCH_SIZE_BYTES) { + if (currentBatch.length > 0) { + batches.push(currentBatch); + currentBatch = []; + currentSize = 0; + } + batches.push([entry]); + continue; + } + + if ( + currentBatch.length >= FOLDER_BATCH_MAX_FILES || + currentSize + size > FOLDER_BATCH_SIZE_BYTES + ) { + batches.push(currentBatch); + currentBatch = []; + currentSize = 0; + } + + currentBatch.push(entry); + currentSize += size; + } + + if (currentBatch.length > 0) { + batches.push(currentBatch); + } + + let rootFolderId: number | null = null; + let uploaded = 0; + + for (const batch of batches) { + const result = await documentsApiService.folderUploadFiles( + batch.map((e) => e.file), + { + folder_name: folderUpload.folderName, + search_space_id: Number(searchSpaceId), + relative_paths: batch.map((e) => e.relativePath), + root_folder_id: rootFolderId, + 
enable_summary: shouldSummarize, + } + ); + + if (result.root_folder_id && !rootFolderId) { + rootFolderId = result.root_folder_id; + } + + uploaded += batch.length; + setUploadProgress(Math.round((uploaded / total) * 100)); + } + + trackDocumentUploadSuccess(Number(searchSpaceId), total); + toast(t("upload_initiated"), { description: t("upload_initiated_desc") }); + setFolderUpload(null); + onSuccess?.(); + } catch (error) { + const message = error instanceof Error ? error.message : "Upload failed"; + trackDocumentUploadFailure(Number(searchSpaceId), message); + toast(t("upload_error"), { + description: `${t("upload_error_desc")}: ${message}`, + }); + } finally { + setIsFolderUploading(false); + setUploadProgress(0); + } + }; + const handleUpload = async () => { + if (folderUpload) { + await handleFolderUpload(); + return; + } + setUploadProgress(0); trackDocumentUploadStarted(Number(searchSpaceId), files.length, totalFileSize); @@ -398,55 +587,92 @@ export function DocumentUploadTab({ {/* FILES SELECTED */} - {files.length > 0 && ( + {hasContent && (

- {t("selected_files", { count: files.length })} - - {formatFileSize(totalFileSize)} + {folderUpload ? ( + <> + + {folderUpload.folderName} + + {folderUpload.entries.length}{" "} + {folderUpload.entries.length === 1 ? "file" : "files"} + + {formatFileSize(totalFileSize)} + + ) : ( + <> + {t("selected_files", { count: files.length })} + + {formatFileSize(totalFileSize)} + + )}

- {files.map((entry) => ( -
- - {entry.file.name.split(".").pop() || "?"} - - {entry.file.name} - - {formatFileSize(entry.file.size)} - - -
- ))} + {folderUpload + ? folderTreeItems.map((item, i) => ( +
+ {item.isFolder ? ( + + ) : ( + + )} + {item.name} + {!item.isFolder && item.size != null && ( + + {formatFileSize(item.size)} + + )} +
+ )) + : files.map((entry) => ( +
+ + {entry.file.name.split(".").pop() || "?"} + + {entry.file.name} + + {formatFileSize(entry.file.size)} + + +
+ ))}
- {isUploading && ( + {isAnyUploading && (
- {t("uploading_files")} + {folderUpload ? "Uploading folder…" : t("uploading_files")} {Math.round(uploadProgress)}%
@@ -466,16 +692,18 @@ export function DocumentUploadTab({ From b1fa1279b19886e2a012195fe9195b12c6745cc4 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 12:09:34 +0200 Subject: [PATCH 02/29] feat: add export KB button in documents toolbar --- .../components/documents/DocumentsFilters.tsx | 22 ++++++++++++++++++- .../layout/ui/sidebar/DocumentsSidebar.tsx | 1 + 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/surfsense_web/components/documents/DocumentsFilters.tsx b/surfsense_web/components/documents/DocumentsFilters.tsx index a795b61c7..abd65637c 100644 --- a/surfsense_web/components/documents/DocumentsFilters.tsx +++ b/surfsense_web/components/documents/DocumentsFilters.tsx @@ -1,6 +1,6 @@ "use client"; -import { FolderPlus, ListFilter, Search, Upload, X } from "lucide-react"; +import { Download, FolderPlus, ListFilter, Search, Upload, X } from "lucide-react"; import { useTranslations } from "next-intl"; import React, { useCallback, useMemo, useRef, useState } from "react"; import { useDocumentUploadDialog } from "@/components/assistant-ui/document-upload-popup"; @@ -20,6 +20,7 @@ export function DocumentsFilters({ onToggleType, activeTypes, onCreateFolder, + onExportKB, }: { typeCounts: Partial>; onSearch: (v: string) => void; @@ -27,6 +28,7 @@ export function DocumentsFilters({ onToggleType: (type: DocumentTypeEnum, checked: boolean) => void; activeTypes: DocumentTypeEnum[]; onCreateFolder?: () => void; + onExportKB?: () => void; }) { const t = useTranslations("documents"); const id = React.useId(); @@ -84,6 +86,24 @@ export function DocumentsFilters({ )} + {onExportKB && ( + + + { + e.preventDefault(); + onExportKB(); + }} + > + + + + Export knowledge base + + )} + diff --git a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx index 8b3a119ae..db80c8d8d 100644 --- a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx +++ 
b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx @@ -800,6 +800,7 @@ export function DocumentsSidebar({ onToggleType={onToggleType} activeTypes={activeTypes} onCreateFolder={() => handleCreateFolder(null)} + onExportKB={() => toast("Export KB clicked (placeholder)")} />
From 473eece89680bc740fcafc1c9d491402a5f5cfb9 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 12:10:37 +0200 Subject: [PATCH 03/29] feat: add export route skeleton --- surfsense_backend/app/routes/__init__.py | 2 + surfsense_backend/app/routes/export_routes.py | 38 +++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 surfsense_backend/app/routes/export_routes.py diff --git a/surfsense_backend/app/routes/__init__.py b/surfsense_backend/app/routes/__init__.py index 02367606b..443b8cc93 100644 --- a/surfsense_backend/app/routes/__init__.py +++ b/surfsense_backend/app/routes/__init__.py @@ -13,6 +13,7 @@ from .discord_add_connector_route import router as discord_add_connector_router from .documents_routes import router as documents_router from .dropbox_add_connector_route import router as dropbox_add_connector_router from .editor_routes import router as editor_router +from .export_routes import router as export_router from .folders_routes import router as folders_router from .google_calendar_add_connector_route import ( router as google_calendar_add_connector_router, @@ -57,6 +58,7 @@ router = APIRouter() router.include_router(search_spaces_router) router.include_router(rbac_router) # RBAC routes for roles, members, invites router.include_router(editor_router) +router.include_router(export_router) router.include_router(documents_router) router.include_router(folders_router) router.include_router(notes_router) diff --git a/surfsense_backend/app/routes/export_routes.py b/surfsense_backend/app/routes/export_routes.py new file mode 100644 index 000000000..0bc5b4d1c --- /dev/null +++ b/surfsense_backend/app/routes/export_routes.py @@ -0,0 +1,38 @@ +"""Routes for exporting knowledge base content as ZIP.""" + +import logging + +from fastapi import APIRouter, Depends, Query +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db import Permission, User, get_async_session +from app.users import current_active_user +from app.utils.rbac 
import check_permission + +logger = logging.getLogger(__name__) + +router = APIRouter() + + +@router.get("/search-spaces/{search_space_id}/export") +async def export_knowledge_base( + search_space_id: int, + folder_id: int | None = Query(None, description="Export only this folder's subtree"), + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + """Export documents as a ZIP of markdown files preserving folder structure. + + If folder_id is provided, only that folder's subtree is exported. + Otherwise, the entire search space is exported. + """ + await check_permission( + session, + user, + search_space_id, + Permission.DOCUMENTS_READ.value, + "You don't have permission to export documents in this search space", + ) + + # TODO: implement export logic + return {"message": "Export endpoint placeholder"} From 47f1d7e37359d437c12e55843211888dfb6c2327 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 12:17:43 +0200 Subject: [PATCH 04/29] feat: implement KB export as ZIP with service layer --- surfsense_backend/app/routes/export_routes.py | 39 +++- .../app/services/export_service.py | 168 ++++++++++++++++++ 2 files changed, 199 insertions(+), 8 deletions(-) create mode 100644 surfsense_backend/app/services/export_service.py diff --git a/surfsense_backend/app/routes/export_routes.py b/surfsense_backend/app/routes/export_routes.py index 0bc5b4d1c..641c7fedb 100644 --- a/surfsense_backend/app/routes/export_routes.py +++ b/surfsense_backend/app/routes/export_routes.py @@ -1,11 +1,14 @@ """Routes for exporting knowledge base content as ZIP.""" import logging +import os -from fastapi import APIRouter, Depends, Query +from fastapi import APIRouter, Depends, HTTPException, Query +from fastapi.responses import StreamingResponse from sqlalchemy.ext.asyncio import AsyncSession from app.db import Permission, User, get_async_session +from app.services.export_service import build_export_zip from app.users import 
current_active_user from app.utils.rbac import check_permission @@ -21,11 +24,7 @@ async def export_knowledge_base( session: AsyncSession = Depends(get_async_session), user: User = Depends(current_active_user), ): - """Export documents as a ZIP of markdown files preserving folder structure. - - If folder_id is provided, only that folder's subtree is exported. - Otherwise, the entire search space is exported. - """ + """Export documents as a ZIP of markdown files preserving folder structure.""" await check_permission( session, user, @@ -34,5 +33,29 @@ async def export_knowledge_base( "You don't have permission to export documents in this search space", ) - # TODO: implement export logic - return {"message": "Export endpoint placeholder"} + try: + result = await build_export_zip(session, search_space_id, folder_id) + except ValueError as e: + raise HTTPException(status_code=404, detail=str(e)) from None + + def stream_and_cleanup(): + try: + with open(result.zip_path, "rb") as f: + while chunk := f.read(8192): + yield chunk + finally: + os.unlink(result.zip_path) + + headers = { + "Content-Disposition": f'attachment; filename="{result.export_name}.zip"', + "Content-Length": str(result.zip_size), + } + + if result.skipped_docs: + headers["X-Skipped-Documents"] = str(len(result.skipped_docs)) + + return StreamingResponse( + stream_and_cleanup(), + media_type="application/zip", + headers=headers, + ) diff --git a/surfsense_backend/app/services/export_service.py b/surfsense_backend/app/services/export_service.py new file mode 100644 index 000000000..49f1a127a --- /dev/null +++ b/surfsense_backend/app/services/export_service.py @@ -0,0 +1,168 @@ +"""Service for exporting knowledge base content as a ZIP archive.""" + +import logging +import os +import tempfile +import zipfile +from dataclasses import dataclass, field + +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.future import select + +from app.db import Chunk, Document, Folder +from 
app.services.folder_service import get_folder_subtree_ids + +logger = logging.getLogger(__name__) + + +def _sanitize_filename(title: str) -> str: + safe = "".join(c if c.isalnum() or c in " -_." else "_" for c in title).strip() + return safe[:80] or "document" + + +def _build_folder_path_map(folders: list[Folder]) -> dict[int, str]: + """Build a mapping of folder_id -> full path string (e.g. 'Research/AI').""" + id_to_folder = {f.id: f for f in folders} + cache: dict[int, str] = {} + + def resolve(folder_id: int) -> str: + if folder_id in cache: + return cache[folder_id] + folder = id_to_folder[folder_id] + if folder.parent_id is None or folder.parent_id not in id_to_folder: + cache[folder_id] = folder.name + else: + cache[folder_id] = f"{resolve(folder.parent_id)}/{folder.name}" + return cache[folder_id] + + for f in folders: + resolve(f.id) + + return cache + + +async def _get_document_markdown( + session: AsyncSession, document: Document +) -> str | None: + """Resolve markdown content using the 3-tier fallback: + 1. source_markdown 2. blocknote_document conversion 3. chunk concatenation + """ + if document.source_markdown is not None: + return document.source_markdown + + if document.blocknote_document: + from app.utils.blocknote_to_markdown import blocknote_to_markdown + + md = blocknote_to_markdown(document.blocknote_document) + if md: + return md + + chunk_result = await session.execute( + select(Chunk.content) + .filter(Chunk.document_id == document.id) + .order_by(Chunk.id) + ) + chunks = chunk_result.scalars().all() + if chunks: + return "\n\n".join(chunks) + + return None + + +@dataclass +class ExportResult: + zip_path: str + export_name: str + zip_size: int + skipped_docs: list[str] = field(default_factory=list) + + +async def build_export_zip( + session: AsyncSession, + search_space_id: int, + folder_id: int | None = None, +) -> ExportResult: + """Build a ZIP archive of markdown documents preserving folder structure. 
+ + Returns an ExportResult with the path to the temp ZIP file. + The caller is responsible for streaming and cleaning up the file. + + Raises ValueError if folder_id is provided but not found. + """ + if folder_id is not None: + folder = await session.get(Folder, folder_id) + if not folder or folder.search_space_id != search_space_id: + raise ValueError("Folder not found") + target_folder_ids = set(await get_folder_subtree_ids(session, folder_id)) + else: + target_folder_ids = None + + folder_query = select(Folder).where(Folder.search_space_id == search_space_id) + if target_folder_ids is not None: + folder_query = folder_query.where(Folder.id.in_(target_folder_ids)) + folder_result = await session.execute(folder_query) + folders = list(folder_result.scalars().all()) + + folder_path_map = _build_folder_path_map(folders) + + doc_query = select(Document).where(Document.search_space_id == search_space_id) + if target_folder_ids is not None: + doc_query = doc_query.where(Document.folder_id.in_(target_folder_ids)) + doc_result = await session.execute(doc_query) + documents = list(doc_result.scalars().all()) + + fd, tmp_path = tempfile.mkstemp(suffix=".zip") + os.close(fd) + + try: + used_paths: dict[str, int] = {} + skipped_docs: list[str] = [] + + with zipfile.ZipFile(tmp_path, "w", zipfile.ZIP_DEFLATED) as zf: + for doc in documents: + status = doc.status or {} + state = status.get("state", "ready") if isinstance(status, dict) else "ready" + if state in ("pending", "processing"): + skipped_docs.append(doc.title or "Untitled") + continue + + markdown = await _get_document_markdown(session, doc) + if not markdown or not markdown.strip(): + continue + + if doc.folder_id and doc.folder_id in folder_path_map: + dir_path = folder_path_map[doc.folder_id] + else: + dir_path = "" + + base_name = _sanitize_filename(doc.title or "Untitled") + file_path = f"{dir_path}/{base_name}.md" if dir_path else f"{base_name}.md" + + if file_path in used_paths: + used_paths[file_path] += 1 
+ suffix = used_paths[file_path] + file_path = ( + f"{dir_path}/{base_name}_{suffix}.md" + if dir_path + else f"{base_name}_{suffix}.md" + ) + else: + used_paths[file_path] = 1 + + zf.writestr(file_path, markdown) + + export_name = "knowledge-base" + if folder_id is not None and folder_id in folder_path_map: + export_name = _sanitize_filename(folder_path_map[folder_id].split("/")[0]) + + return ExportResult( + zip_path=tmp_path, + export_name=export_name, + zip_size=os.path.getsize(tmp_path), + skipped_docs=skipped_docs, + ) + + except Exception: + if os.path.exists(tmp_path): + os.unlink(tmp_path) + raise From c38239a9953375b867da70dc01f3dbd7676edd6b Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 12:19:04 +0200 Subject: [PATCH 05/29] feat: wire KB export button in sidebar --- .../layout/ui/sidebar/DocumentsSidebar.tsx | 41 ++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx index db80c8d8d..0f925af33 100644 --- a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx +++ b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx @@ -406,6 +406,45 @@ export function DocumentsSidebar({ setFolderPickerOpen(true); }, []); + const [isExportingKB, setIsExportingKB] = useState(false); + + const handleExportKB = useCallback(async () => { + if (isExportingKB) return; + setIsExportingKB(true); + try { + const response = await authenticatedFetch( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/export`, + { method: "GET" } + ); + if (!response.ok) { + const errorData = await response.json().catch(() => ({ detail: "Export failed" })); + throw new Error(errorData.detail || "Export failed"); + } + + const skipped = response.headers.get("X-Skipped-Documents"); + if (skipped && Number(skipped) > 0) { + toast.warning(`${skipped} document(s) were skipped 
(still processing)`); + } + + const blob = await response.blob(); + const url = URL.createObjectURL(blob); + const a = document.createElement("a"); + a.href = url; + a.download = "knowledge-base.zip"; + document.body.appendChild(a); + a.click(); + document.body.removeChild(a); + URL.revokeObjectURL(url); + + toast.success("Knowledge base exported"); + } catch (err) { + console.error("KB export failed:", err); + toast.error(err instanceof Error ? err.message : "Export failed"); + } finally { + setIsExportingKB(false); + } + }, [searchSpaceId, isExportingKB]); + const handleExportDocument = useCallback( async (doc: DocumentNodeDoc, format: string) => { const safeTitle = @@ -800,7 +839,7 @@ export function DocumentsSidebar({ onToggleType={onToggleType} activeTypes={activeTypes} onCreateFolder={() => handleCreateFolder(null)} - onExportKB={() => toast("Export KB clicked (placeholder)")} + onExportKB={handleExportKB} />
From 89f210bf7e99820319aa24053bbc142638f10a96 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 12:20:49 +0200 Subject: [PATCH 06/29] feat: add folder-level export to context menu --- .../components/documents/FolderNode.tsx | 20 +++++++++ .../components/documents/FolderTreeView.tsx | 3 ++ .../layout/ui/sidebar/DocumentsSidebar.tsx | 42 +++++++++++++++++++ 3 files changed, 65 insertions(+) diff --git a/surfsense_web/components/documents/FolderNode.tsx b/surfsense_web/components/documents/FolderNode.tsx index 7f75f8abf..a1b437983 100644 --- a/surfsense_web/components/documents/FolderNode.tsx +++ b/surfsense_web/components/documents/FolderNode.tsx @@ -4,6 +4,7 @@ import { AlertCircle, ChevronDown, ChevronRight, + Download, Eye, EyeOff, Folder, @@ -80,6 +81,7 @@ interface FolderNodeProps { isWatched?: boolean; onRescan?: (folder: FolderDisplay) => void | Promise; onStopWatching?: (folder: FolderDisplay) => void; + onExportFolder?: (folder: FolderDisplay) => void; } function getDropZone( @@ -120,6 +122,7 @@ export const FolderNode = React.memo(function FolderNode({ isWatched, onRescan, onStopWatching, + onExportFolder, }: FolderNodeProps) { const [renameValue, setRenameValue] = useState(folder.name); const inputRef = useRef(null); @@ -408,6 +411,17 @@ export const FolderNode = React.memo(function FolderNode({ Move to... + {onExportFolder && ( + { + e.stopPropagation(); + onExportFolder(folder); + }} + > + + Export folder + + )} { e.stopPropagation(); @@ -449,6 +463,12 @@ export const FolderNode = React.memo(function FolderNode({ Move to... 
+ {onExportFolder && ( + onExportFolder(folder)}> + + Export folder + + )} onDelete(folder)}> Delete diff --git a/surfsense_web/components/documents/FolderTreeView.tsx b/surfsense_web/components/documents/FolderTreeView.tsx index 6eb53da50..4988e87e7 100644 --- a/surfsense_web/components/documents/FolderTreeView.tsx +++ b/surfsense_web/components/documents/FolderTreeView.tsx @@ -44,6 +44,7 @@ interface FolderTreeViewProps { watchedFolderIds?: Set; onRescanFolder?: (folder: FolderDisplay) => void; onStopWatchingFolder?: (folder: FolderDisplay) => void; + onExportFolder?: (folder: FolderDisplay) => void; } function groupBy(items: T[], keyFn: (item: T) => string | number): Record { @@ -81,6 +82,7 @@ export function FolderTreeView({ watchedFolderIds, onRescanFolder, onStopWatchingFolder, + onExportFolder, }: FolderTreeViewProps) { const foldersByParent = useMemo(() => groupBy(folders, (f) => f.parentId ?? "root"), [folders]); @@ -259,6 +261,7 @@ export function FolderTreeView({ isWatched={watchedFolderIds?.has(f.id)} onRescan={onRescanFolder} onStopWatching={onStopWatchingFolder} + onExportFolder={onExportFolder} /> ); diff --git a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx index 0f925af33..853aea641 100644 --- a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx +++ b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx @@ -445,6 +445,47 @@ export function DocumentsSidebar({ } }, [searchSpaceId, isExportingKB]); + const handleExportFolder = useCallback( + async (folder: FolderDisplay) => { + try { + const response = await authenticatedFetch( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/export?folder_id=${folder.id}`, + { method: "GET" } + ); + if (!response.ok) { + const errorData = await response.json().catch(() => ({ detail: "Export failed" })); + throw new Error(errorData.detail || "Export failed"); + } + + const 
skipped = response.headers.get("X-Skipped-Documents"); + if (skipped && Number(skipped) > 0) { + toast.warning(`${skipped} document(s) were skipped (still processing)`); + } + + const blob = await response.blob(); + const safeName = + folder.name + .replace(/[^a-zA-Z0-9 _-]/g, "_") + .trim() + .slice(0, 80) || "folder"; + const url = URL.createObjectURL(blob); + const a = document.createElement("a"); + a.href = url; + a.download = `${safeName}.zip`; + document.body.appendChild(a); + a.click(); + document.body.removeChild(a); + URL.revokeObjectURL(url); + + toast.success(`Folder "${folder.name}" exported`); + } catch (err) { + console.error("Folder export failed:", err); + toast.error(err instanceof Error ? err.message : "Export failed"); + } + }, + [searchSpaceId] + ); + const handleExportDocument = useCallback( async (doc: DocumentNodeDoc, format: string) => { const safeTitle = @@ -895,6 +936,7 @@ export function DocumentsSidebar({ watchedFolderIds={watchedFolderIds} onRescanFolder={handleRescanFolder} onStopWatchingFolder={handleStopWatching} + onExportFolder={handleExportFolder} /> From e7107b751dd562eb64dc0f5535219210e662f4ff Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 13:01:23 +0200 Subject: [PATCH 07/29] fix: strip folder prefix from filename in folder upload --- surfsense_backend/app/routes/documents_routes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py index 53312c647..8093084f0 100644 --- a/surfsense_backend/app/routes/documents_routes.py +++ b/surfsense_backend/app/routes/documents_routes.py @@ -1565,7 +1565,8 @@ async def folder_upload( async def _read_and_save(file: UploadFile, idx: int) -> dict: content = await file.read() - filename = file.filename or rel_paths[idx].split("/")[-1] + raw_name = file.filename or rel_paths[idx] + filename = raw_name.split("/")[-1] def _write_temp() -> str: with 
tempfile.NamedTemporaryFile( From 7a7792fc799756b3555aaa943c5462b1629c3fe0 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 13:11:43 +0200 Subject: [PATCH 08/29] feat: warn before export when documents are processing --- .../layout/ui/sidebar/DocumentsSidebar.tsx | 171 +++++++++++++----- 1 file changed, 123 insertions(+), 48 deletions(-) diff --git a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx index 853aea641..ef25d3056 100644 --- a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx +++ b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx @@ -407,35 +407,54 @@ export function DocumentsSidebar({ }, []); const [isExportingKB, setIsExportingKB] = useState(false); + const [exportWarningOpen, setExportWarningOpen] = useState(false); + const [exportWarningContext, setExportWarningContext] = useState<{ + type: "kb" | "folder"; + folder?: FolderDisplay; + pendingCount: number; + } | null>(null); + + const pendingDocuments = useMemo( + () => + treeDocuments.filter( + (d) => d.status?.state === "pending" || d.status?.state === "processing" + ), + [treeDocuments] + ); + + const doExport = useCallback(async (url: string, downloadName: string) => { + const response = await authenticatedFetch(url, { method: "GET" }); + if (!response.ok) { + const errorData = await response.json().catch(() => ({ detail: "Export failed" })); + throw new Error(errorData.detail || "Export failed"); + } + + const blob = await response.blob(); + const blobUrl = URL.createObjectURL(blob); + const a = document.createElement("a"); + a.href = blobUrl; + a.download = downloadName; + document.body.appendChild(a); + a.click(); + document.body.removeChild(a); + URL.revokeObjectURL(blobUrl); + }, []); const handleExportKB = useCallback(async () => { if (isExportingKB) return; + + if (pendingDocuments.length > 0) { + setExportWarningContext({ type: "kb", pendingCount: 
pendingDocuments.length }); + setExportWarningOpen(true); + return; + } + setIsExportingKB(true); try { - const response = await authenticatedFetch( + await doExport( `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/export`, - { method: "GET" } + "knowledge-base.zip" ); - if (!response.ok) { - const errorData = await response.json().catch(() => ({ detail: "Export failed" })); - throw new Error(errorData.detail || "Export failed"); - } - - const skipped = response.headers.get("X-Skipped-Documents"); - if (skipped && Number(skipped) > 0) { - toast.warning(`${skipped} document(s) were skipped (still processing)`); - } - - const blob = await response.blob(); - const url = URL.createObjectURL(blob); - const a = document.createElement("a"); - a.href = url; - a.download = "knowledge-base.zip"; - document.body.appendChild(a); - a.click(); - document.body.removeChild(a); - URL.revokeObjectURL(url); - toast.success("Knowledge base exported"); } catch (err) { console.error("KB export failed:", err); @@ -443,47 +462,76 @@ export function DocumentsSidebar({ } finally { setIsExportingKB(false); } - }, [searchSpaceId, isExportingKB]); + }, [searchSpaceId, isExportingKB, pendingDocuments.length, doExport]); + + const handleExportWarningConfirm = useCallback(async () => { + setExportWarningOpen(false); + const ctx = exportWarningContext; + if (!ctx) return; + + if (ctx.type === "kb") { + setIsExportingKB(true); + try { + await doExport( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/export`, + "knowledge-base.zip" + ); + toast.success("Knowledge base exported"); + } catch (err) { + console.error("KB export failed:", err); + toast.error(err instanceof Error ? 
err.message : "Export failed"); + } finally { + setIsExportingKB(false); + } + } else if (ctx.type === "folder" && ctx.folder) { + try { + const safeName = + ctx.folder.name + .replace(/[^a-zA-Z0-9 _-]/g, "_") + .trim() + .slice(0, 80) || "folder"; + await doExport( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/export?folder_id=${ctx.folder.id}`, + `${safeName}.zip` + ); + toast.success(`Folder "${ctx.folder.name}" exported`); + } catch (err) { + console.error("Folder export failed:", err); + toast.error(err instanceof Error ? err.message : "Export failed"); + } + } + setExportWarningContext(null); + }, [exportWarningContext, searchSpaceId, doExport]); const handleExportFolder = useCallback( async (folder: FolderDisplay) => { + if (pendingDocuments.length > 0) { + setExportWarningContext({ + type: "folder", + folder, + pendingCount: pendingDocuments.length, + }); + setExportWarningOpen(true); + return; + } + try { - const response = await authenticatedFetch( - `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/export?folder_id=${folder.id}`, - { method: "GET" } - ); - if (!response.ok) { - const errorData = await response.json().catch(() => ({ detail: "Export failed" })); - throw new Error(errorData.detail || "Export failed"); - } - - const skipped = response.headers.get("X-Skipped-Documents"); - if (skipped && Number(skipped) > 0) { - toast.warning(`${skipped} document(s) were skipped (still processing)`); - } - - const blob = await response.blob(); const safeName = folder.name .replace(/[^a-zA-Z0-9 _-]/g, "_") .trim() .slice(0, 80) || "folder"; - const url = URL.createObjectURL(blob); - const a = document.createElement("a"); - a.href = url; - a.download = `${safeName}.zip`; - document.body.appendChild(a); - a.click(); - document.body.removeChild(a); - URL.revokeObjectURL(url); - + await doExport( + 
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/export?folder_id=${folder.id}`, + `${safeName}.zip` + ); toast.success(`Folder "${folder.name}" exported`); } catch (err) { console.error("Folder export failed:", err); toast.error(err instanceof Error ? err.message : "Export failed"); } }, - [searchSpaceId] + [searchSpaceId, pendingDocuments.length, doExport] ); const handleExportDocument = useCallback( @@ -1015,6 +1063,33 @@ export function DocumentsSidebar({ + + { + if (!open) { + setExportWarningOpen(false); + setExportWarningContext(null); + } + }} + > + + + Some documents are still processing + + {exportWarningContext?.pendingCount} document + {exportWarningContext?.pendingCount !== 1 ? "s are" : " is"} currently being processed + and will be excluded from the export. Do you want to continue? + + + + Cancel + + Export anyway + + + + ); From b5f6e44fc3a0e34782f51d12b4a9e41b4270e878 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 13:39:36 +0200 Subject: [PATCH 09/29] security: sanitize folder names in ZIP export paths --- surfsense_backend/app/services/export_service.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/surfsense_backend/app/services/export_service.py b/surfsense_backend/app/services/export_service.py index 49f1a127a..0804e6042 100644 --- a/surfsense_backend/app/services/export_service.py +++ b/surfsense_backend/app/services/export_service.py @@ -29,10 +29,11 @@ def _build_folder_path_map(folders: list[Folder]) -> dict[int, str]: if folder_id in cache: return cache[folder_id] folder = id_to_folder[folder_id] + safe_name = _sanitize_filename(folder.name) if folder.parent_id is None or folder.parent_id not in id_to_folder: - cache[folder_id] = folder.name + cache[folder_id] = safe_name else: - cache[folder_id] = f"{resolve(folder.parent_id)}/{folder.name}" + cache[folder_id] = f"{resolve(folder.parent_id)}/{safe_name}" return cache[folder_id] for f in folders: From 
a81fff299ac57e924ee9d6b40fdc87858a3e2a42 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 13:40:43 +0200 Subject: [PATCH 10/29] fix: scope pending doc warning to folder subtree on folder export --- .../layout/ui/sidebar/DocumentsSidebar.tsx | 26 ++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx index ef25d3056..041f03ca7 100644 --- a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx +++ b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx @@ -503,13 +503,33 @@ export function DocumentsSidebar({ setExportWarningContext(null); }, [exportWarningContext, searchSpaceId, doExport]); + const getPendingCountInSubtree = useCallback( + (folderId: number): number => { + const subtreeIds = new Set(); + function collect(id: number) { + subtreeIds.add(id); + for (const child of foldersByParent[String(id)] ?? []) { + collect(child.id); + } + } + collect(folderId); + return treeDocuments.filter( + (d) => + subtreeIds.has(d.folderId ?? -1) && + (d.status?.state === "pending" || d.status?.state === "processing") + ).length; + }, + [foldersByParent, treeDocuments] + ); + const handleExportFolder = useCallback( async (folder: FolderDisplay) => { - if (pendingDocuments.length > 0) { + const folderPendingCount = getPendingCountInSubtree(folder.id); + if (folderPendingCount > 0) { setExportWarningContext({ type: "folder", folder, - pendingCount: pendingDocuments.length, + pendingCount: folderPendingCount, }); setExportWarningOpen(true); return; @@ -531,7 +551,7 @@ export function DocumentsSidebar({ toast.error(err instanceof Error ? 
err.message : "Export failed"); } }, - [searchSpaceId, pendingDocuments.length, doExport] + [searchSpaceId, getPendingCountInSubtree, doExport] ); const handleExportDocument = useCallback( From 7851db792810aebe597a0f1d588f66a98a278753 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 13:42:57 +0200 Subject: [PATCH 11/29] fix: add i18n keys for folder upload strings --- surfsense_web/components/sources/DocumentUploadTab.tsx | 6 +++--- surfsense_web/messages/en.json | 6 +++++- surfsense_web/messages/es.json | 6 +++++- surfsense_web/messages/hi.json | 6 +++++- surfsense_web/messages/pt.json | 6 +++++- surfsense_web/messages/zh.json | 6 +++++- 6 files changed, 28 insertions(+), 8 deletions(-) diff --git a/surfsense_web/components/sources/DocumentUploadTab.tsx b/surfsense_web/components/sources/DocumentUploadTab.tsx index 5fc8e3fd3..0f8ac298d 100644 --- a/surfsense_web/components/sources/DocumentUploadTab.tsx +++ b/surfsense_web/components/sources/DocumentUploadTab.tsx @@ -540,7 +540,7 @@ export function DocumentUploadTab({

- {isElectron ? "Select files or folder" : "Tap to select files or folder"} + {isElectron ? t("select_files_or_folder") : t("tap_select_files_or_folder")}

{t("file_size_limit")}

@@ -672,7 +672,7 @@ export function DocumentUploadTab({ {isAnyUploading && (
- {folderUpload ? "Uploading folder…" : t("uploading_files")} + {folderUpload ? t("uploading_folder") : t("uploading_files")} {Math.round(uploadProgress)}%
@@ -702,7 +702,7 @@ export function DocumentUploadTab({ ) : ( {folderUpload - ? `Upload Folder (${fileCount} files)` + ? t("upload_folder_button", { count: fileCount }) : t("upload_button", { count: fileCount })} )} diff --git a/surfsense_web/messages/en.json b/surfsense_web/messages/en.json index a3a4e8853..cef48663f 100644 --- a/surfsense_web/messages/en.json +++ b/surfsense_web/messages/en.json @@ -396,7 +396,11 @@ "supported_file_types": "Supported File Types", "file_too_large": "File Too Large", "file_too_large_desc": "\"{name}\" exceeds the {maxMB}MB per-file limit.", - "no_supported_files_in_folder": "No supported file types found in the selected folder." + "no_supported_files_in_folder": "No supported file types found in the selected folder.", + "uploading_folder": "Uploading folder…", + "upload_folder_button": "Upload Folder ({count} {count, plural, one {file} other {files}})", + "select_files_or_folder": "Select files or folder", + "tap_select_files_or_folder": "Tap to select files or folder" }, "add_webpage": { "title": "Add Webpages for Crawling", diff --git a/surfsense_web/messages/es.json b/surfsense_web/messages/es.json index fa620e271..88154d1fc 100644 --- a/surfsense_web/messages/es.json +++ b/surfsense_web/messages/es.json @@ -396,7 +396,11 @@ "supported_file_types": "Tipos de archivo soportados", "file_too_large": "Archivo demasiado grande", "file_too_large_desc": "\"{name}\" excede el límite de {maxMB} MB por archivo.", - "no_supported_files_in_folder": "No se encontraron tipos de archivo compatibles en la carpeta seleccionada." 
+ "no_supported_files_in_folder": "No se encontraron tipos de archivo compatibles en la carpeta seleccionada.", + "uploading_folder": "Subiendo carpeta…", + "upload_folder_button": "Subir carpeta ({count} {count, plural, one {archivo} other {archivos}})", + "select_files_or_folder": "Seleccionar archivos o carpeta", + "tap_select_files_or_folder": "Toca para seleccionar archivos o carpeta" }, "add_webpage": { "title": "Agregar páginas web para rastreo", diff --git a/surfsense_web/messages/hi.json b/surfsense_web/messages/hi.json index faeb4cb94..988894714 100644 --- a/surfsense_web/messages/hi.json +++ b/surfsense_web/messages/hi.json @@ -396,7 +396,11 @@ "supported_file_types": "समर्थित फ़ाइल प्रकार", "file_too_large": "फ़ाइल बहुत बड़ी है", "file_too_large_desc": "\"{name}\" प्रति फ़ाइल {maxMB}MB की सीमा से अधिक है।", - "no_supported_files_in_folder": "चयनित फ़ोल्डर में कोई समर्थित फ़ाइल प्रकार नहीं मिला।" + "no_supported_files_in_folder": "चयनित फ़ोल्डर में कोई समर्थित फ़ाइल प्रकार नहीं मिला।", + "uploading_folder": "फ़ोल्डर अपलोड हो रहा है…", + "upload_folder_button": "फ़ोल्डर अपलोड करें ({count} {count, plural, one {फ़ाइल} other {फ़ाइलें}})", + "select_files_or_folder": "फ़ाइलें या फ़ोल्डर चुनें", + "tap_select_files_or_folder": "फ़ाइलें या फ़ोल्डर चुनने के लिए टैप करें" }, "add_webpage": { "title": "क्रॉलिंग के लिए वेबपेज जोड़ें", diff --git a/surfsense_web/messages/pt.json b/surfsense_web/messages/pt.json index 0bed7c6cc..b34546da9 100644 --- a/surfsense_web/messages/pt.json +++ b/surfsense_web/messages/pt.json @@ -396,7 +396,11 @@ "supported_file_types": "Tipos de arquivo suportados", "file_too_large": "Arquivo muito grande", "file_too_large_desc": "\"{name}\" excede o limite de {maxMB} MB por arquivo.", - "no_supported_files_in_folder": "Nenhum tipo de arquivo suportado encontrado na pasta selecionada." 
+ "no_supported_files_in_folder": "Nenhum tipo de arquivo suportado encontrado na pasta selecionada.", + "uploading_folder": "Enviando pasta…", + "upload_folder_button": "Enviar pasta ({count} {count, plural, one {arquivo} other {arquivos}})", + "select_files_or_folder": "Selecionar arquivos ou pasta", + "tap_select_files_or_folder": "Toque para selecionar arquivos ou pasta" }, "add_webpage": { "title": "Adicionar páginas web para rastreamento", diff --git a/surfsense_web/messages/zh.json b/surfsense_web/messages/zh.json index 0d4f7e1c9..a42e59f6f 100644 --- a/surfsense_web/messages/zh.json +++ b/surfsense_web/messages/zh.json @@ -380,7 +380,11 @@ "supported_file_types": "支持的文件类型", "file_too_large": "文件过大", "file_too_large_desc": "\"{name}\" 超过了每个文件 {maxMB}MB 的限制。", - "no_supported_files_in_folder": "所选文件夹中没有找到支持的文件类型。" + "no_supported_files_in_folder": "所选文件夹中没有找到支持的文件类型。", + "uploading_folder": "正在上传文件夹…", + "upload_folder_button": "上传文件夹({count}个文件)", + "select_files_or_folder": "选择文件或文件夹", + "tap_select_files_or_folder": "点击选择文件或文件夹" }, "add_webpage": { "title": "添加网页爬取", From 1af5725bd1759416cbff20a91d9fd98bc775b017 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 13:51:32 +0200 Subject: [PATCH 12/29] fix: track dedup'd filename in used_paths to prevent collisions --- surfsense_backend/app/services/export_service.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/surfsense_backend/app/services/export_service.py b/surfsense_backend/app/services/export_service.py index 0804e6042..2d36bfaab 100644 --- a/surfsense_backend/app/services/export_service.py +++ b/surfsense_backend/app/services/export_service.py @@ -147,8 +147,7 @@ async def build_export_zip( if dir_path else f"{base_name}_{suffix}.md" ) - else: - used_paths[file_path] = 1 + used_paths[file_path] = used_paths.get(file_path, 0) + 1 zf.writestr(file_path, markdown) From 78fa2d926a94cefdf5bfe25163ba861cf2914074 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 
14:00:25 +0200 Subject: [PATCH 13/29] feat: show spinner on export button during export --- .../components/documents/DocumentsFilters.tsx | 15 ++++++++++++--- .../layout/ui/sidebar/DocumentsSidebar.tsx | 7 +++++++ 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/surfsense_web/components/documents/DocumentsFilters.tsx b/surfsense_web/components/documents/DocumentsFilters.tsx index abd65637c..703c9c3b4 100644 --- a/surfsense_web/components/documents/DocumentsFilters.tsx +++ b/surfsense_web/components/documents/DocumentsFilters.tsx @@ -1,6 +1,6 @@ "use client"; -import { Download, FolderPlus, ListFilter, Search, Upload, X } from "lucide-react"; +import { Download, FolderPlus, ListFilter, Loader2, Search, Upload, X } from "lucide-react"; import { useTranslations } from "next-intl"; import React, { useCallback, useMemo, useRef, useState } from "react"; import { useDocumentUploadDialog } from "@/components/assistant-ui/document-upload-popup"; @@ -21,6 +21,7 @@ export function DocumentsFilters({ activeTypes, onCreateFolder, onExportKB, + isExporting, }: { typeCounts: Partial>; onSearch: (v: string) => void; @@ -29,6 +30,7 @@ export function DocumentsFilters({ activeTypes: DocumentTypeEnum[]; onCreateFolder?: () => void; onExportKB?: () => void; + isExporting?: boolean; }) { const t = useTranslations("documents"); const id = React.useId(); @@ -91,16 +93,23 @@ export function DocumentsFilters({ { e.preventDefault(); onExportKB(); }} > - + {isExporting ? ( + + ) : ( + + )} - Export knowledge base + + {isExporting ? 
"Exporting…" : "Export knowledge base"} + )} diff --git a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx index 041f03ca7..20b25a2d2 100644 --- a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx +++ b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx @@ -484,6 +484,7 @@ export function DocumentsSidebar({ setIsExportingKB(false); } } else if (ctx.type === "folder" && ctx.folder) { + setIsExportingKB(true); try { const safeName = ctx.folder.name @@ -498,6 +499,8 @@ export function DocumentsSidebar({ } catch (err) { console.error("Folder export failed:", err); toast.error(err instanceof Error ? err.message : "Export failed"); + } finally { + setIsExportingKB(false); } } setExportWarningContext(null); @@ -535,6 +538,7 @@ export function DocumentsSidebar({ return; } + setIsExportingKB(true); try { const safeName = folder.name @@ -549,6 +553,8 @@ export function DocumentsSidebar({ } catch (err) { console.error("Folder export failed:", err); toast.error(err instanceof Error ? err.message : "Export failed"); + } finally { + setIsExportingKB(false); } }, [searchSpaceId, getPendingCountInSubtree, doExport] @@ -949,6 +955,7 @@ export function DocumentsSidebar({ activeTypes={activeTypes} onCreateFolder={() => handleCreateFolder(null)} onExportKB={handleExportKB} + isExporting={isExportingKB} />
From 7e90a8ed3c598de0fe27ef0c007bbd5fa9e38bf0 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 14:33:33 +0200 Subject: [PATCH 14/29] Route uploaded images to vision LLM with document-parser fallback --- .../app/etl_pipeline/etl_pipeline_service.py | 25 ++++++++ .../app/etl_pipeline/file_classifier.py | 10 ++- .../app/etl_pipeline/parsers/vision_llm.py | 37 +++++++++++ .../document_processors/file_processors.py | 11 +++- .../app/utils/file_extensions.py | 23 +++++++ .../etl_pipeline/test_etl_pipeline_service.py | 61 ++++++++++++++++++- .../tests/unit/utils/test_file_extensions.py | 37 +++++++++++ 7 files changed, 199 insertions(+), 5 deletions(-) create mode 100644 surfsense_backend/app/etl_pipeline/parsers/vision_llm.py diff --git a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py index fbd2e4e73..5f1495cdb 100644 --- a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py +++ b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py @@ -15,6 +15,9 @@ from app.etl_pipeline.parsers.plaintext import read_plaintext class EtlPipelineService: """Single pipeline for extracting markdown from files. 
All callers use this.""" + def __init__(self, *, vision_llm=None): + self._vision_llm = vision_llm + async def extract(self, request: EtlRequest) -> EtlResult: category = classify_file(request.filename) @@ -47,6 +50,28 @@ class EtlPipelineService: content_type="audio", ) + if category == FileCategory.IMAGE: + return await self._extract_image(request) + + return await self._extract_document(request) + + async def _extract_image(self, request: EtlRequest) -> EtlResult: + if self._vision_llm: + from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm + + content = await parse_with_vision_llm( + request.file_path, request.filename, self._vision_llm + ) + return EtlResult( + markdown_content=content, + etl_service="VISION_LLM", + content_type="image", + ) + + logging.info( + "No vision LLM provided, falling back to document parser for %s", + request.filename, + ) return await self._extract_document(request) async def _extract_document(self, request: EtlRequest) -> EtlResult: diff --git a/surfsense_backend/app/etl_pipeline/file_classifier.py b/surfsense_backend/app/etl_pipeline/file_classifier.py index 4e690bcdc..120369a27 100644 --- a/surfsense_backend/app/etl_pipeline/file_classifier.py +++ b/surfsense_backend/app/etl_pipeline/file_classifier.py @@ -3,6 +3,7 @@ from pathlib import PurePosixPath from app.utils.file_extensions import ( DOCUMENT_EXTENSIONS, + IMAGE_EXTENSIONS, get_document_extensions_for_service, ) @@ -105,6 +106,7 @@ class FileCategory(Enum): PLAINTEXT = "plaintext" AUDIO = "audio" DIRECT_CONVERT = "direct_convert" + IMAGE = "image" UNSUPPORTED = "unsupported" DOCUMENT = "document" @@ -117,6 +119,8 @@ def classify_file(filename: str) -> FileCategory: return FileCategory.AUDIO if suffix in DIRECT_CONVERT_EXTENSIONS: return FileCategory.DIRECT_CONVERT + if suffix in IMAGE_EXTENSIONS: + return FileCategory.IMAGE if suffix in DOCUMENT_EXTENSIONS: return FileCategory.DOCUMENT return FileCategory.UNSUPPORTED @@ -126,12 +130,14 @@ def 
should_skip_for_service(filename: str, etl_service: str | None) -> bool: """Return True if *filename* cannot be processed by *etl_service*. Plaintext, audio, and direct-convert files are parser-agnostic and never - skipped. Document files are checked against the per-parser extension set. + skipped. Image and document files are checked against the per-parser + extension set (images fall back to the document parser when no vision LLM + is available, so the same service constraint applies). """ category = classify_file(filename) if category == FileCategory.UNSUPPORTED: return True - if category == FileCategory.DOCUMENT: + if category in (FileCategory.DOCUMENT, FileCategory.IMAGE): suffix = PurePosixPath(filename).suffix.lower() return suffix not in get_document_extensions_for_service(etl_service) return False diff --git a/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py b/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py new file mode 100644 index 000000000..e75f81c4b --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py @@ -0,0 +1,37 @@ +import base64 +import mimetypes + +from langchain_core.messages import HumanMessage + +_PROMPT = ( + "Analyze this image thoroughly and produce a detailed markdown description.\n\n" + "Include:\n" + "- All visible text, transcribed verbatim\n" + "- Description of diagrams, charts, tables, or visual structures\n" + "- Key subjects, objects, or scenes depicted\n\n" + "Output only the markdown content, no preamble." 
+)
+
+
+def _image_to_data_url(file_path: str) -> str:
+    mime_type, _ = mimetypes.guess_type(file_path)
+    if not mime_type or not mime_type.startswith("image/"):
+        mime_type = "image/png"
+    with open(file_path, "rb") as f:
+        encoded = base64.b64encode(f.read()).decode("ascii")
+    return f"data:{mime_type};base64,{encoded}"
+
+
+async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
+    data_url = _image_to_data_url(file_path)
+    message = HumanMessage(
+        content=[
+            {"type": "text", "text": _PROMPT},
+            {"type": "image_url", "image_url": {"url": data_url}},
+        ]
+    )
+    response = await llm.ainvoke([message])
+    text = response.content if hasattr(response, "content") else str(response)
+    if not text or not text.strip():
+        raise ValueError(f"Vision LLM returned empty content for {filename}")
+    return text.strip()
diff --git a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py
index c765dbd87..9992231e0 100644
--- a/surfsense_backend/app/tasks/document_processors/file_processors.py
+++ b/surfsense_backend/app/tasks/document_processors/file_processors.py
@@ -333,6 +333,7 @@ async def process_file_in_background(
 async def _extract_file_content(
     file_path: str,
     filename: str,
+    search_space_id: int,
     session: AsyncSession,
     user_id: str,
     task_logger: TaskLoggingService,
@@ -360,6 +361,7 @@ async def _extract_file_content(
         FileCategory.PLAINTEXT: "Reading file",
         FileCategory.DIRECT_CONVERT: "Converting file",
         FileCategory.AUDIO: "Transcribing audio",
+        FileCategory.IMAGE: "Analyzing image",
         FileCategory.UNSUPPORTED: "Unsupported file type",
         FileCategory.DOCUMENT: "Extracting content",
     }
@@ -383,7 +385,13 @@ async def _extract_file_content(
         estimated_pages = _estimate_pages_safe(page_limit_service, file_path)
         await page_limit_service.check_page_limit(user_id, estimated_pages)
 
-    result = await EtlPipelineService().extract(
+    vision_llm = None
+    if category == FileCategory.IMAGE:
+ from app.services.llm_service import get_vision_llm + + vision_llm = await get_vision_llm(session, search_space_id) + + result = await EtlPipelineService(vision_llm=vision_llm).extract( EtlRequest( file_path=file_path, filename=filename, @@ -439,6 +447,7 @@ async def process_file_in_background_with_document( markdown_content, etl_service = await _extract_file_content( file_path, filename, + search_space_id, session, user_id, task_logger, diff --git a/surfsense_backend/app/utils/file_extensions.py b/surfsense_backend/app/utils/file_extensions.py index 16ac585b7..e8be1b83a 100644 --- a/surfsense_backend/app/utils/file_extensions.py +++ b/surfsense_backend/app/utils/file_extensions.py @@ -7,10 +7,33 @@ Extensions already covered by PLAINTEXT_EXTENSIONS, AUDIO_EXTENSIONS, or DIRECT_CONVERT_EXTENSIONS in file_classifier are NOT repeated here -- these sets are exclusively for the "document" ETL path (Docling / LlamaParse / Unstructured). + +Image extensions intentionally remain in the per-parser sets for fallback +compatibility. IMAGE_EXTENSIONS is used only for routing classification. 
""" from pathlib import PurePosixPath +# --------------------------------------------------------------------------- +# Image extensions (used by file_classifier for routing to vision LLM) +# --------------------------------------------------------------------------- + +IMAGE_EXTENSIONS: frozenset[str] = frozenset( + { + ".png", + ".jpg", + ".jpeg", + ".gif", + ".bmp", + ".tiff", + ".tif", + ".webp", + ".svg", + ".heic", + ".heif", + } +) + # --------------------------------------------------------------------------- # Per-parser document extension sets (from official documentation) # --------------------------------------------------------------------------- diff --git a/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py b/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py index 9608b011d..4e1d603a3 100644 --- a/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py +++ b/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py @@ -549,8 +549,11 @@ def test_unsupported_extensions_classified_correctly(filename): ("doc.docx", "document"), ("slides.pptx", "document"), ("sheet.xlsx", "document"), - ("photo.png", "document"), - ("photo.jpg", "document"), + ("photo.png", "image"), + ("photo.jpg", "image"), + ("photo.webp", "image"), + ("photo.gif", "image"), + ("photo.heic", "image"), ("book.epub", "document"), ("letter.odt", "document"), ("readme.md", "plaintext"), @@ -680,3 +683,57 @@ async def test_extract_eml_with_docling_raises_unsupported(tmp_path, mocker): await EtlPipelineService().extract( EtlRequest(file_path=str(eml_file), filename="mail.eml") ) + + +# --------------------------------------------------------------------------- +# Image extraction via vision LLM +# --------------------------------------------------------------------------- + + +async def test_extract_image_with_vision_llm(tmp_path): + """An image file is analyzed by the vision LLM when provided.""" + from unittest.mock 
import AsyncMock, MagicMock + + img_file = tmp_path / "photo.png" + img_file.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 50) + + fake_response = MagicMock() + fake_response.content = "# A photo of a sunset over the ocean" + fake_llm = AsyncMock() + fake_llm.ainvoke.return_value = fake_response + + service = EtlPipelineService(vision_llm=fake_llm) + result = await service.extract( + EtlRequest(file_path=str(img_file), filename="photo.png") + ) + + assert result.markdown_content == "# A photo of a sunset over the ocean" + assert result.etl_service == "VISION_LLM" + assert result.content_type == "image" + fake_llm.ainvoke.assert_called_once() + + +async def test_extract_image_falls_back_to_document_without_vision_llm( + tmp_path, mocker +): + """Without a vision LLM, image files fall back to the document parser.""" + mocker.patch("app.config.config.ETL_SERVICE", "DOCLING") + + fake_docling = mocker.AsyncMock() + fake_docling.process_document.return_value = {"content": "# OCR text from image"} + mocker.patch( + "app.services.docling_service.create_docling_service", + return_value=fake_docling, + ) + + img_file = tmp_path / "scan.png" + img_file.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 50) + + service = EtlPipelineService() + result = await service.extract( + EtlRequest(file_path=str(img_file), filename="scan.png") + ) + + assert result.markdown_content == "# OCR text from image" + assert result.etl_service == "DOCLING" + assert result.content_type == "document" diff --git a/surfsense_backend/tests/unit/utils/test_file_extensions.py b/surfsense_backend/tests/unit/utils/test_file_extensions.py index 43dfef5f0..ccf5eb70f 100644 --- a/surfsense_backend/tests/unit/utils/test_file_extensions.py +++ b/surfsense_backend/tests/unit/utils/test_file_extensions.py @@ -154,3 +154,40 @@ def test_get_extensions_for_none_returns_union(): ) assert get_document_extensions_for_service(None) == DOCUMENT_EXTENSIONS + + +# 
--------------------------------------------------------------------------- +# IMAGE_EXTENSIONS +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "ext", + [ + ".png", + ".jpg", + ".jpeg", + ".gif", + ".bmp", + ".tiff", + ".tif", + ".webp", + ".svg", + ".heic", + ".heif", + ], +) +def test_image_extensions_contains_expected(ext): + from app.utils.file_extensions import IMAGE_EXTENSIONS + + assert ext in IMAGE_EXTENSIONS + + +def test_image_extensions_are_subset_of_document_extensions(): + """Image extensions used for routing should also be in DOCUMENT_EXTENSIONS for fallback.""" + from app.utils.file_extensions import DOCUMENT_EXTENSIONS, IMAGE_EXTENSIONS + + missing = IMAGE_EXTENSIONS - DOCUMENT_EXTENSIONS + assert not missing, ( + f"Image extensions missing from document sets (breaks fallback): {missing}" + ) From afd3c2cde20106085df0c2f5b8ff168620fd503d Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 14:50:24 +0200 Subject: [PATCH 15/29] Pass vision LLM through local folder indexer call chain --- .../local_folder_indexer.py | 38 ++++++++++++++----- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py index f503ff864..f88d313da 100644 --- a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py @@ -153,16 +153,16 @@ def scan_folder( return files -async def _read_file_content(file_path: str, filename: str) -> str: +async def _read_file_content(file_path: str, filename: str, *, vision_llm=None) -> str: """Read file content via the unified ETL pipeline. - All file types (plaintext, audio, direct-convert, document) are handled - by ``EtlPipelineService``. 
+ All file types (plaintext, audio, direct-convert, document, image) are + handled by ``EtlPipelineService``. """ from app.etl_pipeline.etl_document import EtlRequest from app.etl_pipeline.etl_pipeline_service import EtlPipelineService - result = await EtlPipelineService().extract( + result = await EtlPipelineService(vision_llm=vision_llm).extract( EtlRequest(file_path=file_path, filename=filename) ) return result.markdown_content @@ -199,12 +199,14 @@ async def _compute_file_content_hash( file_path: str, filename: str, search_space_id: int, + *, + vision_llm=None, ) -> tuple[str, str]: """Read a file (via ETL if needed) and compute its content hash. Returns (content_text, content_hash). """ - content = await _read_file_content(file_path, filename) + content = await _read_file_content(file_path, filename, vision_llm=vision_llm) return content, _content_hash(content, search_space_id) @@ -635,6 +637,10 @@ async def index_local_folder( page_limit_service = PageLimitService(session) + from app.services.llm_service import get_vision_llm + + vision_llm = await get_vision_llm(session, search_space_id) + # ================================================================ # PHASE 1: Pre-filter files (mtime / content-hash), version changed # ================================================================ @@ -704,7 +710,10 @@ async def index_local_folder( try: content, content_hash = await _compute_file_content_hash( - file_path_abs, file_info["relative_path"], search_space_id + file_path_abs, + file_info["relative_path"], + search_space_id, + vision_llm=vision_llm, ) except Exception as read_err: logger.warning(f"Could not read {file_path_abs}: {read_err}") @@ -738,7 +747,10 @@ async def index_local_folder( try: content, content_hash = await _compute_file_content_hash( - file_path_abs, file_info["relative_path"], search_space_id + file_path_abs, + file_info["relative_path"], + search_space_id, + vision_llm=vision_llm, ) except Exception as read_err: logger.warning(f"Could 
not read {file_path_abs}: {read_err}") @@ -1080,9 +1092,13 @@ async def _index_single_file( except PageLimitExceededError as e: return 0, 1, f"Page limit exceeded: {e}" + from app.services.llm_service import get_vision_llm + + vision_llm = await get_vision_llm(session, search_space_id) + try: content, content_hash = await _compute_file_content_hash( - str(full_path), full_path.name, search_space_id + str(full_path), full_path.name, search_space_id, vision_llm=vision_llm ) except Exception as e: return 0, 1, f"Could not read file: {e}" @@ -1300,6 +1316,10 @@ async def index_uploaded_files( pipeline = IndexingPipelineService(session) llm = await get_user_long_context_llm(session, user_id, search_space_id) + from app.services.llm_service import get_vision_llm + + vision_llm = await get_vision_llm(session, search_space_id) + indexed_count = 0 failed_count = 0 errors: list[str] = [] @@ -1347,7 +1367,7 @@ async def index_uploaded_files( try: content, content_hash = await _compute_file_content_hash( - temp_path, filename, search_space_id + temp_path, filename, search_space_id, vision_llm=vision_llm ) except Exception as e: logger.warning(f"Could not read {relative_path}: {e}") From caaec2e0a798a500b45a737b1bf7924ed12546db Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 14:56:18 +0200 Subject: [PATCH 16/29] Simplify vision LLM image description prompt --- surfsense_backend/app/etl_pipeline/parsers/vision_llm.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py b/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py index e75f81c4b..fb12a1e75 100644 --- a/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py +++ b/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py @@ -4,12 +4,9 @@ import mimetypes from langchain_core.messages import HumanMessage _PROMPT = ( - "Analyze this image thoroughly and produce a detailed markdown description.\n\n" - "Include:\n" - "- All 
visible text, transcribed verbatim\n" - "- Description of diagrams, charts, tables, or visual structures\n" - "- Key subjects, objects, or scenes depicted\n\n" - "Output only the markdown content, no preamble." + "Describe this image in markdown. " + "Transcribe any visible text verbatim. " + "Be concise but complete — let the image content guide the level of detail." ) From d6c4fb8938927c6376d196aad6be2bf0d501650f Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 15:11:24 +0200 Subject: [PATCH 17/29] Add try/except fallback in _extract_image for vision LLM failures --- .../app/etl_pipeline/etl_pipeline_service.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py index 5f1495cdb..56ade32fb 100644 --- a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py +++ b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py @@ -57,16 +57,23 @@ class EtlPipelineService: async def _extract_image(self, request: EtlRequest) -> EtlResult: if self._vision_llm: - from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm + try: + from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm - content = await parse_with_vision_llm( - request.file_path, request.filename, self._vision_llm - ) - return EtlResult( - markdown_content=content, - etl_service="VISION_LLM", - content_type="image", - ) + content = await parse_with_vision_llm( + request.file_path, request.filename, self._vision_llm + ) + return EtlResult( + markdown_content=content, + etl_service="VISION_LLM", + content_type="image", + ) + except Exception: + logging.warning( + "Vision LLM failed for %s, falling back to document parser", + request.filename, + exc_info=True, + ) logging.info( "No vision LLM provided, falling back to document parser for %s", From 71db53fc553a50e664d4ec5493960d86d9a52446 Mon Sep 17 00:00:00 2001 From: 
CREDO23 Date: Thu, 9 Apr 2026 15:17:08 +0200 Subject: [PATCH 18/29] Add 5MB file size guard before base64 encoding for vision LLM --- surfsense_backend/app/etl_pipeline/parsers/vision_llm.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py b/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py index fb12a1e75..bd39de71d 100644 --- a/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py +++ b/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py @@ -1,5 +1,6 @@ import base64 import mimetypes +import os from langchain_core.messages import HumanMessage @@ -9,8 +10,16 @@ _PROMPT = ( "Be concise but complete — let the image content guide the level of detail." ) +_MAX_IMAGE_BYTES = 5 * 1024 * 1024 # 5 MB (Anthropic Claude's limit, the most restrictive) + def _image_to_data_url(file_path: str) -> str: + file_size = os.path.getsize(file_path) + if file_size > _MAX_IMAGE_BYTES: + raise ValueError( + f"Image too large for vision LLM ({file_size / (1024 * 1024):.1f} MB, " + f"limit {_MAX_IMAGE_BYTES // (1024 * 1024)} MB): {file_path}" + ) mime_type, _ = mimetypes.guess_type(file_path) if not mime_type or not mime_type.startswith("image/"): mime_type = "image/png" From 55661bcde68b5b15b034b647ee414fde03aa29ef Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 15:21:32 +0200 Subject: [PATCH 19/29] Replace mimetypes fallback with explicit extension-to-MIME mapping --- .../app/etl_pipeline/parsers/vision_llm.py | 26 +++++++++++++++---- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py b/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py index bd39de71d..d3b778801 100644 --- a/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py +++ b/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py @@ -1,5 +1,4 @@ import base64 -import mimetypes import os from langchain_core.messages import HumanMessage @@ -10,7 +9,23 @@ 
_PROMPT = ( "Be concise but complete — let the image content guide the level of detail." ) -_MAX_IMAGE_BYTES = 5 * 1024 * 1024 # 5 MB (Anthropic Claude's limit, the most restrictive) +_MAX_IMAGE_BYTES = ( + 5 * 1024 * 1024 +) # 5 MB (Anthropic Claude's limit, the most restrictive) + +_EXT_TO_MIME: dict[str, str] = { + ".png": "image/png", + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + ".gif": "image/gif", + ".bmp": "image/bmp", + ".tiff": "image/tiff", + ".tif": "image/tiff", + ".webp": "image/webp", + ".svg": "image/svg+xml", + ".heic": "image/heic", + ".heif": "image/heif", +} def _image_to_data_url(file_path: str) -> str: @@ -20,9 +35,10 @@ def _image_to_data_url(file_path: str) -> str: f"Image too large for vision LLM ({file_size / (1024 * 1024):.1f} MB, " f"limit {_MAX_IMAGE_BYTES // (1024 * 1024)} MB): {file_path}" ) - mime_type, _ = mimetypes.guess_type(file_path) - if not mime_type or not mime_type.startswith("image/"): - mime_type = "image/png" + ext = os.path.splitext(file_path)[1].lower() + mime_type = _EXT_TO_MIME.get(ext) + if not mime_type: + raise ValueError(f"Unsupported image extension {ext!r}: {file_path}") with open(file_path, "rb") as f: encoded = base64.b64encode(f.read()).decode("ascii") return f"data:{mime_type};base64,{encoded}" From ff2a9c77f9eb6c52384c44ddfd344a9645abc49f Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 15:28:21 +0200 Subject: [PATCH 20/29] Pass vision_llm in legacy process_file_in_background path --- .../tasks/document_processors/file_processors.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py index 9992231e0..cd06657dc 100644 --- a/surfsense_backend/app/tasks/document_processors/file_processors.py +++ b/surfsense_backend/app/tasks/document_processors/file_processors.py @@ -118,9 +118,13 @@ async def _log_page_divergence( async def 
_process_non_document_upload(ctx: _ProcessingContext) -> Document | None: - """Extract content from a non-document file (plaintext/direct_convert/audio) via the unified ETL pipeline.""" + """Extract content from a non-document file (plaintext/direct_convert/audio/image) via the unified ETL pipeline.""" from app.etl_pipeline.etl_document import EtlRequest from app.etl_pipeline.etl_pipeline_service import EtlPipelineService + from app.etl_pipeline.file_classifier import ( + FileCategory, + classify_file as etl_classify, + ) await _notify(ctx, "parsing", "Processing file") await ctx.task_logger.log_task_progress( @@ -129,7 +133,13 @@ async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | No {"processing_stage": "extracting"}, ) - etl_result = await EtlPipelineService().extract( + vision_llm = None + if etl_classify(ctx.filename) == FileCategory.IMAGE: + from app.services.llm_service import get_vision_llm + + vision_llm = await get_vision_llm(ctx.session, ctx.search_space_id) + + etl_result = await EtlPipelineService(vision_llm=vision_llm).extract( EtlRequest(file_path=ctx.file_path, filename=ctx.filename) ) From e164fe061249c78c41d0505f6594df60f75d12c6 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 15:29:39 +0200 Subject: [PATCH 21/29] Fix misleading log when vision LLM fails vs not provided --- .../app/etl_pipeline/etl_pipeline_service.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py index 56ade32fb..2e1a803d8 100644 --- a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py +++ b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py @@ -74,11 +74,12 @@ class EtlPipelineService: request.filename, exc_info=True, ) + else: + logging.info( + "No vision LLM provided, falling back to document parser for %s", + request.filename, + ) - logging.info( - "No vision LLM provided, 
falling back to document parser for %s", - request.filename, - ) return await self._extract_document(request) async def _extract_document(self, request: EtlRequest) -> EtlResult: From 4ccdd80e264b4adb30c142c9d888f8d46c2d3639 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 16:14:53 +0200 Subject: [PATCH 22/29] Harden vision LLM fallback, folder upload validation, and export memory --- .../app/etl_pipeline/etl_pipeline_service.py | 9 +++- .../app/etl_pipeline/parsers/vision_llm.py | 7 ++- .../app/routes/documents_routes.py | 24 ++++++++- .../app/services/export_service.py | 54 +++++++++++++++---- 4 files changed, 79 insertions(+), 15 deletions(-) diff --git a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py index 2e1a803d8..b4438ce4d 100644 --- a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py +++ b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py @@ -80,7 +80,14 @@ class EtlPipelineService: request.filename, ) - return await self._extract_document(request) + try: + return await self._extract_document(request) + except (EtlUnsupportedFileError, EtlServiceUnavailableError): + raise EtlUnsupportedFileError( + f"Cannot process image {request.filename}: vision LLM " + f"{'failed' if self._vision_llm else 'not configured'} and " + f"document parser does not support this format" + ) from None async def _extract_document(self, request: EtlRequest) -> EtlResult: from pathlib import PurePosixPath diff --git a/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py b/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py index d3b778801..c80fbca0a 100644 --- a/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py +++ b/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py @@ -1,3 +1,4 @@ +import asyncio import base64 import os @@ -13,6 +14,8 @@ _MAX_IMAGE_BYTES = ( 5 * 1024 * 1024 ) # 5 MB (Anthropic Claude's limit, the most restrictive) +_INVOKE_TIMEOUT_SECONDS = 
120
+
 _EXT_TO_MIME: dict[str, str] = {
     ".png": "image/png",
     ".jpg": "image/jpeg",
@@ -52,7 +55,9 @@ async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
             {"type": "image_url", "image_url": {"url": data_url}},
         ]
     )
-    response = await llm.ainvoke([message])
+    response = await asyncio.wait_for(
+        llm.ainvoke([message]), timeout=_INVOKE_TIMEOUT_SECONDS
+    )
     text = response.content if hasattr(response, "content") else str(response)
     if not text or not text.strip():
         raise ValueError(f"Vision LLM returned empty content for {filename}")
diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py
index 8093084f0..25841a107 100644
--- a/surfsense_backend/app/routes/documents_routes.py
+++ b/surfsense_backend/app/routes/documents_routes.py
@@ -2,7 +2,7 @@ import asyncio
 
 from fastapi import APIRouter, Depends, Form, HTTPException, Query, UploadFile
-from pydantic import BaseModel as PydanticBaseModel
+from pydantic import BaseModel as PydanticBaseModel, Field
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.future import select
 from sqlalchemy.orm import selectinload
@@ -1395,10 +1395,13 @@ class FolderMtimeCheckFile(PydanticBaseModel):
     mtime: float
 
 
+_MAX_MTIME_CHECK_FILES = 10_000
+
+
 class FolderMtimeCheckRequest(PydanticBaseModel):
     folder_name: str
     search_space_id: int
-    files: list[FolderMtimeCheckFile]
+    files: list[FolderMtimeCheckFile] = Field(max_length=_MAX_MTIME_CHECK_FILES)
 
 
 class FolderUnlinkRequest(PydanticBaseModel):
@@ -1531,6 +1534,23 @@ async def folder_upload(
             f"exceeds the {MAX_FILE_SIZE_BYTES // (1024 * 1024)} MB per-file limit.",
         )
 
+    from app.services.folder_service import MAX_FOLDER_DEPTH
+
+    max_subfolder_depth = max((p.count("/") for p in rel_paths if "/" in p), default=0)
+    if 1 + max_subfolder_depth > MAX_FOLDER_DEPTH:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Folder structure too deep: {1 + max_subfolder_depth} levels "
+            f"exceeds the maximum of 
{MAX_FOLDER_DEPTH}.", + ) + + if root_folder_id: + root_folder = await session.get(Folder, root_folder_id) + if not root_folder or root_folder.search_space_id != search_space_id: + raise HTTPException( + status_code=404, detail="Root folder not found in this search space" + ) + if not root_folder_id: watched_metadata = { "watched": True, diff --git a/surfsense_backend/app/services/export_service.py b/surfsense_backend/app/services/export_service.py index 2d36bfaab..97f952223 100644 --- a/surfsense_backend/app/services/export_service.py +++ b/surfsense_backend/app/services/export_service.py @@ -1,5 +1,6 @@ """Service for exporting knowledge base content as a ZIP archive.""" +import asyncio import logging import os import tempfile @@ -106,23 +107,38 @@ async def build_export_zip( folder_path_map = _build_folder_path_map(folders) - doc_query = select(Document).where(Document.search_space_id == search_space_id) + batch_size = 100 + + base_doc_query = select(Document).where(Document.search_space_id == search_space_id) if target_folder_ids is not None: - doc_query = doc_query.where(Document.folder_id.in_(target_folder_ids)) - doc_result = await session.execute(doc_query) - documents = list(doc_result.scalars().all()) + base_doc_query = base_doc_query.where(Document.folder_id.in_(target_folder_ids)) + base_doc_query = base_doc_query.order_by(Document.id) fd, tmp_path = tempfile.mkstemp(suffix=".zip") os.close(fd) - try: - used_paths: dict[str, int] = {} - skipped_docs: list[str] = [] + used_paths: dict[str, int] = {} + skipped_docs: list[str] = [] + is_first_batch = True + + try: + offset = 0 + while True: + batch_query = base_doc_query.limit(batch_size).offset(offset) + batch_result = await session.execute(batch_query) + documents = list(batch_result.scalars().all()) + if not documents: + break + + entries: list[tuple[str, str]] = [] - with zipfile.ZipFile(tmp_path, "w", zipfile.ZIP_DEFLATED) as zf: for doc in documents: status = doc.status or {} - state = 
status.get("state", "ready") if isinstance(status, dict) else "ready" + state = ( + status.get("state", "ready") + if isinstance(status, dict) + else "ready" + ) if state in ("pending", "processing"): skipped_docs.append(doc.title or "Untitled") continue @@ -137,7 +153,9 @@ async def build_export_zip( dir_path = "" base_name = _sanitize_filename(doc.title or "Untitled") - file_path = f"{dir_path}/{base_name}.md" if dir_path else f"{base_name}.md" + file_path = ( + f"{dir_path}/{base_name}.md" if dir_path else f"{base_name}.md" + ) if file_path in used_paths: used_paths[file_path] += 1 @@ -149,7 +167,21 @@ async def build_export_zip( ) used_paths[file_path] = used_paths.get(file_path, 0) + 1 - zf.writestr(file_path, markdown) + entries.append((file_path, markdown)) + + if entries: + mode = "w" if is_first_batch else "a" + batch_entries = entries + + def _write_batch(m: str = mode, e: list = batch_entries) -> None: + with zipfile.ZipFile(tmp_path, m, zipfile.ZIP_DEFLATED) as zf: + for path, content in e: + zf.writestr(path, content) + + await asyncio.to_thread(_write_batch) + is_first_batch = False + + offset += batch_size export_name = "knowledge-base" if folder_id is not None and folder_id in folder_path_map: From 7e14df6012481b43a0f849910d27120bf60e5998 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 16:40:55 +0200 Subject: [PATCH 23/29] Fix button-in-button hydration error in mobile upload drop zone --- .../components/sources/DocumentUploadTab.tsx | 47 +++++++++++-------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/surfsense_web/components/sources/DocumentUploadTab.tsx b/surfsense_web/components/sources/DocumentUploadTab.tsx index 0f8ac298d..117d376ec 100644 --- a/surfsense_web/components/sources/DocumentUploadTab.tsx +++ b/surfsense_web/components/sources/DocumentUploadTab.tsx @@ -530,28 +530,35 @@ export function DocumentUploadTab({ ) ) : ( - + {renderBrowseButton({ fullWidth: true })} + + )} From 
0aefcbd504c3a4a3539969ea2a69c4522635f23f Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 9 Apr 2026 22:06:06 +0200 Subject: [PATCH 24/29] Remove vision LLM from desktop folder watcher --- .../local_folder_indexer.py | 26 ++++--------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py index f88d313da..a531916e1 100644 --- a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py @@ -153,7 +153,7 @@ def scan_folder( return files -async def _read_file_content(file_path: str, filename: str, *, vision_llm=None) -> str: +async def _read_file_content(file_path: str, filename: str) -> str: """Read file content via the unified ETL pipeline. All file types (plaintext, audio, direct-convert, document, image) are @@ -162,7 +162,7 @@ async def _read_file_content(file_path: str, filename: str, *, vision_llm=None) from app.etl_pipeline.etl_document import EtlRequest from app.etl_pipeline.etl_pipeline_service import EtlPipelineService - result = await EtlPipelineService(vision_llm=vision_llm).extract( + result = await EtlPipelineService().extract( EtlRequest(file_path=file_path, filename=filename) ) return result.markdown_content @@ -199,14 +199,12 @@ async def _compute_file_content_hash( file_path: str, filename: str, search_space_id: int, - *, - vision_llm=None, ) -> tuple[str, str]: """Read a file (via ETL if needed) and compute its content hash. Returns (content_text, content_hash). 
""" - content = await _read_file_content(file_path, filename, vision_llm=vision_llm) + content = await _read_file_content(file_path, filename) return content, _content_hash(content, search_space_id) @@ -637,10 +635,6 @@ async def index_local_folder( page_limit_service = PageLimitService(session) - from app.services.llm_service import get_vision_llm - - vision_llm = await get_vision_llm(session, search_space_id) - # ================================================================ # PHASE 1: Pre-filter files (mtime / content-hash), version changed # ================================================================ @@ -713,7 +707,6 @@ async def index_local_folder( file_path_abs, file_info["relative_path"], search_space_id, - vision_llm=vision_llm, ) except Exception as read_err: logger.warning(f"Could not read {file_path_abs}: {read_err}") @@ -750,7 +743,6 @@ async def index_local_folder( file_path_abs, file_info["relative_path"], search_space_id, - vision_llm=vision_llm, ) except Exception as read_err: logger.warning(f"Could not read {file_path_abs}: {read_err}") @@ -1092,13 +1084,9 @@ async def _index_single_file( except PageLimitExceededError as e: return 0, 1, f"Page limit exceeded: {e}" - from app.services.llm_service import get_vision_llm - - vision_llm = await get_vision_llm(session, search_space_id) - try: content, content_hash = await _compute_file_content_hash( - str(full_path), full_path.name, search_space_id, vision_llm=vision_llm + str(full_path), full_path.name, search_space_id ) except Exception as e: return 0, 1, f"Could not read file: {e}" @@ -1316,10 +1304,6 @@ async def index_uploaded_files( pipeline = IndexingPipelineService(session) llm = await get_user_long_context_llm(session, user_id, search_space_id) - from app.services.llm_service import get_vision_llm - - vision_llm = await get_vision_llm(session, search_space_id) - indexed_count = 0 failed_count = 0 errors: list[str] = [] @@ -1367,7 +1351,7 @@ async def index_uploaded_files( try: content, 
content_hash = await _compute_file_content_hash( - temp_path, filename, search_space_id, vision_llm=vision_llm + temp_path, filename, search_space_id ) except Exception as e: logger.warning(f"Could not read {relative_path}: {e}") From a95bf58c8f4e5ae593ca0eb49354c9668b8c3a51 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 10 Apr 2026 16:45:51 +0200 Subject: [PATCH 25/29] Make Vision LLM opt-in for uploads and connectors --- ...121_add_enable_vision_llm_to_connectors.py | 45 +++++++++++++++++++ .../connectors/dropbox/content_extractor.py | 4 +- .../google_drive/content_extractor.py | 12 +++-- .../connectors/onedrive/content_extractor.py | 12 +++-- surfsense_backend/app/db.py | 7 +++ .../app/routes/documents_routes.py | 4 ++ .../app/schemas/search_source_connector.py | 2 + .../app/services/task_dispatcher.py | 3 ++ .../app/tasks/celery_tasks/document_tasks.py | 8 ++++ .../connector_indexers/dropbox_indexer.py | 21 ++++++++- .../google_drive_indexer.py | 34 +++++++++++++- .../local_folder_indexer.py | 18 ++++++-- .../connector_indexers/onedrive_indexer.py | 21 ++++++++- .../document_processors/file_processors.py | 10 ++++- .../integration/document_upload/conftest.py | 2 + .../assistant-ui/connector-popup.tsx | 6 +++ .../components/vision-llm-config.tsx | 25 +++++++++++ .../views/connector-edit-view.tsx | 13 ++++++ .../views/indexing-configuration-view.tsx | 13 ++++++ .../hooks/use-connector-dialog.ts | 14 +++++- .../components/sources/DocumentUploadTab.tsx | 13 ++++++ .../contracts/types/connector.types.ts | 3 ++ .../contracts/types/document.types.ts | 1 + .../lib/apis/documents-api.service.ts | 5 ++- 24 files changed, 276 insertions(+), 20 deletions(-) create mode 100644 surfsense_backend/alembic/versions/121_add_enable_vision_llm_to_connectors.py create mode 100644 surfsense_web/components/assistant-ui/connector-popup/components/vision-llm-config.tsx diff --git a/surfsense_backend/alembic/versions/121_add_enable_vision_llm_to_connectors.py 
b/surfsense_backend/alembic/versions/121_add_enable_vision_llm_to_connectors.py new file mode 100644 index 000000000..659545645 --- /dev/null +++ b/surfsense_backend/alembic/versions/121_add_enable_vision_llm_to_connectors.py @@ -0,0 +1,45 @@ +"""121_add_enable_vision_llm_to_connectors + +Revision ID: 121 +Revises: 120 +Create Date: 2026-04-09 + +Adds enable_vision_llm boolean column to search_source_connectors. +Defaults to False so vision LLM image processing is opt-in. +""" + +from __future__ import annotations + +from collections.abc import Sequence + +import sqlalchemy as sa + +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = "121" +down_revision: str | None = "120" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def upgrade() -> None: + conn = op.get_bind() + existing_columns = [ + col["name"] for col in sa.inspect(conn).get_columns("search_source_connectors") + ] + + if "enable_vision_llm" not in existing_columns: + op.add_column( + "search_source_connectors", + sa.Column( + "enable_vision_llm", + sa.Boolean(), + nullable=False, + server_default=sa.text("false"), + ), + ) + + +def downgrade() -> None: + op.drop_column("search_source_connectors", "enable_vision_llm") diff --git a/surfsense_backend/app/connectors/dropbox/content_extractor.py b/surfsense_backend/app/connectors/dropbox/content_extractor.py index 8cbc3e417..372d2fc82 100644 --- a/surfsense_backend/app/connectors/dropbox/content_extractor.py +++ b/surfsense_backend/app/connectors/dropbox/content_extractor.py @@ -44,6 +44,8 @@ async def _export_paper_content( async def download_and_extract_content( client: DropboxClient, file: dict[str, Any], + *, + vision_llm=None, ) -> tuple[str | None, dict[str, Any], str | None]: """Download a Dropbox file and extract its content as markdown. 
@@ -91,7 +93,7 @@ async def download_and_extract_content( from app.etl_pipeline.etl_document import EtlRequest from app.etl_pipeline.etl_pipeline_service import EtlPipelineService - result = await EtlPipelineService().extract( + result = await EtlPipelineService(vision_llm=vision_llm).extract( EtlRequest(file_path=temp_file_path, filename=file_name) ) markdown = result.markdown_content diff --git a/surfsense_backend/app/connectors/google_drive/content_extractor.py b/surfsense_backend/app/connectors/google_drive/content_extractor.py index 83ff32e82..86c789b97 100644 --- a/surfsense_backend/app/connectors/google_drive/content_extractor.py +++ b/surfsense_backend/app/connectors/google_drive/content_extractor.py @@ -27,6 +27,8 @@ logger = logging.getLogger(__name__) async def download_and_extract_content( client: GoogleDriveClient, file: dict[str, Any], + *, + vision_llm=None, ) -> tuple[str | None, dict[str, Any], str | None]: """Download a Google Drive file and extract its content as markdown. 
@@ -103,7 +105,9 @@ async def download_and_extract_content( etl_filename = ( file_name + extension if is_google_workspace_file(mime_type) else file_name ) - markdown = await _parse_file_to_markdown(temp_file_path, etl_filename) + markdown = await _parse_file_to_markdown( + temp_file_path, etl_filename, vision_llm=vision_llm + ) return markdown, drive_metadata, None except Exception as e: @@ -115,12 +119,14 @@ async def download_and_extract_content( os.unlink(temp_file_path) -async def _parse_file_to_markdown(file_path: str, filename: str) -> str: +async def _parse_file_to_markdown( + file_path: str, filename: str, *, vision_llm=None +) -> str: """Parse a local file to markdown using the unified ETL pipeline.""" from app.etl_pipeline.etl_document import EtlRequest from app.etl_pipeline.etl_pipeline_service import EtlPipelineService - result = await EtlPipelineService().extract( + result = await EtlPipelineService(vision_llm=vision_llm).extract( EtlRequest(file_path=file_path, filename=filename) ) return result.markdown_content diff --git a/surfsense_backend/app/connectors/onedrive/content_extractor.py b/surfsense_backend/app/connectors/onedrive/content_extractor.py index 2238b8603..3154f2eca 100644 --- a/surfsense_backend/app/connectors/onedrive/content_extractor.py +++ b/surfsense_backend/app/connectors/onedrive/content_extractor.py @@ -16,6 +16,8 @@ logger = logging.getLogger(__name__) async def download_and_extract_content( client: OneDriveClient, file: dict[str, Any], + *, + vision_llm=None, ) -> tuple[str | None, dict[str, Any], str | None]: """Download a OneDrive file and extract its content as markdown. 
@@ -65,7 +67,9 @@ async def download_and_extract_content( if error: return None, metadata, error - markdown = await _parse_file_to_markdown(temp_file_path, file_name) + markdown = await _parse_file_to_markdown( + temp_file_path, file_name, vision_llm=vision_llm + ) return markdown, metadata, None except Exception as e: @@ -77,12 +81,14 @@ async def download_and_extract_content( os.unlink(temp_file_path) -async def _parse_file_to_markdown(file_path: str, filename: str) -> str: +async def _parse_file_to_markdown( + file_path: str, filename: str, *, vision_llm=None +) -> str: """Parse a local file to markdown using the unified ETL pipeline.""" from app.etl_pipeline.etl_document import EtlRequest from app.etl_pipeline.etl_pipeline_service import EtlPipelineService - result = await EtlPipelineService().extract( + result = await EtlPipelineService(vision_llm=vision_llm).extract( EtlRequest(file_path=file_path, filename=filename) ) return result.markdown_content diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py index 01a6bbda0..cbcb5efa5 100644 --- a/surfsense_backend/app/db.py +++ b/surfsense_backend/app/db.py @@ -1555,6 +1555,13 @@ class SearchSourceConnector(BaseModel, TimestampMixin): Boolean, nullable=False, default=False, server_default="false" ) + # Vision LLM for image files - disabled by default to save cost/time. + # When enabled, images are described via a vision language model instead + # of falling back to the document parser. 
+ enable_vision_llm = Column( + Boolean, nullable=False, default=False, server_default="false" + ) + # Periodic indexing fields periodic_indexing_enabled = Column(Boolean, nullable=False, default=False) indexing_frequency_minutes = Column(Integer, nullable=True) diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py index 25841a107..aa7f98294 100644 --- a/surfsense_backend/app/routes/documents_routes.py +++ b/surfsense_backend/app/routes/documents_routes.py @@ -123,6 +123,7 @@ async def create_documents_file_upload( files: list[UploadFile], search_space_id: int = Form(...), should_summarize: bool = Form(False), + use_vision_llm: bool = Form(False), session: AsyncSession = Depends(get_async_session), user: User = Depends(current_active_user), dispatcher: TaskDispatcher = Depends(get_task_dispatcher), @@ -272,6 +273,7 @@ async def create_documents_file_upload( search_space_id=search_space_id, user_id=str(user.id), should_summarize=should_summarize, + use_vision_llm=use_vision_llm, ) return { @@ -1490,6 +1492,7 @@ async def folder_upload( relative_paths: str = Form(...), root_folder_id: int | None = Form(None), enable_summary: bool = Form(False), + use_vision_llm: bool = Form(False), session: AsyncSession = Depends(get_async_session), user: User = Depends(current_active_user), ): @@ -1616,6 +1619,7 @@ async def folder_upload( folder_name=folder_name, root_folder_id=root_folder_id, enable_summary=enable_summary, + use_vision_llm=use_vision_llm, file_mappings=list(file_mappings), ) diff --git a/surfsense_backend/app/schemas/search_source_connector.py b/surfsense_backend/app/schemas/search_source_connector.py index 1b0ed0b13..aac7b92d5 100644 --- a/surfsense_backend/app/schemas/search_source_connector.py +++ b/surfsense_backend/app/schemas/search_source_connector.py @@ -17,6 +17,7 @@ class SearchSourceConnectorBase(BaseModel): last_indexed_at: datetime | None = None config: dict[str, Any] enable_summary: bool = 
False + enable_vision_llm: bool = False periodic_indexing_enabled: bool = False indexing_frequency_minutes: int | None = None next_scheduled_at: datetime | None = None @@ -67,6 +68,7 @@ class SearchSourceConnectorUpdate(BaseModel): last_indexed_at: datetime | None = None config: dict[str, Any] | None = None enable_summary: bool | None = None + enable_vision_llm: bool | None = None periodic_indexing_enabled: bool | None = None indexing_frequency_minutes: int | None = None next_scheduled_at: datetime | None = None diff --git a/surfsense_backend/app/services/task_dispatcher.py b/surfsense_backend/app/services/task_dispatcher.py index 9a6fc7d63..7bb70b406 100644 --- a/surfsense_backend/app/services/task_dispatcher.py +++ b/surfsense_backend/app/services/task_dispatcher.py @@ -19,6 +19,7 @@ class TaskDispatcher(Protocol): search_space_id: int, user_id: str, should_summarize: bool = False, + use_vision_llm: bool = False, ) -> None: ... @@ -34,6 +35,7 @@ class CeleryTaskDispatcher: search_space_id: int, user_id: str, should_summarize: bool = False, + use_vision_llm: bool = False, ) -> None: from app.tasks.celery_tasks.document_tasks import ( process_file_upload_with_document_task, @@ -46,6 +48,7 @@ class CeleryTaskDispatcher: search_space_id=search_space_id, user_id=user_id, should_summarize=should_summarize, + use_vision_llm=use_vision_llm, ) diff --git a/surfsense_backend/app/tasks/celery_tasks/document_tasks.py b/surfsense_backend/app/tasks/celery_tasks/document_tasks.py index 62720826f..fc946b4bc 100644 --- a/surfsense_backend/app/tasks/celery_tasks/document_tasks.py +++ b/surfsense_backend/app/tasks/celery_tasks/document_tasks.py @@ -778,6 +778,7 @@ def process_file_upload_with_document_task( search_space_id: int, user_id: str, should_summarize: bool = False, + use_vision_llm: bool = False, ): """ Celery task to process uploaded file with existing pending document. 
@@ -833,6 +834,7 @@ def process_file_upload_with_document_task( search_space_id, user_id, should_summarize=should_summarize, + use_vision_llm=use_vision_llm, ) ) logger.info( @@ -869,6 +871,7 @@ async def _process_file_with_document( search_space_id: int, user_id: str, should_summarize: bool = False, + use_vision_llm: bool = False, ): """ Process file and update existing pending document status. @@ -971,6 +974,7 @@ async def _process_file_with_document( log_entry=log_entry, notification=notification, should_summarize=should_summarize, + use_vision_llm=use_vision_llm, ) # Update notification on success @@ -1428,6 +1432,7 @@ def index_uploaded_folder_files_task( root_folder_id: int, enable_summary: bool, file_mappings: list[dict], + use_vision_llm: bool = False, ): """Celery task to index files uploaded from the desktop app.""" loop = asyncio.new_event_loop() @@ -1441,6 +1446,7 @@ def index_uploaded_folder_files_task( root_folder_id=root_folder_id, enable_summary=enable_summary, file_mappings=file_mappings, + use_vision_llm=use_vision_llm, ) ) finally: @@ -1454,6 +1460,7 @@ async def _index_uploaded_folder_files_async( root_folder_id: int, enable_summary: bool, file_mappings: list[dict], + use_vision_llm: bool = False, ): """Run upload-based folder indexing with notification + heartbeat.""" file_count = len(file_mappings) @@ -1503,6 +1510,7 @@ async def _index_uploaded_folder_files_async( enable_summary=enable_summary, file_mappings=file_mappings, on_heartbeat_callback=_heartbeat_progress, + use_vision_llm=use_vision_llm, ) if notification: diff --git a/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py b/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py index 4a49944c2..9f8c1a33a 100644 --- a/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py @@ -164,6 +164,7 @@ async def _download_files_parallel( enable_summary: bool, max_concurrency: int = 3, 
on_heartbeat: HeartbeatCallbackType | None = None, + vision_llm=None, ) -> tuple[list[ConnectorDocument], int]: """Download and ETL files in parallel. Returns (docs, failed_count).""" results: list[ConnectorDocument] = [] @@ -176,7 +177,7 @@ async def _download_files_parallel( nonlocal last_heartbeat, completed_count async with sem: markdown, db_metadata, error = await download_and_extract_content( - dropbox_client, file + dropbox_client, file, vision_llm=vision_llm ) if error or not markdown: file_name = file.get("name", "Unknown") @@ -224,6 +225,7 @@ async def _download_and_index( user_id: str, enable_summary: bool, on_heartbeat: HeartbeatCallbackType | None = None, + vision_llm=None, ) -> tuple[int, int]: """Parallel download then parallel indexing. Returns (batch_indexed, total_failed).""" connector_docs, download_failed = await _download_files_parallel( @@ -234,6 +236,7 @@ async def _download_and_index( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat, + vision_llm=vision_llm, ) batch_indexed = 0 @@ -287,6 +290,7 @@ async def _index_with_delta_sync( max_files: int, on_heartbeat_callback: HeartbeatCallbackType | None = None, enable_summary: bool = True, + vision_llm=None, ) -> tuple[int, int, int, str]: """Delta sync using Dropbox cursor-based change tracking. @@ -359,6 +363,7 @@ async def _index_with_delta_sync( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat_callback, + vision_llm=vision_llm, ) indexed = renamed_count + batch_indexed @@ -384,6 +389,7 @@ async def _index_full_scan( incremental_sync: bool = True, on_heartbeat_callback: HeartbeatCallbackType | None = None, enable_summary: bool = True, + vision_llm=None, ) -> tuple[int, int, int]: """Full scan indexing of a folder. 
@@ -469,6 +475,7 @@ async def _index_full_scan( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat_callback, + vision_llm=vision_llm, ) if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0: @@ -498,6 +505,7 @@ async def _index_selected_files( enable_summary: bool, incremental_sync: bool = True, on_heartbeat: HeartbeatCallbackType | None = None, + vision_llm=None, ) -> tuple[int, int, int, list[str]]: """Index user-selected files using the parallel pipeline.""" page_limit_service = PageLimitService(session) @@ -557,6 +565,7 @@ async def _index_selected_files( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat, + vision_llm=vision_llm, ) if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0: @@ -621,6 +630,13 @@ async def index_dropbox_files( return 0, 0, error_msg, 0 connector_enable_summary = getattr(connector, "enable_summary", True) + connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False) + vision_llm = None + if connector_enable_vision_llm: + from app.services.llm_service import get_vision_llm + + vision_llm = await get_vision_llm(session, search_space_id) + dropbox_client = DropboxClient(session, connector_id) indexing_options = items_dict.get("indexing_options", {}) @@ -650,6 +666,7 @@ async def index_dropbox_files( user_id=user_id, enable_summary=connector_enable_summary, incremental_sync=incremental_sync, + vision_llm=vision_llm, ) total_indexed += indexed total_skipped += skipped @@ -684,6 +701,7 @@ async def index_dropbox_files( log_entry, max_files, enable_summary=connector_enable_summary, + vision_llm=vision_llm, ) folder_cursors[folder_path] = new_cursor total_unsupported += unsup @@ -703,6 +721,7 @@ async def index_dropbox_files( include_subfolders, incremental_sync=incremental_sync, enable_summary=connector_enable_summary, + vision_llm=vision_llm, ) total_unsupported += unsup diff --git 
a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py index b11087fe6..d8f95da63 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py @@ -261,6 +261,7 @@ async def _download_files_parallel( enable_summary: bool, max_concurrency: int = 3, on_heartbeat: HeartbeatCallbackType | None = None, + vision_llm=None, ) -> tuple[list[ConnectorDocument], int]: """Download and ETL files in parallel, returning ConnectorDocuments. @@ -276,7 +277,7 @@ async def _download_files_parallel( nonlocal last_heartbeat, completed_count async with sem: markdown, drive_metadata, error = await download_and_extract_content( - drive_client, file + drive_client, file, vision_llm=vision_llm ) if error or not markdown: file_name = file.get("name", "Unknown") @@ -322,6 +323,7 @@ async def _process_single_file( search_space_id: int, user_id: str, enable_summary: bool = True, + vision_llm=None, ) -> tuple[int, int, int]: """Download, extract, and index a single Drive file via the pipeline. @@ -343,7 +345,7 @@ async def _process_single_file( await page_limit_service.check_page_limit(user_id, estimated_pages) markdown, drive_metadata, error = await download_and_extract_content( - drive_client, file + drive_client, file, vision_llm=vision_llm ) if error or not markdown: logger.warning(f"ETL failed for {file_name}: {error}") @@ -433,6 +435,7 @@ async def _download_and_index( user_id: str, enable_summary: bool, on_heartbeat: HeartbeatCallbackType | None = None, + vision_llm=None, ) -> tuple[int, int]: """Phase 2+3: parallel download then parallel indexing. 
@@ -446,6 +449,7 @@ async def _download_and_index( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat, + vision_llm=vision_llm, ) batch_indexed = 0 @@ -476,6 +480,7 @@ async def _index_selected_files( user_id: str, enable_summary: bool, on_heartbeat: HeartbeatCallbackType | None = None, + vision_llm=None, ) -> tuple[int, int, int, list[str]]: """Index user-selected files using the parallel pipeline. @@ -540,6 +545,7 @@ async def _index_selected_files( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat, + vision_llm=vision_llm, ) if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0: @@ -573,6 +579,7 @@ async def _index_full_scan( include_subfolders: bool = False, on_heartbeat_callback: HeartbeatCallbackType | None = None, enable_summary: bool = True, + vision_llm=None, ) -> tuple[int, int, int]: """Full scan indexing of a folder. @@ -703,6 +710,7 @@ async def _index_full_scan( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat_callback, + vision_llm=vision_llm, ) if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0: @@ -736,6 +744,7 @@ async def _index_with_delta_sync( include_subfolders: bool = False, on_heartbeat_callback: HeartbeatCallbackType | None = None, enable_summary: bool = True, + vision_llm=None, ) -> tuple[int, int, int]: """Delta sync using change tracking. 
@@ -844,6 +853,7 @@ async def _index_with_delta_sync( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat_callback, + vision_llm=vision_llm, ) if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0: @@ -947,6 +957,11 @@ async def index_google_drive_files( ) connector_enable_summary = getattr(connector, "enable_summary", True) + connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False) + vision_llm = None + if connector_enable_vision_llm: + from app.services.llm_service import get_vision_llm + vision_llm = await get_vision_llm(session, search_space_id) drive_client = GoogleDriveClient( session, connector_id, credentials=pre_built_credentials ) @@ -986,6 +1001,7 @@ async def index_google_drive_files( include_subfolders, on_heartbeat_callback, connector_enable_summary, + vision_llm=vision_llm, ) documents_unsupported += du logger.info("Running reconciliation scan after delta sync") @@ -1004,6 +1020,7 @@ async def index_google_drive_files( include_subfolders, on_heartbeat_callback, connector_enable_summary, + vision_llm=vision_llm, ) documents_indexed += ri documents_skipped += rs @@ -1029,6 +1046,7 @@ async def index_google_drive_files( include_subfolders, on_heartbeat_callback, connector_enable_summary, + vision_llm=vision_llm, ) if documents_indexed > 0 or can_use_delta: @@ -1146,6 +1164,11 @@ async def index_google_drive_single_file( ) connector_enable_summary = getattr(connector, "enable_summary", True) + connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False) + vision_llm = None + if connector_enable_vision_llm: + from app.services.llm_service import get_vision_llm + vision_llm = await get_vision_llm(session, search_space_id) drive_client = GoogleDriveClient( session, connector_id, credentials=pre_built_credentials ) @@ -1168,6 +1191,7 @@ async def index_google_drive_single_file( search_space_id, user_id, connector_enable_summary, + vision_llm=vision_llm, ) await session.commit() @@ 
-1278,6 +1302,11 @@ async def index_google_drive_selected_files( return 0, 0, [error_msg] connector_enable_summary = getattr(connector, "enable_summary", True) + connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False) + vision_llm = None + if connector_enable_vision_llm: + from app.services.llm_service import get_vision_llm + vision_llm = await get_vision_llm(session, search_space_id) drive_client = GoogleDriveClient( session, connector_id, credentials=pre_built_credentials ) @@ -1291,6 +1320,7 @@ async def index_google_drive_selected_files( user_id=user_id, enable_summary=connector_enable_summary, on_heartbeat=on_heartbeat_callback, + vision_llm=vision_llm, ) if unsupported > 0: diff --git a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py index a531916e1..2d5f9648d 100644 --- a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py @@ -153,7 +153,7 @@ def scan_folder( return files -async def _read_file_content(file_path: str, filename: str) -> str: +async def _read_file_content(file_path: str, filename: str, *, vision_llm=None) -> str: """Read file content via the unified ETL pipeline. 
All file types (plaintext, audio, direct-convert, document, image) are @@ -162,7 +162,7 @@ async def _read_file_content(file_path: str, filename: str) -> str: from app.etl_pipeline.etl_document import EtlRequest from app.etl_pipeline.etl_pipeline_service import EtlPipelineService - result = await EtlPipelineService().extract( + result = await EtlPipelineService(vision_llm=vision_llm).extract( EtlRequest(file_path=file_path, filename=filename) ) return result.markdown_content @@ -199,12 +199,14 @@ async def _compute_file_content_hash( file_path: str, filename: str, search_space_id: int, + *, + vision_llm=None, ) -> tuple[str, str]: """Read a file (via ETL if needed) and compute its content hash. Returns (content_text, content_hash). """ - content = await _read_file_content(file_path, filename) + content = await _read_file_content(file_path, filename, vision_llm=vision_llm) return content, _content_hash(content, search_space_id) @@ -1268,6 +1270,7 @@ async def index_uploaded_files( enable_summary: bool, file_mappings: list[dict], on_heartbeat_callback: HeartbeatCallbackType | None = None, + use_vision_llm: bool = False, ) -> tuple[int, int, str | None]: """Index files uploaded from the desktop app via temp paths. 
@@ -1304,6 +1307,12 @@ async def index_uploaded_files( pipeline = IndexingPipelineService(session) llm = await get_user_long_context_llm(session, user_id, search_space_id) + vision_llm_instance = None + if use_vision_llm: + from app.services.llm_service import get_vision_llm + + vision_llm_instance = await get_vision_llm(session, search_space_id) + indexed_count = 0 failed_count = 0 errors: list[str] = [] @@ -1351,7 +1360,8 @@ async def index_uploaded_files( try: content, content_hash = await _compute_file_content_hash( - temp_path, filename, search_space_id + temp_path, filename, search_space_id, + vision_llm=vision_llm_instance, ) except Exception as e: logger.warning(f"Could not read {relative_path}: {e}") diff --git a/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py index 06517f542..aa654a9a9 100644 --- a/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py @@ -171,6 +171,7 @@ async def _download_files_parallel( enable_summary: bool, max_concurrency: int = 3, on_heartbeat: HeartbeatCallbackType | None = None, + vision_llm=None, ) -> tuple[list[ConnectorDocument], int]: """Download and ETL files in parallel. Returns (docs, failed_count).""" results: list[ConnectorDocument] = [] @@ -183,7 +184,7 @@ async def _download_files_parallel( nonlocal last_heartbeat, completed_count async with sem: markdown, od_metadata, error = await download_and_extract_content( - onedrive_client, file + onedrive_client, file, vision_llm=vision_llm ) if error or not markdown: file_name = file.get("name", "Unknown") @@ -231,6 +232,7 @@ async def _download_and_index( user_id: str, enable_summary: bool, on_heartbeat: HeartbeatCallbackType | None = None, + vision_llm=None, ) -> tuple[int, int]: """Parallel download then parallel indexing. 
Returns (batch_indexed, total_failed).""" connector_docs, download_failed = await _download_files_parallel( @@ -241,6 +243,7 @@ async def _download_and_index( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat, + vision_llm=vision_llm, ) batch_indexed = 0 @@ -293,6 +296,7 @@ async def _index_selected_files( user_id: str, enable_summary: bool, on_heartbeat: HeartbeatCallbackType | None = None, + vision_llm=None, ) -> tuple[int, int, int, list[str]]: """Index user-selected files using the parallel pipeline.""" page_limit_service = PageLimitService(session) @@ -343,6 +347,7 @@ async def _index_selected_files( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat, + vision_llm=vision_llm, ) if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0: @@ -375,6 +380,7 @@ async def _index_full_scan( include_subfolders: bool = True, on_heartbeat_callback: HeartbeatCallbackType | None = None, enable_summary: bool = True, + vision_llm=None, ) -> tuple[int, int, int]: """Full scan indexing of a folder. @@ -450,6 +456,7 @@ async def _index_full_scan( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat_callback, + vision_llm=vision_llm, ) if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0: @@ -481,6 +488,7 @@ async def _index_with_delta_sync( max_files: int, on_heartbeat_callback: HeartbeatCallbackType | None = None, enable_summary: bool = True, + vision_llm=None, ) -> tuple[int, int, int, str | None]: """Delta sync using OneDrive change tracking. 
@@ -573,6 +581,7 @@ async def _index_with_delta_sync( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat_callback, + vision_llm=vision_llm, ) if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0: @@ -643,6 +652,12 @@ async def index_onedrive_files( return 0, 0, error_msg, 0 connector_enable_summary = getattr(connector, "enable_summary", True) + connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False) + vision_llm = None + if connector_enable_vision_llm: + from app.services.llm_service import get_vision_llm + vision_llm = await get_vision_llm(session, search_space_id) + onedrive_client = OneDriveClient(session, connector_id) indexing_options = items_dict.get("indexing_options", {}) @@ -666,6 +681,7 @@ async def index_onedrive_files( search_space_id=search_space_id, user_id=user_id, enable_summary=connector_enable_summary, + vision_llm=vision_llm, ) total_indexed += indexed total_skipped += skipped @@ -695,6 +711,7 @@ async def index_onedrive_files( log_entry, max_files, enable_summary=connector_enable_summary, + vision_llm=vision_llm, ) total_indexed += indexed total_skipped += skipped @@ -721,6 +738,7 @@ async def index_onedrive_files( max_files, include_subfolders, enable_summary=connector_enable_summary, + vision_llm=vision_llm, ) total_indexed += ri total_skipped += rs @@ -740,6 +758,7 @@ async def index_onedrive_files( max_files, include_subfolders, enable_summary=connector_enable_summary, + vision_llm=vision_llm, ) total_indexed += indexed total_skipped += skipped diff --git a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py index cd06657dc..9364fa1cb 100644 --- a/surfsense_backend/app/tasks/document_processors/file_processors.py +++ b/surfsense_backend/app/tasks/document_processors/file_processors.py @@ -46,6 +46,7 @@ class _ProcessingContext: log_entry: Log connector: dict | None = None notification: 
Notification | None = None + use_vision_llm: bool = False enable_summary: bool = field(init=False) def __post_init__(self) -> None: @@ -134,7 +135,7 @@ async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | No ) vision_llm = None - if etl_classify(ctx.filename) == FileCategory.IMAGE: + if ctx.use_vision_llm and etl_classify(ctx.filename) == FileCategory.IMAGE: from app.services.llm_service import get_vision_llm vision_llm = await get_vision_llm(ctx.session, ctx.search_space_id) @@ -288,6 +289,7 @@ async def process_file_in_background( log_entry: Log, connector: dict | None = None, notification: Notification | None = None, + use_vision_llm: bool = False, ) -> Document | None: ctx = _ProcessingContext( session=session, @@ -299,6 +301,7 @@ async def process_file_in_background( log_entry=log_entry, connector=connector, notification=notification, + use_vision_llm=use_vision_llm, ) try: @@ -349,6 +352,7 @@ async def _extract_file_content( task_logger: TaskLoggingService, log_entry: Log, notification: Notification | None, + use_vision_llm: bool = False, ) -> tuple[str, str]: """ Extract markdown content from a file regardless of type. @@ -396,7 +400,7 @@ async def _extract_file_content( await page_limit_service.check_page_limit(user_id, estimated_pages) vision_llm = None - if category == FileCategory.IMAGE: + if use_vision_llm and category == FileCategory.IMAGE: from app.services.llm_service import get_vision_llm vision_llm = await get_vision_llm(session, search_space_id) @@ -435,6 +439,7 @@ async def process_file_in_background_with_document( connector: dict | None = None, notification: Notification | None = None, should_summarize: bool = False, + use_vision_llm: bool = False, ) -> Document | None: """ Process file and update existing pending document (2-phase pattern). 
@@ -463,6 +468,7 @@ async def process_file_in_background_with_document( task_logger, log_entry, notification, + use_vision_llm=use_vision_llm, ) if not markdown_content: diff --git a/surfsense_backend/tests/integration/document_upload/conftest.py b/surfsense_backend/tests/integration/document_upload/conftest.py index 62f4f6b47..f35d2e605 100644 --- a/surfsense_backend/tests/integration/document_upload/conftest.py +++ b/surfsense_backend/tests/integration/document_upload/conftest.py @@ -69,6 +69,7 @@ class InlineTaskDispatcher: search_space_id: int, user_id: str, should_summarize: bool = False, + use_vision_llm: bool = False, ) -> None: from app.tasks.celery_tasks.document_tasks import ( _process_file_with_document, @@ -82,6 +83,7 @@ class InlineTaskDispatcher: search_space_id, user_id, should_summarize=should_summarize, + use_vision_llm=use_vision_llm, ) diff --git a/surfsense_web/components/assistant-ui/connector-popup.tsx b/surfsense_web/components/assistant-ui/connector-popup.tsx index c41e986d4..84361e25b 100644 --- a/surfsense_web/components/assistant-ui/connector-popup.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup.tsx @@ -98,6 +98,7 @@ export const ConnectorIndicator = forwardRef { startIndexing(editingConnector.id); handleSaveConnector(() => refreshConnectors()); @@ -336,6 +340,7 @@ export const ConnectorIndicator = forwardRef { if (indexingConfig.connectorId) { diff --git a/surfsense_web/components/assistant-ui/connector-popup/components/vision-llm-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/components/vision-llm-config.tsx new file mode 100644 index 000000000..e5ebdbd06 --- /dev/null +++ b/surfsense_web/components/assistant-ui/connector-popup/components/vision-llm-config.tsx @@ -0,0 +1,25 @@ +"use client"; + +import type { FC } from "react"; +import { Switch } from "@/components/ui/switch"; + +interface VisionLLMConfigProps { + enabled: boolean; + onEnabledChange: (enabled: boolean) => void; +} + +export const 
VisionLLMConfig: FC = ({ enabled, onEnabledChange }) => { + return ( +
+
+
+

Enable Vision LLM

+

+ Describes images using AI vision (costly, slower) +

+
+ +
+
+ ); +}; diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx index 7308e1e26..bea5d12e8 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx @@ -15,6 +15,7 @@ import { cn } from "@/lib/utils"; import { DateRangeSelector } from "../../components/date-range-selector"; import { PeriodicSyncConfig } from "../../components/periodic-sync-config"; import { SummaryConfig } from "../../components/summary-config"; +import { VisionLLMConfig } from "../../components/vision-llm-config"; import { getConnectorDisplayName } from "../../tabs/all-connectors-tab"; import { getConnectorConfigComponent } from "../index"; @@ -38,6 +39,7 @@ interface ConnectorEditViewProps { periodicEnabled: boolean; frequencyMinutes: string; enableSummary: boolean; + enableVisionLlm: boolean; isSaving: boolean; isDisconnecting: boolean; isIndexing?: boolean; @@ -47,6 +49,7 @@ interface ConnectorEditViewProps { onPeriodicEnabledChange: (enabled: boolean) => void; onFrequencyChange: (frequency: string) => void; onEnableSummaryChange: (enabled: boolean) => void; + onEnableVisionLlmChange: (enabled: boolean) => void; onSave: () => void; onDisconnect: () => void; onBack: () => void; @@ -62,6 +65,7 @@ export const ConnectorEditView: FC = ({ periodicEnabled, frequencyMinutes, enableSummary, + enableVisionLlm, isSaving, isDisconnecting, isIndexing = false, @@ -71,6 +75,7 @@ export const ConnectorEditView: FC = ({ onPeriodicEnabledChange, onFrequencyChange, onEnableSummaryChange, + onEnableVisionLlmChange, onSave, onDisconnect, onBack, @@ -272,6 +277,14 @@ export const ConnectorEditView: FC = ({ {/* AI Summary toggle */} + {/* Vision LLM toggle - only for file-based connectors */} 
+ {(connector.connector_type === "GOOGLE_DRIVE_CONNECTOR" || + connector.connector_type === "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" || + connector.connector_type === "DROPBOX_CONNECTOR" || + connector.connector_type === "ONEDRIVE_CONNECTOR") && ( + + )} + {/* Date range selector - not shown for file-based connectors (Drive, Dropbox, OneDrive), Webcrawler, GitHub, or Local Folder */} {connector.connector_type !== "GOOGLE_DRIVE_CONNECTOR" && connector.connector_type !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" && diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx index e583cbe17..cb7438cde 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx @@ -10,6 +10,7 @@ import { cn } from "@/lib/utils"; import { DateRangeSelector } from "../../components/date-range-selector"; import { PeriodicSyncConfig } from "../../components/periodic-sync-config"; import { SummaryConfig } from "../../components/summary-config"; +import { VisionLLMConfig } from "../../components/vision-llm-config"; import type { IndexingConfigState } from "../../constants/connector-constants"; import { getConnectorDisplayName } from "../../tabs/all-connectors-tab"; import { getConnectorConfigComponent } from "../index"; @@ -22,6 +23,7 @@ interface IndexingConfigurationViewProps { periodicEnabled: boolean; frequencyMinutes: string; enableSummary: boolean; + enableVisionLlm: boolean; isStartingIndexing: boolean; isFromOAuth?: boolean; onStartDateChange: (date: Date | undefined) => void; @@ -29,6 +31,7 @@ interface IndexingConfigurationViewProps { onPeriodicEnabledChange: (enabled: boolean) => void; onFrequencyChange: (frequency: string) => void; 
onEnableSummaryChange: (enabled: boolean) => void; + onEnableVisionLlmChange: (enabled: boolean) => void; onConfigChange?: (config: Record) => void; onStartIndexing: () => void; onSkip: () => void; @@ -42,6 +45,7 @@ export const IndexingConfigurationView: FC = ({ periodicEnabled, frequencyMinutes, enableSummary, + enableVisionLlm, isStartingIndexing, isFromOAuth = false, onStartDateChange, @@ -49,6 +53,7 @@ export const IndexingConfigurationView: FC = ({ onPeriodicEnabledChange, onFrequencyChange, onEnableSummaryChange, + onEnableVisionLlmChange, onConfigChange, onStartIndexing, onSkip, @@ -158,6 +163,14 @@ export const IndexingConfigurationView: FC = ({ {/* AI Summary toggle */} + {/* Vision LLM toggle - only for file-based connectors */} + {(config.connectorType === "GOOGLE_DRIVE_CONNECTOR" || + config.connectorType === "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" || + config.connectorType === "DROPBOX_CONNECTOR" || + config.connectorType === "ONEDRIVE_CONNECTOR") && ( + + )} + {/* Date range selector - not shown for file-based connectors (Drive, Dropbox, OneDrive), Webcrawler, GitHub, or Local Folder */} {config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" && config.connectorType !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" && diff --git a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts index 6543bbd72..7331549b5 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts +++ b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts @@ -80,6 +80,7 @@ export const useConnectorDialog = () => { const [periodicEnabled, setPeriodicEnabled] = useState(false); const [frequencyMinutes, setFrequencyMinutes] = useState("1440"); const [enableSummary, setEnableSummary] = useState(false); + const [enableVisionLlm, setEnableVisionLlm] = useState(false); // Edit mode state const [editingConnector, 
setEditingConnector] = useState(null); @@ -621,6 +622,7 @@ export const useConnectorDialog = () => { setPeriodicEnabled(false); setFrequencyMinutes("1440"); setEnableSummary(connector.enable_summary ?? false); + setEnableVisionLlm(connector.enable_vision_llm ?? false); setStartDate(undefined); setEndDate(undefined); @@ -763,12 +765,13 @@ export const useConnectorDialog = () => { const endDateStr = endDate ? format(endDate, "yyyy-MM-dd") : undefined; // Update connector with summary, periodic sync settings, and config changes - if (enableSummary || periodicEnabled || indexingConnectorConfig) { - const frequency = periodicEnabled ? parseInt(frequencyMinutes, 10) : undefined; + if (enableSummary || enableVisionLlm || periodicEnabled || indexingConnectorConfig) { + const frequency = periodicEnabled ? parseInt(frequencyMinutes, 10) : undefined; await updateConnector({ id: indexingConfig.connectorId, data: { enable_summary: enableSummary, + enable_vision_llm: enableVisionLlm, ...(periodicEnabled && { periodic_indexing_enabled: true, indexing_frequency_minutes: frequency, @@ -896,6 +899,7 @@ export const useConnectorDialog = () => { periodicEnabled, frequencyMinutes, enableSummary, + enableVisionLlm, indexingConnectorConfig, setIsOpen, ] @@ -960,6 +964,7 @@ export const useConnectorDialog = () => { setPeriodicEnabled(!connector.is_indexable ? false : connector.periodic_indexing_enabled); setFrequencyMinutes(connector.indexing_frequency_minutes?.toString() || "1440"); setEnableSummary(connector.enable_summary ?? false); + setEnableVisionLlm(connector.enable_vision_llm ?? false); setStartDate(undefined); setEndDate(undefined); }, @@ -1038,6 +1043,7 @@ export const useConnectorDialog = () => { data: { name: connectorName || editingConnector.name, enable_summary: enableSummary, + enable_vision_llm: enableVisionLlm, periodic_indexing_enabled: !editingConnector.is_indexable ? false : periodicEnabled, indexing_frequency_minutes: !editingConnector.is_indexable ? 
null : frequency, config: connectorConfig || editingConnector.config, @@ -1172,6 +1178,7 @@ export const useConnectorDialog = () => { periodicEnabled, frequencyMinutes, enableSummary, + enableVisionLlm, getFrequencyLabel, connectorConfig, connectorName, @@ -1332,6 +1339,7 @@ export const useConnectorDialog = () => { setPeriodicEnabled(false); setFrequencyMinutes("1440"); setEnableSummary(false); + setEnableVisionLlm(false); } } }, @@ -1368,6 +1376,7 @@ export const useConnectorDialog = () => { periodicEnabled, frequencyMinutes, enableSummary, + enableVisionLlm, searchSpaceId, allConnectors, viewingAccountsType, @@ -1382,6 +1391,7 @@ export const useConnectorDialog = () => { setPeriodicEnabled, setFrequencyMinutes, setEnableSummary, + setEnableVisionLlm, setConnectorName, // Handlers diff --git a/surfsense_web/components/sources/DocumentUploadTab.tsx b/surfsense_web/components/sources/DocumentUploadTab.tsx index 117d376ec..e7f4451b8 100644 --- a/surfsense_web/components/sources/DocumentUploadTab.tsx +++ b/surfsense_web/components/sources/DocumentUploadTab.tsx @@ -136,6 +136,7 @@ export function DocumentUploadTab({ const [uploadProgress, setUploadProgress] = useState(0); const [accordionValue, setAccordionValue] = useState(""); const [shouldSummarize, setShouldSummarize] = useState(false); + const [useVisionLlm, setUseVisionLlm] = useState(false); const [uploadDocumentMutation] = useAtom(uploadDocumentMutationAtom); const { mutate: uploadDocuments, isPending: isUploading } = uploadDocumentMutation; const fileInputRef = useRef(null); @@ -361,6 +362,7 @@ export function DocumentUploadTab({ relative_paths: batch.map((e) => e.relativePath), root_folder_id: rootFolderId, enable_summary: shouldSummarize, + use_vision_llm: useVisionLlm, } ); @@ -407,6 +409,7 @@ export function DocumentUploadTab({ files: rawFiles, search_space_id: Number(searchSpaceId), should_summarize: shouldSummarize, + use_vision_llm: useVisionLlm, }, { onSuccess: () => { @@ -696,6 +699,16 @@ export 
function DocumentUploadTab({ +
+
+

Enable Vision LLM

+

+ Describes images using AI vision (costly, slower) +

+
+ +
+