diff --git a/surfsense_backend/alembic/versions/123_add_enable_vision_llm_to_connectors.py b/surfsense_backend/alembic/versions/123_add_enable_vision_llm_to_connectors.py new file mode 100644 index 000000000..353e0680e --- /dev/null +++ b/surfsense_backend/alembic/versions/123_add_enable_vision_llm_to_connectors.py @@ -0,0 +1,45 @@ +"""123_add_enable_vision_llm_to_connectors + +Revision ID: 123 +Revises: 122 +Create Date: 2026-04-09 + +Adds enable_vision_llm boolean column to search_source_connectors. +Defaults to False so vision LLM image processing is opt-in. +""" + +from __future__ import annotations + +from collections.abc import Sequence + +import sqlalchemy as sa + +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = "123" +down_revision: str | None = "122" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def upgrade() -> None: + conn = op.get_bind() + existing_columns = [ + col["name"] for col in sa.inspect(conn).get_columns("search_source_connectors") + ] + + if "enable_vision_llm" not in existing_columns: + op.add_column( + "search_source_connectors", + sa.Column( + "enable_vision_llm", + sa.Boolean(), + nullable=False, + server_default=sa.text("false"), + ), + ) + + +def downgrade() -> None: + op.drop_column("search_source_connectors", "enable_vision_llm") diff --git a/surfsense_backend/app/connectors/dropbox/content_extractor.py b/surfsense_backend/app/connectors/dropbox/content_extractor.py index 8cbc3e417..372d2fc82 100644 --- a/surfsense_backend/app/connectors/dropbox/content_extractor.py +++ b/surfsense_backend/app/connectors/dropbox/content_extractor.py @@ -44,6 +44,8 @@ async def _export_paper_content( async def download_and_extract_content( client: DropboxClient, file: dict[str, Any], + *, + vision_llm=None, ) -> tuple[str | None, dict[str, Any], str | None]: """Download a Dropbox file and extract its content as markdown. @@ -91,7 +93,7 @@ async def download_and_extract_content( from app.etl_pipeline.etl_document import EtlRequest from app.etl_pipeline.etl_pipeline_service import EtlPipelineService - result = await EtlPipelineService().extract( + result = await EtlPipelineService(vision_llm=vision_llm).extract( EtlRequest(file_path=temp_file_path, filename=file_name) ) markdown = result.markdown_content diff --git a/surfsense_backend/app/connectors/google_drive/content_extractor.py b/surfsense_backend/app/connectors/google_drive/content_extractor.py index 83ff32e82..86c789b97 100644 --- a/surfsense_backend/app/connectors/google_drive/content_extractor.py +++ b/surfsense_backend/app/connectors/google_drive/content_extractor.py @@ -27,6 +27,8 @@ logger = logging.getLogger(__name__) async def download_and_extract_content( client: GoogleDriveClient, file: dict[str, Any], + *, + vision_llm=None, ) -> tuple[str | None, dict[str, Any], str | None]: """Download a Google Drive file and extract its content as markdown. 
@@ -103,7 +105,9 @@ async def download_and_extract_content( etl_filename = ( file_name + extension if is_google_workspace_file(mime_type) else file_name ) - markdown = await _parse_file_to_markdown(temp_file_path, etl_filename) + markdown = await _parse_file_to_markdown( + temp_file_path, etl_filename, vision_llm=vision_llm + ) return markdown, drive_metadata, None except Exception as e: @@ -115,12 +119,14 @@ async def download_and_extract_content( os.unlink(temp_file_path) -async def _parse_file_to_markdown(file_path: str, filename: str) -> str: +async def _parse_file_to_markdown( + file_path: str, filename: str, *, vision_llm=None +) -> str: """Parse a local file to markdown using the unified ETL pipeline.""" from app.etl_pipeline.etl_document import EtlRequest from app.etl_pipeline.etl_pipeline_service import EtlPipelineService - result = await EtlPipelineService().extract( + result = await EtlPipelineService(vision_llm=vision_llm).extract( EtlRequest(file_path=file_path, filename=filename) ) return result.markdown_content diff --git a/surfsense_backend/app/connectors/onedrive/content_extractor.py b/surfsense_backend/app/connectors/onedrive/content_extractor.py index 2238b8603..3154f2eca 100644 --- a/surfsense_backend/app/connectors/onedrive/content_extractor.py +++ b/surfsense_backend/app/connectors/onedrive/content_extractor.py @@ -16,6 +16,8 @@ logger = logging.getLogger(__name__) async def download_and_extract_content( client: OneDriveClient, file: dict[str, Any], + *, + vision_llm=None, ) -> tuple[str | None, dict[str, Any], str | None]: """Download a OneDrive file and extract its content as markdown. @@ -65,7 +67,9 @@ async def download_and_extract_content( if error: return None, metadata, error - markdown = await _parse_file_to_markdown(temp_file_path, file_name) + markdown = await _parse_file_to_markdown( + temp_file_path, file_name, vision_llm=vision_llm + ) return markdown, metadata, None except Exception as e: @@ -77,12 +81,14 @@ async def download_and_extract_content( os.unlink(temp_file_path) -async def _parse_file_to_markdown(file_path: str, filename: str) -> str: +async def _parse_file_to_markdown( + file_path: str, filename: str, *, vision_llm=None +) -> str: """Parse a local file to markdown using the unified ETL pipeline.""" from app.etl_pipeline.etl_document import EtlRequest from app.etl_pipeline.etl_pipeline_service import EtlPipelineService - result = await EtlPipelineService().extract( + result = await EtlPipelineService(vision_llm=vision_llm).extract( EtlRequest(file_path=file_path, filename=filename) ) return result.markdown_content diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py index 2de6ab572..e69d28ac2 100644 --- a/surfsense_backend/app/db.py +++ b/surfsense_backend/app/db.py @@ -1450,6 +1450,13 @@ class SearchSourceConnector(BaseModel, TimestampMixin): Boolean, nullable=False, default=False, server_default="false" ) + # Vision LLM for image files - disabled by default to save cost/time. + # When enabled, images are described via a vision language model instead + # of falling back to the document parser. 
+ enable_vision_llm = Column( + Boolean, nullable=False, default=False, server_default="false" + ) + # Periodic indexing fields periodic_indexing_enabled = Column(Boolean, nullable=False, default=False) indexing_frequency_minutes = Column(Integer, nullable=True) diff --git a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py index fbd2e4e73..b4438ce4d 100644 --- a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py +++ b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py @@ -15,6 +15,9 @@ from app.etl_pipeline.parsers.plaintext import read_plaintext class EtlPipelineService: """Single pipeline for extracting markdown from files. All callers use this.""" + def __init__(self, *, vision_llm=None): + self._vision_llm = vision_llm + async def extract(self, request: EtlRequest) -> EtlResult: category = classify_file(request.filename) @@ -47,8 +50,45 @@ class EtlPipelineService: content_type="audio", ) + if category == FileCategory.IMAGE: + return await self._extract_image(request) + return await self._extract_document(request) + async def _extract_image(self, request: EtlRequest) -> EtlResult: + if self._vision_llm: + try: + from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm + + content = await parse_with_vision_llm( + request.file_path, request.filename, self._vision_llm + ) + return EtlResult( + markdown_content=content, + etl_service="VISION_LLM", + content_type="image", + ) + except Exception: + logging.warning( + "Vision LLM failed for %s, falling back to document parser", + request.filename, + exc_info=True, + ) + else: + logging.info( + "No vision LLM provided, falling back to document parser for %s", + request.filename, + ) + + try: + return await self._extract_document(request) + except (EtlUnsupportedFileError, EtlServiceUnavailableError): + raise EtlUnsupportedFileError( + f"Cannot process image {request.filename}: vision LLM " + f"{'failed' if self._vision_llm else 'not configured'} and " + f"document parser does not support this format" + ) from None + async def _extract_document(self, request: EtlRequest) -> EtlResult: from pathlib import PurePosixPath diff --git a/surfsense_backend/app/etl_pipeline/file_classifier.py b/surfsense_backend/app/etl_pipeline/file_classifier.py index 4e690bcdc..120369a27 100644 --- a/surfsense_backend/app/etl_pipeline/file_classifier.py +++ b/surfsense_backend/app/etl_pipeline/file_classifier.py @@ -3,6 +3,7 @@ from pathlib import PurePosixPath from app.utils.file_extensions import ( DOCUMENT_EXTENSIONS, + IMAGE_EXTENSIONS, get_document_extensions_for_service, ) @@ -105,6 +106,7 @@ class FileCategory(Enum): PLAINTEXT = "plaintext" AUDIO = "audio" DIRECT_CONVERT = "direct_convert" + IMAGE = "image" UNSUPPORTED = "unsupported" DOCUMENT = "document" @@ -117,6 +119,8 @@ def classify_file(filename: str) -> FileCategory: return FileCategory.AUDIO if suffix in DIRECT_CONVERT_EXTENSIONS: return FileCategory.DIRECT_CONVERT + if suffix in IMAGE_EXTENSIONS: + return FileCategory.IMAGE if suffix in DOCUMENT_EXTENSIONS: return FileCategory.DOCUMENT return FileCategory.UNSUPPORTED @@ -126,12 +130,14 @@ def should_skip_for_service(filename: str, etl_service: str | None) -> bool: """Return True if *filename* cannot be processed by *etl_service*. Plaintext, audio, and direct-convert files are parser-agnostic and never - skipped. Document files are checked against the per-parser extension set. + skipped. 
Image and document files are checked against the per-parser + extension set (images fall back to the document parser when no vision LLM + is available, so the same service constraint applies). """ category = classify_file(filename) if category == FileCategory.UNSUPPORTED: return True - if category == FileCategory.DOCUMENT: + if category in (FileCategory.DOCUMENT, FileCategory.IMAGE): suffix = PurePosixPath(filename).suffix.lower() return suffix not in get_document_extensions_for_service(etl_service) return False diff --git a/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py b/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py new file mode 100644 index 000000000..c80fbca0a --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py @@ -0,0 +1,64 @@ +import asyncio +import base64 +import os + +from langchain_core.messages import HumanMessage + +_PROMPT = ( + "Describe this image in markdown. " + "Transcribe any visible text verbatim. " + "Be concise but complete — let the image content guide the level of detail." +) + +_MAX_IMAGE_BYTES = ( + 5 * 1024 * 1024 +) # 5 MB (Anthropic Claude's limit, the most restrictive) + +_INVOKE_TIMEOUT_SECONDS = 120 + +_EXT_TO_MIME: dict[str, str] = { + ".png": "image/png", + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + ".gif": "image/gif", + ".bmp": "image/bmp", + ".tiff": "image/tiff", + ".tif": "image/tiff", + ".webp": "image/webp", + ".svg": "image/svg+xml", + ".heic": "image/heic", + ".heif": "image/heif", +} + + +def _image_to_data_url(file_path: str) -> str: + file_size = os.path.getsize(file_path) + if file_size > _MAX_IMAGE_BYTES: + raise ValueError( + f"Image too large for vision LLM ({file_size / (1024 * 1024):.1f} MB, " + f"limit {_MAX_IMAGE_BYTES // (1024 * 1024)} MB): {file_path}" + ) + ext = os.path.splitext(file_path)[1].lower() + mime_type = _EXT_TO_MIME.get(ext) + if not mime_type: + raise ValueError(f"Unsupported image extension {ext!r}: {file_path}") + with open(file_path, "rb") as f: + encoded = base64.b64encode(f.read()).decode("ascii") + return f"data:{mime_type};base64,{encoded}" + + +async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str: + data_url = _image_to_data_url(file_path) + message = HumanMessage( + content=[ + {"type": "text", "text": _PROMPT}, + {"type": "image_url", "image_url": {"url": data_url}}, + ] + ) + response = await asyncio.wait_for( + llm.ainvoke([message]), timeout=_INVOKE_TIMEOUT_SECONDS + ) + text = response.content if hasattr(response, "content") else str(response) + if not text or not text.strip(): + raise ValueError(f"Vision LLM returned empty content for {filename}") + return text.strip() diff --git a/surfsense_backend/app/routes/__init__.py b/surfsense_backend/app/routes/__init__.py index 5e3c84c8c..ad40666cd 100644 --- a/surfsense_backend/app/routes/__init__.py +++ b/surfsense_backend/app/routes/__init__.py @@ -13,6 +13,7 @@ from .discord_add_connector_route import router as discord_add_connector_router from .documents_routes import router as documents_router from .dropbox_add_connector_route import router as dropbox_add_connector_router from .editor_routes import router as editor_router +from .export_routes import router as export_router from .folders_routes import router as folders_router from .google_calendar_add_connector_route import ( router as google_calendar_add_connector_router, @@ -58,6 +59,7 @@ router = APIRouter() router.include_router(search_spaces_router) router.include_router(rbac_router) # RBAC routes for roles, members, invites 
router.include_router(editor_router) +router.include_router(export_router) router.include_router(documents_router) router.include_router(folders_router) router.include_router(notes_router) diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py index 53312c647..aa7f98294 100644 --- a/surfsense_backend/app/routes/documents_routes.py +++ b/surfsense_backend/app/routes/documents_routes.py @@ -2,7 +2,7 @@ import asyncio from fastapi import APIRouter, Depends, Form, HTTPException, Query, UploadFile -from pydantic import BaseModel as PydanticBaseModel +from pydantic import BaseModel as PydanticBaseModel, Field from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.future import select from sqlalchemy.orm import selectinload @@ -123,6 +123,7 @@ async def create_documents_file_upload( files: list[UploadFile], search_space_id: int = Form(...), should_summarize: bool = Form(False), + use_vision_llm: bool = Form(False), session: AsyncSession = Depends(get_async_session), user: User = Depends(current_active_user), dispatcher: TaskDispatcher = Depends(get_task_dispatcher), @@ -272,6 +273,7 @@ async def create_documents_file_upload( search_space_id=search_space_id, user_id=str(user.id), should_summarize=should_summarize, + use_vision_llm=use_vision_llm, ) return { @@ -1395,10 +1397,13 @@ class FolderMtimeCheckFile(PydanticBaseModel): mtime: float +_MAX_MTIME_CHECK_FILES = 10_000 + + class FolderMtimeCheckRequest(PydanticBaseModel): folder_name: str search_space_id: int - files: list[FolderMtimeCheckFile] + files: list[FolderMtimeCheckFile] = Field(max_length=_MAX_MTIME_CHECK_FILES) class FolderUnlinkRequest(PydanticBaseModel): @@ -1487,6 +1492,7 @@ async def folder_upload( relative_paths: str = Form(...), root_folder_id: int | None = Form(None), enable_summary: bool = Form(False), + use_vision_llm: bool = Form(False), session: AsyncSession = Depends(get_async_session), user: User = Depends(current_active_user), ): @@ -1531,6 +1537,23 @@ async def folder_upload( f"exceeds the {MAX_FILE_SIZE_BYTES // (1024 * 1024)} MB per-file limit.", ) + from app.services.folder_service import MAX_FOLDER_DEPTH + + max_subfolder_depth = max((p.count("/") for p in rel_paths if "/" in p), default=0) + if 1 + max_subfolder_depth > MAX_FOLDER_DEPTH: + raise HTTPException( + status_code=400, + detail=f"Folder structure too deep: {1 + max_subfolder_depth} levels " + f"exceeds the maximum of {MAX_FOLDER_DEPTH}.", + ) + + if root_folder_id: + root_folder = await session.get(Folder, root_folder_id) + if not root_folder or root_folder.search_space_id != search_space_id: + raise HTTPException( + status_code=404, detail="Root folder not found in this search space" + ) + if not root_folder_id: watched_metadata = { "watched": True, @@ -1565,7 +1588,8 @@ async def folder_upload( async def _read_and_save(file: UploadFile, idx: int) -> dict: content = await file.read() - filename = file.filename or rel_paths[idx].split("/")[-1] + raw_name = file.filename or rel_paths[idx] + filename = raw_name.split("/")[-1] def _write_temp() -> str: with tempfile.NamedTemporaryFile( @@ -1595,6 +1619,7 @@ async def folder_upload( folder_name=folder_name, root_folder_id=root_folder_id, enable_summary=enable_summary, + use_vision_llm=use_vision_llm, file_mappings=list(file_mappings), ) diff --git a/surfsense_backend/app/routes/export_routes.py b/surfsense_backend/app/routes/export_routes.py new file mode 100644 index 000000000..641c7fedb --- /dev/null +++ 
b/surfsense_backend/app/routes/export_routes.py @@ -0,0 +1,61 @@ +"""Routes for exporting knowledge base content as ZIP.""" + +import logging +import os + +from fastapi import APIRouter, Depends, HTTPException, Query +from fastapi.responses import StreamingResponse +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db import Permission, User, get_async_session +from app.services.export_service import build_export_zip +from app.users import current_active_user +from app.utils.rbac import check_permission + +logger = logging.getLogger(__name__) + +router = APIRouter() + + +@router.get("/search-spaces/{search_space_id}/export") +async def export_knowledge_base( + search_space_id: int, + folder_id: int | None = Query(None, description="Export only this folder's subtree"), + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + """Export documents as a ZIP of markdown files preserving folder structure.""" + await check_permission( + session, + user, + search_space_id, + Permission.DOCUMENTS_READ.value, + "You don't have permission to export documents in this search space", + ) + + try: + result = await build_export_zip(session, search_space_id, folder_id) + except ValueError as e: + raise HTTPException(status_code=404, detail=str(e)) from None + + def stream_and_cleanup(): + try: + with open(result.zip_path, "rb") as f: + while chunk := f.read(8192): + yield chunk + finally: + os.unlink(result.zip_path) + + headers = { + "Content-Disposition": f'attachment; filename="{result.export_name}.zip"', + "Content-Length": str(result.zip_size), + } + + if result.skipped_docs: + headers["X-Skipped-Documents"] = str(len(result.skipped_docs)) + + return StreamingResponse( + stream_and_cleanup(), + media_type="application/zip", + headers=headers, + ) diff --git a/surfsense_backend/app/schemas/search_source_connector.py b/surfsense_backend/app/schemas/search_source_connector.py index 1b0ed0b13..aac7b92d5 100644 --- a/surfsense_backend/app/schemas/search_source_connector.py +++ b/surfsense_backend/app/schemas/search_source_connector.py @@ -17,6 +17,7 @@ class SearchSourceConnectorBase(BaseModel): last_indexed_at: datetime | None = None config: dict[str, Any] enable_summary: bool = False + enable_vision_llm: bool = False periodic_indexing_enabled: bool = False indexing_frequency_minutes: int | None = None next_scheduled_at: datetime | None = None @@ -67,6 +68,7 @@ class SearchSourceConnectorUpdate(BaseModel): last_indexed_at: datetime | None = None config: dict[str, Any] | None = None enable_summary: bool | None = None + enable_vision_llm: bool | None = None periodic_indexing_enabled: bool | None = None indexing_frequency_minutes: int | None = None next_scheduled_at: datetime | None = None diff --git a/surfsense_backend/app/services/export_service.py b/surfsense_backend/app/services/export_service.py new file mode 100644 index 000000000..97f952223 --- /dev/null +++ b/surfsense_backend/app/services/export_service.py @@ -0,0 +1,200 @@ +"""Service for exporting knowledge base content as a ZIP archive.""" + +import asyncio +import logging +import os +import tempfile +import zipfile +from dataclasses import dataclass, field + +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.future import select + +from app.db import Chunk, Document, Folder +from app.services.folder_service import get_folder_subtree_ids + +logger = logging.getLogger(__name__) + + +def _sanitize_filename(title: str) -> str: + safe = "".join(c if c.isalnum() or c in " -_." 
else "_" for c in title).strip() + return safe[:80] or "document" + + +def _build_folder_path_map(folders: list[Folder]) -> dict[int, str]: + """Build a mapping of folder_id -> full path string (e.g. 'Research/AI').""" + id_to_folder = {f.id: f for f in folders} + cache: dict[int, str] = {} + + def resolve(folder_id: int) -> str: + if folder_id in cache: + return cache[folder_id] + folder = id_to_folder[folder_id] + safe_name = _sanitize_filename(folder.name) + if folder.parent_id is None or folder.parent_id not in id_to_folder: + cache[folder_id] = safe_name + else: + cache[folder_id] = f"{resolve(folder.parent_id)}/{safe_name}" + return cache[folder_id] + + for f in folders: + resolve(f.id) + + return cache + + +async def _get_document_markdown( + session: AsyncSession, document: Document +) -> str | None: + """Resolve markdown content using the 3-tier fallback: + 1. source_markdown 2. blocknote_document conversion 3. chunk concatenation + """ + if document.source_markdown is not None: + return document.source_markdown + + if document.blocknote_document: + from app.utils.blocknote_to_markdown import blocknote_to_markdown + + md = blocknote_to_markdown(document.blocknote_document) + if md: + return md + + chunk_result = await session.execute( + select(Chunk.content) + .filter(Chunk.document_id == document.id) + .order_by(Chunk.id) + ) + chunks = chunk_result.scalars().all() + if chunks: + return "\n\n".join(chunks) + + return None + + +@dataclass +class ExportResult: + zip_path: str + export_name: str + zip_size: int + skipped_docs: list[str] = field(default_factory=list) + + +async def build_export_zip( + session: AsyncSession, + search_space_id: int, + folder_id: int | None = None, +) -> ExportResult: + """Build a ZIP archive of markdown documents preserving folder structure. + + Returns an ExportResult with the path to the temp ZIP file. + The caller is responsible for streaming and cleaning up the file. + + Raises ValueError if folder_id is provided but not found. 
+ """ + if folder_id is not None: + folder = await session.get(Folder, folder_id) + if not folder or folder.search_space_id != search_space_id: + raise ValueError("Folder not found") + target_folder_ids = set(await get_folder_subtree_ids(session, folder_id)) + else: + target_folder_ids = None + + folder_query = select(Folder).where(Folder.search_space_id == search_space_id) + if target_folder_ids is not None: + folder_query = folder_query.where(Folder.id.in_(target_folder_ids)) + folder_result = await session.execute(folder_query) + folders = list(folder_result.scalars().all()) + + folder_path_map = _build_folder_path_map(folders) + + batch_size = 100 + + base_doc_query = select(Document).where(Document.search_space_id == search_space_id) + if target_folder_ids is not None: + base_doc_query = base_doc_query.where(Document.folder_id.in_(target_folder_ids)) + base_doc_query = base_doc_query.order_by(Document.id) + + fd, tmp_path = tempfile.mkstemp(suffix=".zip") + os.close(fd) + + used_paths: dict[str, int] = {} + skipped_docs: list[str] = [] + is_first_batch = True + + try: + offset = 0 + while True: + batch_query = base_doc_query.limit(batch_size).offset(offset) + batch_result = await session.execute(batch_query) + documents = list(batch_result.scalars().all()) + if not documents: + break + + entries: list[tuple[str, str]] = [] + + for doc in documents: + status = doc.status or {} + state = ( + status.get("state", "ready") + if isinstance(status, dict) + else "ready" + ) + if state in ("pending", "processing"): + skipped_docs.append(doc.title or "Untitled") + continue + + markdown = await _get_document_markdown(session, doc) + if not markdown or not markdown.strip(): + continue + + if doc.folder_id and doc.folder_id in folder_path_map: + dir_path = folder_path_map[doc.folder_id] + else: + dir_path = "" + + base_name = _sanitize_filename(doc.title or "Untitled") + file_path = ( + f"{dir_path}/{base_name}.md" if dir_path else f"{base_name}.md" + ) + + if file_path in used_paths: + used_paths[file_path] += 1 + suffix = used_paths[file_path] + file_path = ( + f"{dir_path}/{base_name}_{suffix}.md" + if dir_path + else f"{base_name}_{suffix}.md" + ) + used_paths[file_path] = used_paths.get(file_path, 0) + 1 + + entries.append((file_path, markdown)) + + if entries: + mode = "w" if is_first_batch else "a" + batch_entries = entries + + def _write_batch(m: str = mode, e: list = batch_entries) -> None: + with zipfile.ZipFile(tmp_path, m, zipfile.ZIP_DEFLATED) as zf: + for path, content in e: + zf.writestr(path, content) + + await asyncio.to_thread(_write_batch) + is_first_batch = False + + offset += batch_size + + export_name = "knowledge-base" + if folder_id is not None and folder_id in folder_path_map: + export_name = _sanitize_filename(folder_path_map[folder_id].split("/")[0]) + + return ExportResult( + zip_path=tmp_path, + export_name=export_name, + zip_size=os.path.getsize(tmp_path), + skipped_docs=skipped_docs, + ) + + except Exception: + if os.path.exists(tmp_path): + os.unlink(tmp_path) + raise diff --git a/surfsense_backend/app/services/task_dispatcher.py b/surfsense_backend/app/services/task_dispatcher.py index 9a6fc7d63..7bb70b406 100644 --- a/surfsense_backend/app/services/task_dispatcher.py +++ b/surfsense_backend/app/services/task_dispatcher.py @@ -19,6 +19,7 @@ class TaskDispatcher(Protocol): search_space_id: int, user_id: str, should_summarize: bool = False, + use_vision_llm: bool = False, ) -> None: ... 
@@ -34,6 +35,7 @@ class CeleryTaskDispatcher: search_space_id: int, user_id: str, should_summarize: bool = False, + use_vision_llm: bool = False, ) -> None: from app.tasks.celery_tasks.document_tasks import ( process_file_upload_with_document_task, @@ -46,6 +48,7 @@ class CeleryTaskDispatcher: search_space_id=search_space_id, user_id=user_id, should_summarize=should_summarize, + use_vision_llm=use_vision_llm, ) diff --git a/surfsense_backend/app/tasks/celery_tasks/document_tasks.py b/surfsense_backend/app/tasks/celery_tasks/document_tasks.py index 62720826f..fc946b4bc 100644 --- a/surfsense_backend/app/tasks/celery_tasks/document_tasks.py +++ b/surfsense_backend/app/tasks/celery_tasks/document_tasks.py @@ -778,6 +778,7 @@ def process_file_upload_with_document_task( search_space_id: int, user_id: str, should_summarize: bool = False, + use_vision_llm: bool = False, ): """ Celery task to process uploaded file with existing pending document. @@ -833,6 +834,7 @@ def process_file_upload_with_document_task( search_space_id, user_id, should_summarize=should_summarize, + use_vision_llm=use_vision_llm, ) ) logger.info( @@ -869,6 +871,7 @@ async def _process_file_with_document( search_space_id: int, user_id: str, should_summarize: bool = False, + use_vision_llm: bool = False, ): """ Process file and update existing pending document status. @@ -971,6 +974,7 @@ async def _process_file_with_document( log_entry=log_entry, notification=notification, should_summarize=should_summarize, + use_vision_llm=use_vision_llm, ) # Update notification on success @@ -1428,6 +1432,7 @@ def index_uploaded_folder_files_task( root_folder_id: int, enable_summary: bool, file_mappings: list[dict], + use_vision_llm: bool = False, ): """Celery task to index files uploaded from the desktop app.""" loop = asyncio.new_event_loop() @@ -1441,6 +1446,7 @@ def index_uploaded_folder_files_task( root_folder_id=root_folder_id, enable_summary=enable_summary, file_mappings=file_mappings, + use_vision_llm=use_vision_llm, ) ) finally: @@ -1454,6 +1460,7 @@ async def _index_uploaded_folder_files_async( root_folder_id: int, enable_summary: bool, file_mappings: list[dict], + use_vision_llm: bool = False, ): """Run upload-based folder indexing with notification + heartbeat.""" file_count = len(file_mappings) @@ -1503,6 +1510,7 @@ async def _index_uploaded_folder_files_async( enable_summary=enable_summary, file_mappings=file_mappings, on_heartbeat_callback=_heartbeat_progress, + use_vision_llm=use_vision_llm, ) if notification: diff --git a/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py b/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py index 4a49944c2..9f8c1a33a 100644 --- a/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py @@ -164,6 +164,7 @@ async def _download_files_parallel( enable_summary: bool, max_concurrency: int = 3, on_heartbeat: HeartbeatCallbackType | None = None, + vision_llm=None, ) -> tuple[list[ConnectorDocument], int]: """Download and ETL files in parallel. 
Returns (docs, failed_count).""" results: list[ConnectorDocument] = [] @@ -176,7 +177,7 @@ async def _download_files_parallel( nonlocal last_heartbeat, completed_count async with sem: markdown, db_metadata, error = await download_and_extract_content( - dropbox_client, file + dropbox_client, file, vision_llm=vision_llm ) if error or not markdown: file_name = file.get("name", "Unknown") @@ -224,6 +225,7 @@ async def _download_and_index( user_id: str, enable_summary: bool, on_heartbeat: HeartbeatCallbackType | None = None, + vision_llm=None, ) -> tuple[int, int]: """Parallel download then parallel indexing. Returns (batch_indexed, total_failed).""" connector_docs, download_failed = await _download_files_parallel( @@ -234,6 +236,7 @@ async def _download_and_index( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat, + vision_llm=vision_llm, ) batch_indexed = 0 @@ -287,6 +290,7 @@ async def _index_with_delta_sync( max_files: int, on_heartbeat_callback: HeartbeatCallbackType | None = None, enable_summary: bool = True, + vision_llm=None, ) -> tuple[int, int, int, str]: """Delta sync using Dropbox cursor-based change tracking. @@ -359,6 +363,7 @@ async def _index_with_delta_sync( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat_callback, + vision_llm=vision_llm, ) indexed = renamed_count + batch_indexed @@ -384,6 +389,7 @@ async def _index_full_scan( incremental_sync: bool = True, on_heartbeat_callback: HeartbeatCallbackType | None = None, enable_summary: bool = True, + vision_llm=None, ) -> tuple[int, int, int]: """Full scan indexing of a folder. @@ -469,6 +475,7 @@ async def _index_full_scan( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat_callback, + vision_llm=vision_llm, ) if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0: @@ -498,6 +505,7 @@ async def _index_selected_files( enable_summary: bool, incremental_sync: bool = True, on_heartbeat: HeartbeatCallbackType | None = None, + vision_llm=None, ) -> tuple[int, int, int, list[str]]: """Index user-selected files using the parallel pipeline.""" page_limit_service = PageLimitService(session) @@ -557,6 +565,7 @@ async def _index_selected_files( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat, + vision_llm=vision_llm, ) if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0: @@ -621,6 +630,13 @@ async def index_dropbox_files( return 0, 0, error_msg, 0 connector_enable_summary = getattr(connector, "enable_summary", True) + connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False) + vision_llm = None + if connector_enable_vision_llm: + from app.services.llm_service import get_vision_llm + + vision_llm = await get_vision_llm(session, search_space_id) + dropbox_client = DropboxClient(session, connector_id) indexing_options = items_dict.get("indexing_options", {}) @@ -650,6 +666,7 @@ async def index_dropbox_files( user_id=user_id, enable_summary=connector_enable_summary, incremental_sync=incremental_sync, + vision_llm=vision_llm, ) total_indexed += indexed total_skipped += skipped @@ -684,6 +701,7 @@ async def index_dropbox_files( log_entry, max_files, enable_summary=connector_enable_summary, + vision_llm=vision_llm, ) folder_cursors[folder_path] = new_cursor total_unsupported += unsup @@ -703,6 +721,7 @@ async def index_dropbox_files( include_subfolders, incremental_sync=incremental_sync, enable_summary=connector_enable_summary, + vision_llm=vision_llm, ) total_unsupported += unsup diff --git 
a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py index b11087fe6..d8f95da63 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py @@ -261,6 +261,7 @@ async def _download_files_parallel( enable_summary: bool, max_concurrency: int = 3, on_heartbeat: HeartbeatCallbackType | None = None, + vision_llm=None, ) -> tuple[list[ConnectorDocument], int]: """Download and ETL files in parallel, returning ConnectorDocuments. @@ -276,7 +277,7 @@ async def _download_files_parallel( nonlocal last_heartbeat, completed_count async with sem: markdown, drive_metadata, error = await download_and_extract_content( - drive_client, file + drive_client, file, vision_llm=vision_llm ) if error or not markdown: file_name = file.get("name", "Unknown") @@ -322,6 +323,7 @@ async def _process_single_file( search_space_id: int, user_id: str, enable_summary: bool = True, + vision_llm=None, ) -> tuple[int, int, int]: """Download, extract, and index a single Drive file via the pipeline. @@ -343,7 +345,7 @@ async def _process_single_file( await page_limit_service.check_page_limit(user_id, estimated_pages) markdown, drive_metadata, error = await download_and_extract_content( - drive_client, file + drive_client, file, vision_llm=vision_llm ) if error or not markdown: logger.warning(f"ETL failed for {file_name}: {error}") @@ -433,6 +435,7 @@ async def _download_and_index( user_id: str, enable_summary: bool, on_heartbeat: HeartbeatCallbackType | None = None, + vision_llm=None, ) -> tuple[int, int]: """Phase 2+3: parallel download then parallel indexing. @@ -446,6 +449,7 @@ async def _download_and_index( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat, + vision_llm=vision_llm, ) batch_indexed = 0 @@ -476,6 +480,7 @@ async def _index_selected_files( user_id: str, enable_summary: bool, on_heartbeat: HeartbeatCallbackType | None = None, + vision_llm=None, ) -> tuple[int, int, int, list[str]]: """Index user-selected files using the parallel pipeline. @@ -540,6 +545,7 @@ async def _index_selected_files( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat, + vision_llm=vision_llm, ) if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0: @@ -573,6 +579,7 @@ async def _index_full_scan( include_subfolders: bool = False, on_heartbeat_callback: HeartbeatCallbackType | None = None, enable_summary: bool = True, + vision_llm=None, ) -> tuple[int, int, int]: """Full scan indexing of a folder. @@ -703,6 +710,7 @@ async def _index_full_scan( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat_callback, + vision_llm=vision_llm, ) if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0: @@ -736,6 +744,7 @@ async def _index_with_delta_sync( include_subfolders: bool = False, on_heartbeat_callback: HeartbeatCallbackType | None = None, enable_summary: bool = True, + vision_llm=None, ) -> tuple[int, int, int]: """Delta sync using change tracking. 
@@ -844,6 +853,7 @@ async def _index_with_delta_sync( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat_callback, + vision_llm=vision_llm, ) if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0: @@ -947,6 +957,11 @@ async def index_google_drive_files( ) connector_enable_summary = getattr(connector, "enable_summary", True) + connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False) + vision_llm = None + if connector_enable_vision_llm: + from app.services.llm_service import get_vision_llm + vision_llm = await get_vision_llm(session, search_space_id) drive_client = GoogleDriveClient( session, connector_id, credentials=pre_built_credentials ) @@ -986,6 +1001,7 @@ async def index_google_drive_files( include_subfolders, on_heartbeat_callback, connector_enable_summary, + vision_llm=vision_llm, ) documents_unsupported += du logger.info("Running reconciliation scan after delta sync") @@ -1004,6 +1020,7 @@ async def index_google_drive_files( include_subfolders, on_heartbeat_callback, connector_enable_summary, + vision_llm=vision_llm, ) documents_indexed += ri documents_skipped += rs @@ -1029,6 +1046,7 @@ async def index_google_drive_files( include_subfolders, on_heartbeat_callback, connector_enable_summary, + vision_llm=vision_llm, ) if documents_indexed > 0 or can_use_delta: @@ -1146,6 +1164,11 @@ async def index_google_drive_single_file( ) connector_enable_summary = getattr(connector, "enable_summary", True) + connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False) + vision_llm = None + if connector_enable_vision_llm: + from app.services.llm_service import get_vision_llm + vision_llm = await get_vision_llm(session, search_space_id) drive_client = GoogleDriveClient( session, connector_id, credentials=pre_built_credentials ) @@ -1168,6 +1191,7 @@ async def index_google_drive_single_file( search_space_id, user_id, connector_enable_summary, + vision_llm=vision_llm, ) await session.commit() @@ -1278,6 +1302,11 @@ async def index_google_drive_selected_files( return 0, 0, [error_msg] connector_enable_summary = getattr(connector, "enable_summary", True) + connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False) + vision_llm = None + if connector_enable_vision_llm: + from app.services.llm_service import get_vision_llm + vision_llm = await get_vision_llm(session, search_space_id) drive_client = GoogleDriveClient( session, connector_id, credentials=pre_built_credentials ) @@ -1291,6 +1320,7 @@ async def index_google_drive_selected_files( user_id=user_id, enable_summary=connector_enable_summary, on_heartbeat=on_heartbeat_callback, + vision_llm=vision_llm, ) if unsupported > 0: diff --git a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py index f503ff864..2d5f9648d 100644 --- a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py @@ -153,16 +153,16 @@ def scan_folder( return files -async def _read_file_content(file_path: str, filename: str) -> str: +async def _read_file_content(file_path: str, filename: str, *, vision_llm=None) -> str: """Read file content via the unified ETL pipeline. - All file types (plaintext, audio, direct-convert, document) are handled - by ``EtlPipelineService``. + All file types (plaintext, audio, direct-convert, document, image) are + handled by ``EtlPipelineService``. 
""" from app.etl_pipeline.etl_document import EtlRequest from app.etl_pipeline.etl_pipeline_service import EtlPipelineService - result = await EtlPipelineService().extract( + result = await EtlPipelineService(vision_llm=vision_llm).extract( EtlRequest(file_path=file_path, filename=filename) ) return result.markdown_content @@ -199,12 +199,14 @@ async def _compute_file_content_hash( file_path: str, filename: str, search_space_id: int, + *, + vision_llm=None, ) -> tuple[str, str]: """Read a file (via ETL if needed) and compute its content hash. Returns (content_text, content_hash). """ - content = await _read_file_content(file_path, filename) + content = await _read_file_content(file_path, filename, vision_llm=vision_llm) return content, _content_hash(content, search_space_id) @@ -704,7 +706,9 @@ async def index_local_folder( try: content, content_hash = await _compute_file_content_hash( - file_path_abs, file_info["relative_path"], search_space_id + file_path_abs, + file_info["relative_path"], + search_space_id, ) except Exception as read_err: logger.warning(f"Could not read {file_path_abs}: {read_err}") @@ -738,7 +742,9 @@ async def index_local_folder( try: content, content_hash = await _compute_file_content_hash( - file_path_abs, file_info["relative_path"], search_space_id + file_path_abs, + file_info["relative_path"], + search_space_id, ) except Exception as read_err: logger.warning(f"Could not read {file_path_abs}: {read_err}") @@ -1264,6 +1270,7 @@ async def index_uploaded_files( enable_summary: bool, file_mappings: list[dict], on_heartbeat_callback: HeartbeatCallbackType | None = None, + use_vision_llm: bool = False, ) -> tuple[int, int, str | None]: """Index files uploaded from the desktop app via temp paths. @@ -1300,6 +1307,12 @@ async def index_uploaded_files( pipeline = IndexingPipelineService(session) llm = await get_user_long_context_llm(session, user_id, search_space_id) + vision_llm_instance = None + if use_vision_llm: + from app.services.llm_service import get_vision_llm + + vision_llm_instance = await get_vision_llm(session, search_space_id) + indexed_count = 0 failed_count = 0 errors: list[str] = [] @@ -1347,7 +1360,8 @@ async def index_uploaded_files( try: content, content_hash = await _compute_file_content_hash( - temp_path, filename, search_space_id + temp_path, filename, search_space_id, + vision_llm=vision_llm_instance, ) except Exception as e: logger.warning(f"Could not read {relative_path}: {e}") diff --git a/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py index 06517f542..aa654a9a9 100644 --- a/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py @@ -171,6 +171,7 @@ async def _download_files_parallel( enable_summary: bool, max_concurrency: int = 3, on_heartbeat: HeartbeatCallbackType | None = None, + vision_llm=None, ) -> tuple[list[ConnectorDocument], int]: """Download and ETL files in parallel. 
Returns (docs, failed_count).""" results: list[ConnectorDocument] = [] @@ -183,7 +184,7 @@ async def _download_files_parallel( nonlocal last_heartbeat, completed_count async with sem: markdown, od_metadata, error = await download_and_extract_content( - onedrive_client, file + onedrive_client, file, vision_llm=vision_llm ) if error or not markdown: file_name = file.get("name", "Unknown") @@ -231,6 +232,7 @@ async def _download_and_index( user_id: str, enable_summary: bool, on_heartbeat: HeartbeatCallbackType | None = None, + vision_llm=None, ) -> tuple[int, int]: """Parallel download then parallel indexing. Returns (batch_indexed, total_failed).""" connector_docs, download_failed = await _download_files_parallel( @@ -241,6 +243,7 @@ async def _download_and_index( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat, + vision_llm=vision_llm, ) batch_indexed = 0 @@ -293,6 +296,7 @@ async def _index_selected_files( user_id: str, enable_summary: bool, on_heartbeat: HeartbeatCallbackType | None = None, + vision_llm=None, ) -> tuple[int, int, int, list[str]]: """Index user-selected files using the parallel pipeline.""" page_limit_service = PageLimitService(session) @@ -343,6 +347,7 @@ async def _index_selected_files( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat, + vision_llm=vision_llm, ) if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0: @@ -375,6 +380,7 @@ async def _index_full_scan( include_subfolders: bool = True, on_heartbeat_callback: HeartbeatCallbackType | None = None, enable_summary: bool = True, + vision_llm=None, ) -> tuple[int, int, int]: """Full scan indexing of a folder. @@ -450,6 +456,7 @@ async def _index_full_scan( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat_callback, + vision_llm=vision_llm, ) if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0: @@ -481,6 +488,7 @@ async def _index_with_delta_sync( max_files: int, on_heartbeat_callback: HeartbeatCallbackType | None = None, enable_summary: bool = True, + vision_llm=None, ) -> tuple[int, int, int, str | None]: """Delta sync using OneDrive change tracking. 
@@ -573,6 +581,7 @@ async def _index_with_delta_sync( user_id=user_id, enable_summary=enable_summary, on_heartbeat=on_heartbeat_callback, + vision_llm=vision_llm, ) if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0: @@ -643,6 +652,12 @@ async def index_onedrive_files( return 0, 0, error_msg, 0 connector_enable_summary = getattr(connector, "enable_summary", True) + connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False) + vision_llm = None + if connector_enable_vision_llm: + from app.services.llm_service import get_vision_llm + vision_llm = await get_vision_llm(session, search_space_id) + onedrive_client = OneDriveClient(session, connector_id) indexing_options = items_dict.get("indexing_options", {}) @@ -666,6 +681,7 @@ async def index_onedrive_files( search_space_id=search_space_id, user_id=user_id, enable_summary=connector_enable_summary, + vision_llm=vision_llm, ) total_indexed += indexed total_skipped += skipped @@ -695,6 +711,7 @@ async def index_onedrive_files( log_entry, max_files, enable_summary=connector_enable_summary, + vision_llm=vision_llm, ) total_indexed += indexed total_skipped += skipped @@ -721,6 +738,7 @@ async def index_onedrive_files( max_files, include_subfolders, enable_summary=connector_enable_summary, + vision_llm=vision_llm, ) total_indexed += ri total_skipped += rs @@ -740,6 +758,7 @@ async def index_onedrive_files( max_files, include_subfolders, enable_summary=connector_enable_summary, + vision_llm=vision_llm, ) total_indexed += indexed total_skipped += skipped diff --git a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py index c765dbd87..9364fa1cb 100644 --- a/surfsense_backend/app/tasks/document_processors/file_processors.py +++ b/surfsense_backend/app/tasks/document_processors/file_processors.py @@ -46,6 +46,7 @@ class _ProcessingContext: log_entry: Log connector: dict | None = None notification: Notification | None = None + use_vision_llm: bool = False enable_summary: bool = field(init=False) def __post_init__(self) -> None: @@ -118,9 +119,13 @@ async def _log_page_divergence( async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | None: - """Extract content from a non-document file (plaintext/direct_convert/audio) via the unified ETL pipeline.""" + """Extract content from a non-document file (plaintext/direct_convert/audio/image) via the unified ETL pipeline.""" from app.etl_pipeline.etl_document import EtlRequest from app.etl_pipeline.etl_pipeline_service import EtlPipelineService + from app.etl_pipeline.file_classifier import ( + FileCategory, + classify_file as etl_classify, + ) await _notify(ctx, "parsing", "Processing file") await ctx.task_logger.log_task_progress( @@ -129,7 +134,13 @@ async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | No {"processing_stage": "extracting"}, ) - etl_result = await EtlPipelineService().extract( + vision_llm = None + if ctx.use_vision_llm and etl_classify(ctx.filename) == FileCategory.IMAGE: + from app.services.llm_service import get_vision_llm + + vision_llm = await get_vision_llm(ctx.session, ctx.search_space_id) + + etl_result = await EtlPipelineService(vision_llm=vision_llm).extract( EtlRequest(file_path=ctx.file_path, filename=ctx.filename) ) @@ -278,6 +289,7 @@ async def process_file_in_background( log_entry: Log, connector: dict | None = None, notification: Notification | None = None, + use_vision_llm: bool = False, ) -> Document | None: 
ctx = _ProcessingContext( session=session, @@ -289,6 +301,7 @@ async def process_file_in_background( log_entry=log_entry, connector=connector, notification=notification, + use_vision_llm=use_vision_llm, ) try: @@ -333,11 +346,13 @@ async def process_file_in_background( async def _extract_file_content( file_path: str, filename: str, + search_space_id: int, session: AsyncSession, user_id: str, task_logger: TaskLoggingService, log_entry: Log, notification: Notification | None, + use_vision_llm: bool = False, ) -> tuple[str, str]: """ Extract markdown content from a file regardless of type. @@ -360,6 +375,7 @@ async def _extract_file_content( FileCategory.PLAINTEXT: "Reading file", FileCategory.DIRECT_CONVERT: "Converting file", FileCategory.AUDIO: "Transcribing audio", + FileCategory.IMAGE: "Analyzing image", FileCategory.UNSUPPORTED: "Unsupported file type", FileCategory.DOCUMENT: "Extracting content", } @@ -383,7 +399,13 @@ async def _extract_file_content( estimated_pages = _estimate_pages_safe(page_limit_service, file_path) await page_limit_service.check_page_limit(user_id, estimated_pages) - result = await EtlPipelineService().extract( + vision_llm = None + if use_vision_llm and category == FileCategory.IMAGE: + from app.services.llm_service import get_vision_llm + + vision_llm = await get_vision_llm(session, search_space_id) + + result = await EtlPipelineService(vision_llm=vision_llm).extract( EtlRequest( file_path=file_path, filename=filename, @@ -417,6 +439,7 @@ async def process_file_in_background_with_document( connector: dict | None = None, notification: Notification | None = None, should_summarize: bool = False, + use_vision_llm: bool = False, ) -> Document | None: """ Process file and update existing pending document (2-phase pattern). @@ -439,11 +462,13 @@ async def process_file_in_background_with_document( markdown_content, etl_service = await _extract_file_content( file_path, filename, + search_space_id, session, user_id, task_logger, log_entry, notification, + use_vision_llm=use_vision_llm, ) if not markdown_content: diff --git a/surfsense_backend/app/utils/file_extensions.py b/surfsense_backend/app/utils/file_extensions.py index 16ac585b7..e8be1b83a 100644 --- a/surfsense_backend/app/utils/file_extensions.py +++ b/surfsense_backend/app/utils/file_extensions.py @@ -7,10 +7,33 @@ Extensions already covered by PLAINTEXT_EXTENSIONS, AUDIO_EXTENSIONS, or DIRECT_CONVERT_EXTENSIONS in file_classifier are NOT repeated here -- these sets are exclusively for the "document" ETL path (Docling / LlamaParse / Unstructured). + +Image extensions intentionally remain in the per-parser sets for fallback +compatibility. IMAGE_EXTENSIONS is used only for routing classification. 
""" from pathlib import PurePosixPath +# --------------------------------------------------------------------------- +# Image extensions (used by file_classifier for routing to vision LLM) +# --------------------------------------------------------------------------- + +IMAGE_EXTENSIONS: frozenset[str] = frozenset( + { + ".png", + ".jpg", + ".jpeg", + ".gif", + ".bmp", + ".tiff", + ".tif", + ".webp", + ".svg", + ".heic", + ".heif", + } +) + # --------------------------------------------------------------------------- # Per-parser document extension sets (from official documentation) # --------------------------------------------------------------------------- diff --git a/surfsense_backend/tests/integration/document_upload/conftest.py b/surfsense_backend/tests/integration/document_upload/conftest.py index 62f4f6b47..f35d2e605 100644 --- a/surfsense_backend/tests/integration/document_upload/conftest.py +++ b/surfsense_backend/tests/integration/document_upload/conftest.py @@ -69,6 +69,7 @@ class InlineTaskDispatcher: search_space_id: int, user_id: str, should_summarize: bool = False, + use_vision_llm: bool = False, ) -> None: from app.tasks.celery_tasks.document_tasks import ( _process_file_with_document, @@ -82,6 +83,7 @@ class InlineTaskDispatcher: search_space_id, user_id, should_summarize=should_summarize, + use_vision_llm=use_vision_llm, ) diff --git a/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py b/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py index f72135d05..9ba87207a 100644 --- a/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py +++ b/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py @@ -168,7 +168,7 @@ async def test_concurrency_bounded_by_semaphore( active = 0 peak = 0 - async def _slow_extract(client, file): + async def _slow_extract(client, file, **kwargs): nonlocal active, peak async with lock: active += 1 @@ -209,7 +209,7 @@ async def test_heartbeat_fires_during_parallel_downloads( monkeypatch.setattr(_mod, "HEARTBEAT_INTERVAL_SECONDS", 0) - async def _slow_extract(client, file): + async def _slow_extract(client, file, **kwargs): await asyncio.sleep(0.05) return _mock_extract_ok(file["id"], file["name"]) diff --git a/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py b/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py index 0ae096361..7e968514c 100644 --- a/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py +++ b/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py @@ -162,7 +162,7 @@ async def test_concurrency_bounded_by_semaphore( active = 0 peak = 0 - async def _slow_extract(client, file): + async def _slow_extract(client, file, **kwargs): nonlocal active, peak async with lock: active += 1 @@ -204,7 +204,7 @@ async def test_heartbeat_fires_during_parallel_downloads( monkeypatch.setattr(_mod, "HEARTBEAT_INTERVAL_SECONDS", 0) - async def _slow_extract(client, file): + async def _slow_extract(client, file, **kwargs): await asyncio.sleep(0.05) return _mock_extract_ok(file["id"], file["name"]) diff --git a/surfsense_backend/tests/unit/connector_indexers/test_onedrive_parallel.py b/surfsense_backend/tests/unit/connector_indexers/test_onedrive_parallel.py index 12a912b03..396d79e73 100644 --- a/surfsense_backend/tests/unit/connector_indexers/test_onedrive_parallel.py +++ b/surfsense_backend/tests/unit/connector_indexers/test_onedrive_parallel.py @@ -162,7 +162,7 @@ async def 
test_concurrency_bounded_by_semaphore( active = 0 peak = 0 - async def _slow_extract(client, file): + async def _slow_extract(client, file, **kwargs): nonlocal active, peak async with lock: active += 1 @@ -203,7 +203,7 @@ async def test_heartbeat_fires_during_parallel_downloads( monkeypatch.setattr(_mod, "HEARTBEAT_INTERVAL_SECONDS", 0) - async def _slow_extract(client, file): + async def _slow_extract(client, file, **kwargs): await asyncio.sleep(0.05) return _mock_extract_ok(file["id"], file["name"]) diff --git a/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py b/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py index 9608b011d..1a94d4263 100644 --- a/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py +++ b/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py @@ -431,7 +431,7 @@ async def test_llamacloud_heif_accepted_only_with_azure_di(tmp_path, mocker): mocker.patch("app.config.config.AZURE_DI_ENDPOINT", None, create=True) mocker.patch("app.config.config.AZURE_DI_KEY", None, create=True) - with pytest.raises(EtlUnsupportedFileError, match="not supported by LLAMACLOUD"): + with pytest.raises(EtlUnsupportedFileError, match="document parser does not support this format"): await EtlPipelineService().extract( EtlRequest(file_path=str(heif_file), filename="photo.heif") ) @@ -549,8 +549,11 @@ def test_unsupported_extensions_classified_correctly(filename): ("doc.docx", "document"), ("slides.pptx", "document"), ("sheet.xlsx", "document"), - ("photo.png", "document"), - ("photo.jpg", "document"), + ("photo.png", "image"), + ("photo.jpg", "image"), + ("photo.webp", "image"), + ("photo.gif", "image"), + ("photo.heic", "image"), ("book.epub", "document"), ("letter.odt", "document"), ("readme.md", "plaintext"), @@ -680,3 +683,57 @@ async def test_extract_eml_with_docling_raises_unsupported(tmp_path, mocker): await EtlPipelineService().extract( EtlRequest(file_path=str(eml_file), filename="mail.eml") ) + + +# --------------------------------------------------------------------------- +# Image extraction via vision LLM +# --------------------------------------------------------------------------- + + +async def test_extract_image_with_vision_llm(tmp_path): + """An image file is analyzed by the vision LLM when provided.""" + from unittest.mock import AsyncMock, MagicMock + + img_file = tmp_path / "photo.png" + img_file.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 50) + + fake_response = MagicMock() + fake_response.content = "# A photo of a sunset over the ocean" + fake_llm = AsyncMock() + fake_llm.ainvoke.return_value = fake_response + + service = EtlPipelineService(vision_llm=fake_llm) + result = await service.extract( + EtlRequest(file_path=str(img_file), filename="photo.png") + ) + + assert result.markdown_content == "# A photo of a sunset over the ocean" + assert result.etl_service == "VISION_LLM" + assert result.content_type == "image" + fake_llm.ainvoke.assert_called_once() + + +async def test_extract_image_falls_back_to_document_without_vision_llm( + tmp_path, mocker +): + """Without a vision LLM, image files fall back to the document parser.""" + mocker.patch("app.config.config.ETL_SERVICE", "DOCLING") + + fake_docling = mocker.AsyncMock() + fake_docling.process_document.return_value = {"content": "# OCR text from image"} + mocker.patch( + "app.services.docling_service.create_docling_service", + return_value=fake_docling, + ) + + img_file = tmp_path / "scan.png" + img_file.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" 
* 50) + + service = EtlPipelineService() + result = await service.extract( + EtlRequest(file_path=str(img_file), filename="scan.png") + ) + + assert result.markdown_content == "# OCR text from image" + assert result.etl_service == "DOCLING" + assert result.content_type == "document" diff --git a/surfsense_backend/tests/unit/utils/test_file_extensions.py b/surfsense_backend/tests/unit/utils/test_file_extensions.py index 43dfef5f0..ccf5eb70f 100644 --- a/surfsense_backend/tests/unit/utils/test_file_extensions.py +++ b/surfsense_backend/tests/unit/utils/test_file_extensions.py @@ -154,3 +154,40 @@ def test_get_extensions_for_none_returns_union(): ) assert get_document_extensions_for_service(None) == DOCUMENT_EXTENSIONS + + +# --------------------------------------------------------------------------- +# IMAGE_EXTENSIONS +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "ext", + [ + ".png", + ".jpg", + ".jpeg", + ".gif", + ".bmp", + ".tiff", + ".tif", + ".webp", + ".svg", + ".heic", + ".heif", + ], +) +def test_image_extensions_contains_expected(ext): + from app.utils.file_extensions import IMAGE_EXTENSIONS + + assert ext in IMAGE_EXTENSIONS + + +def test_image_extensions_are_subset_of_document_extensions(): + """Image extensions used for routing should also be in DOCUMENT_EXTENSIONS for fallback.""" + from app.utils.file_extensions import DOCUMENT_EXTENSIONS, IMAGE_EXTENSIONS + + missing = IMAGE_EXTENSIONS - DOCUMENT_EXTENSIONS + assert not missing, ( + f"Image extensions missing from document sets (breaks fallback): {missing}" + ) diff --git a/surfsense_desktop/.env.example b/surfsense_desktop/.env.example new file mode 100644 index 000000000..e127b99e0 --- /dev/null +++ b/surfsense_desktop/.env.example @@ -0,0 +1,10 @@ +# Electron-specific build-time configuration. +# Set before running pnpm dist:mac / dist:win / dist:linux. + +# The hosted web frontend URL. Used to intercept OAuth redirects and keep them +# inside the desktop app. Set to your production frontend domain. 
+HOSTED_FRONTEND_URL=https://surfsense.net + +# PostHog analytics (leave empty to disable) +POSTHOG_KEY= +POSTHOG_HOST=https://assets.surfsense.com diff --git a/surfsense_desktop/.gitignore b/surfsense_desktop/.gitignore index 4bff253bb..70e5f15b9 100644 --- a/surfsense_desktop/.gitignore +++ b/surfsense_desktop/.gitignore @@ -1,3 +1,4 @@ node_modules/ dist/ release/ +.env \ No newline at end of file diff --git a/surfsense_web/components/assistant-ui/connector-popup.tsx b/surfsense_web/components/assistant-ui/connector-popup.tsx index c41e986d4..84361e25b 100644 --- a/surfsense_web/components/assistant-ui/connector-popup.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup.tsx @@ -98,6 +98,7 @@ export const ConnectorIndicator = forwardRef { startIndexing(editingConnector.id); handleSaveConnector(() => refreshConnectors()); @@ -336,6 +340,7 @@ export const ConnectorIndicator = forwardRef { if (indexingConfig.connectorId) { diff --git a/surfsense_web/components/assistant-ui/connector-popup/components/vision-llm-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/components/vision-llm-config.tsx new file mode 100644 index 000000000..e5ebdbd06 --- /dev/null +++ b/surfsense_web/components/assistant-ui/connector-popup/components/vision-llm-config.tsx @@ -0,0 +1,25 @@ +"use client"; + +import type { FC } from "react"; +import { Switch } from "@/components/ui/switch"; + +interface VisionLLMConfigProps { + enabled: boolean; + onEnabledChange: (enabled: boolean) => void; +} + +export const VisionLLMConfig: FC = ({ enabled, onEnabledChange }) => { + return ( +
+		<div className="flex items-center justify-between gap-3 rounded-lg border p-3">
+			<div className="space-y-0.5">
+				<p className="text-sm font-medium">Enable Vision LLM</p>
+				<p className="text-xs text-muted-foreground">
+					Describes images using AI vision (costly, slower)
+				</p>
+			</div>
+			<Switch checked={enabled} onCheckedChange={onEnabledChange} />
+		</div>
+ ); +}; diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx index 7308e1e26..bea5d12e8 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx @@ -15,6 +15,7 @@ import { cn } from "@/lib/utils"; import { DateRangeSelector } from "../../components/date-range-selector"; import { PeriodicSyncConfig } from "../../components/periodic-sync-config"; import { SummaryConfig } from "../../components/summary-config"; +import { VisionLLMConfig } from "../../components/vision-llm-config"; import { getConnectorDisplayName } from "../../tabs/all-connectors-tab"; import { getConnectorConfigComponent } from "../index"; @@ -38,6 +39,7 @@ interface ConnectorEditViewProps { periodicEnabled: boolean; frequencyMinutes: string; enableSummary: boolean; + enableVisionLlm: boolean; isSaving: boolean; isDisconnecting: boolean; isIndexing?: boolean; @@ -47,6 +49,7 @@ interface ConnectorEditViewProps { onPeriodicEnabledChange: (enabled: boolean) => void; onFrequencyChange: (frequency: string) => void; onEnableSummaryChange: (enabled: boolean) => void; + onEnableVisionLlmChange: (enabled: boolean) => void; onSave: () => void; onDisconnect: () => void; onBack: () => void; @@ -62,6 +65,7 @@ export const ConnectorEditView: FC = ({ periodicEnabled, frequencyMinutes, enableSummary, + enableVisionLlm, isSaving, isDisconnecting, isIndexing = false, @@ -71,6 +75,7 @@ export const ConnectorEditView: FC = ({ onPeriodicEnabledChange, onFrequencyChange, onEnableSummaryChange, + onEnableVisionLlmChange, onSave, onDisconnect, onBack, @@ -272,6 +277,14 @@ export const ConnectorEditView: FC = ({ {/* AI Summary toggle */} + {/* Vision LLM toggle - only for file-based connectors */} + {(connector.connector_type === "GOOGLE_DRIVE_CONNECTOR" || + connector.connector_type === "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" || + connector.connector_type === "DROPBOX_CONNECTOR" || + connector.connector_type === "ONEDRIVE_CONNECTOR") && ( + + )} + {/* Date range selector - not shown for file-based connectors (Drive, Dropbox, OneDrive), Webcrawler, GitHub, or Local Folder */} {connector.connector_type !== "GOOGLE_DRIVE_CONNECTOR" && connector.connector_type !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" && diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx index e583cbe17..cb7438cde 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx @@ -10,6 +10,7 @@ import { cn } from "@/lib/utils"; import { DateRangeSelector } from "../../components/date-range-selector"; import { PeriodicSyncConfig } from "../../components/periodic-sync-config"; import { SummaryConfig } from "../../components/summary-config"; +import { VisionLLMConfig } from "../../components/vision-llm-config"; import type { IndexingConfigState } from "../../constants/connector-constants"; import { getConnectorDisplayName } from "../../tabs/all-connectors-tab"; import { getConnectorConfigComponent } from "../index"; @@ 
-22,6 +23,7 @@ interface IndexingConfigurationViewProps { periodicEnabled: boolean; frequencyMinutes: string; enableSummary: boolean; + enableVisionLlm: boolean; isStartingIndexing: boolean; isFromOAuth?: boolean; onStartDateChange: (date: Date | undefined) => void; @@ -29,6 +31,7 @@ interface IndexingConfigurationViewProps { onPeriodicEnabledChange: (enabled: boolean) => void; onFrequencyChange: (frequency: string) => void; onEnableSummaryChange: (enabled: boolean) => void; + onEnableVisionLlmChange: (enabled: boolean) => void; onConfigChange?: (config: Record) => void; onStartIndexing: () => void; onSkip: () => void; @@ -42,6 +45,7 @@ export const IndexingConfigurationView: FC = ({ periodicEnabled, frequencyMinutes, enableSummary, + enableVisionLlm, isStartingIndexing, isFromOAuth = false, onStartDateChange, @@ -49,6 +53,7 @@ export const IndexingConfigurationView: FC = ({ onPeriodicEnabledChange, onFrequencyChange, onEnableSummaryChange, + onEnableVisionLlmChange, onConfigChange, onStartIndexing, onSkip, @@ -158,6 +163,14 @@ export const IndexingConfigurationView: FC = ({ {/* AI Summary toggle */} + {/* Vision LLM toggle - only for file-based connectors */} + {(config.connectorType === "GOOGLE_DRIVE_CONNECTOR" || + config.connectorType === "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" || + config.connectorType === "DROPBOX_CONNECTOR" || + config.connectorType === "ONEDRIVE_CONNECTOR") && ( + + )} + {/* Date range selector - not shown for file-based connectors (Drive, Dropbox, OneDrive), Webcrawler, GitHub, or Local Folder */} {config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" && config.connectorType !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" && diff --git a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts index 6543bbd72..7331549b5 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts +++ b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts @@ -80,6 +80,7 @@ export const useConnectorDialog = () => { const [periodicEnabled, setPeriodicEnabled] = useState(false); const [frequencyMinutes, setFrequencyMinutes] = useState("1440"); const [enableSummary, setEnableSummary] = useState(false); + const [enableVisionLlm, setEnableVisionLlm] = useState(false); // Edit mode state const [editingConnector, setEditingConnector] = useState(null); @@ -621,6 +622,7 @@ export const useConnectorDialog = () => { setPeriodicEnabled(false); setFrequencyMinutes("1440"); setEnableSummary(connector.enable_summary ?? false); + setEnableVisionLlm(connector.enable_vision_llm ?? false); setStartDate(undefined); setEndDate(undefined); @@ -763,12 +765,13 @@ export const useConnectorDialog = () => { const endDateStr = endDate ? format(endDate, "yyyy-MM-dd") : undefined; // Update connector with summary, periodic sync settings, and config changes - if (enableSummary || periodicEnabled || indexingConnectorConfig) { - const frequency = periodicEnabled ? parseInt(frequencyMinutes, 10) : undefined; + if (enableSummary || enableVisionLlm || periodicEnabled || indexingConnectorConfig) { + const frequency = periodicEnabled ? 
parseInt(frequencyMinutes, 10) : undefined; await updateConnector({ id: indexingConfig.connectorId, data: { enable_summary: enableSummary, + enable_vision_llm: enableVisionLlm, ...(periodicEnabled && { periodic_indexing_enabled: true, indexing_frequency_minutes: frequency, @@ -896,6 +899,7 @@ export const useConnectorDialog = () => { periodicEnabled, frequencyMinutes, enableSummary, + enableVisionLlm, indexingConnectorConfig, setIsOpen, ] @@ -960,6 +964,7 @@ export const useConnectorDialog = () => { setPeriodicEnabled(!connector.is_indexable ? false : connector.periodic_indexing_enabled); setFrequencyMinutes(connector.indexing_frequency_minutes?.toString() || "1440"); setEnableSummary(connector.enable_summary ?? false); + setEnableVisionLlm(connector.enable_vision_llm ?? false); setStartDate(undefined); setEndDate(undefined); }, @@ -1038,6 +1043,7 @@ export const useConnectorDialog = () => { data: { name: connectorName || editingConnector.name, enable_summary: enableSummary, + enable_vision_llm: enableVisionLlm, periodic_indexing_enabled: !editingConnector.is_indexable ? false : periodicEnabled, indexing_frequency_minutes: !editingConnector.is_indexable ? null : frequency, config: connectorConfig || editingConnector.config, @@ -1172,6 +1178,7 @@ export const useConnectorDialog = () => { periodicEnabled, frequencyMinutes, enableSummary, + enableVisionLlm, getFrequencyLabel, connectorConfig, connectorName, @@ -1332,6 +1339,7 @@ export const useConnectorDialog = () => { setPeriodicEnabled(false); setFrequencyMinutes("1440"); setEnableSummary(false); + setEnableVisionLlm(false); } } }, @@ -1368,6 +1376,7 @@ export const useConnectorDialog = () => { periodicEnabled, frequencyMinutes, enableSummary, + enableVisionLlm, searchSpaceId, allConnectors, viewingAccountsType, @@ -1382,6 +1391,7 @@ export const useConnectorDialog = () => { setPeriodicEnabled, setFrequencyMinutes, setEnableSummary, + setEnableVisionLlm, setConnectorName, // Handlers diff --git a/surfsense_web/components/documents/DocumentsFilters.tsx b/surfsense_web/components/documents/DocumentsFilters.tsx index a795b61c7..703c9c3b4 100644 --- a/surfsense_web/components/documents/DocumentsFilters.tsx +++ b/surfsense_web/components/documents/DocumentsFilters.tsx @@ -1,6 +1,6 @@ "use client"; -import { FolderPlus, ListFilter, Search, Upload, X } from "lucide-react"; +import { Download, FolderPlus, ListFilter, Loader2, Search, Upload, X } from "lucide-react"; import { useTranslations } from "next-intl"; import React, { useCallback, useMemo, useRef, useState } from "react"; import { useDocumentUploadDialog } from "@/components/assistant-ui/document-upload-popup"; @@ -20,6 +20,8 @@ export function DocumentsFilters({ onToggleType, activeTypes, onCreateFolder, + onExportKB, + isExporting, }: { typeCounts: Partial>; onSearch: (v: string) => void; @@ -27,6 +29,8 @@ export function DocumentsFilters({ onToggleType: (type: DocumentTypeEnum, checked: boolean) => void; activeTypes: DocumentTypeEnum[]; onCreateFolder?: () => void; + onExportKB?: () => void; + isExporting?: boolean; }) { const t = useTranslations("documents"); const id = React.useId(); @@ -84,6 +88,31 @@ export function DocumentsFilters({ )} + {onExportKB && ( + + + { + e.preventDefault(); + onExportKB(); + }} + > + {isExporting ? ( + + ) : ( + + )} + + + + {isExporting ? 
"Exporting…" : "Export knowledge base"} + + + )} + diff --git a/surfsense_web/components/documents/FolderNode.tsx b/surfsense_web/components/documents/FolderNode.tsx index 7f75f8abf..a1b437983 100644 --- a/surfsense_web/components/documents/FolderNode.tsx +++ b/surfsense_web/components/documents/FolderNode.tsx @@ -4,6 +4,7 @@ import { AlertCircle, ChevronDown, ChevronRight, + Download, Eye, EyeOff, Folder, @@ -80,6 +81,7 @@ interface FolderNodeProps { isWatched?: boolean; onRescan?: (folder: FolderDisplay) => void | Promise; onStopWatching?: (folder: FolderDisplay) => void; + onExportFolder?: (folder: FolderDisplay) => void; } function getDropZone( @@ -120,6 +122,7 @@ export const FolderNode = React.memo(function FolderNode({ isWatched, onRescan, onStopWatching, + onExportFolder, }: FolderNodeProps) { const [renameValue, setRenameValue] = useState(folder.name); const inputRef = useRef(null); @@ -408,6 +411,17 @@ export const FolderNode = React.memo(function FolderNode({ Move to... + {onExportFolder && ( + { + e.stopPropagation(); + onExportFolder(folder); + }} + > + + Export folder + + )} { e.stopPropagation(); @@ -449,6 +463,12 @@ export const FolderNode = React.memo(function FolderNode({ Move to... + {onExportFolder && ( + onExportFolder(folder)}> + + Export folder + + )} onDelete(folder)}> Delete diff --git a/surfsense_web/components/documents/FolderTreeView.tsx b/surfsense_web/components/documents/FolderTreeView.tsx index 6eb53da50..4988e87e7 100644 --- a/surfsense_web/components/documents/FolderTreeView.tsx +++ b/surfsense_web/components/documents/FolderTreeView.tsx @@ -44,6 +44,7 @@ interface FolderTreeViewProps { watchedFolderIds?: Set; onRescanFolder?: (folder: FolderDisplay) => void; onStopWatchingFolder?: (folder: FolderDisplay) => void; + onExportFolder?: (folder: FolderDisplay) => void; } function groupBy(items: T[], keyFn: (item: T) => string | number): Record { @@ -81,6 +82,7 @@ export function FolderTreeView({ watchedFolderIds, onRescanFolder, onStopWatchingFolder, + onExportFolder, }: FolderTreeViewProps) { const foldersByParent = useMemo(() => groupBy(folders, (f) => f.parentId ?? 
"root"), [folders]); @@ -259,6 +261,7 @@ export function FolderTreeView({ isWatched={watchedFolderIds?.has(f.id)} onRescan={onRescanFolder} onStopWatching={onStopWatchingFolder} + onExportFolder={onExportFolder} /> ); diff --git a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx index 8b3a119ae..20b25a2d2 100644 --- a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx +++ b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx @@ -406,6 +406,160 @@ export function DocumentsSidebar({ setFolderPickerOpen(true); }, []); + const [isExportingKB, setIsExportingKB] = useState(false); + const [exportWarningOpen, setExportWarningOpen] = useState(false); + const [exportWarningContext, setExportWarningContext] = useState<{ + type: "kb" | "folder"; + folder?: FolderDisplay; + pendingCount: number; + } | null>(null); + + const pendingDocuments = useMemo( + () => + treeDocuments.filter( + (d) => d.status?.state === "pending" || d.status?.state === "processing" + ), + [treeDocuments] + ); + + const doExport = useCallback(async (url: string, downloadName: string) => { + const response = await authenticatedFetch(url, { method: "GET" }); + if (!response.ok) { + const errorData = await response.json().catch(() => ({ detail: "Export failed" })); + throw new Error(errorData.detail || "Export failed"); + } + + const blob = await response.blob(); + const blobUrl = URL.createObjectURL(blob); + const a = document.createElement("a"); + a.href = blobUrl; + a.download = downloadName; + document.body.appendChild(a); + a.click(); + document.body.removeChild(a); + URL.revokeObjectURL(blobUrl); + }, []); + + const handleExportKB = useCallback(async () => { + if (isExportingKB) return; + + if (pendingDocuments.length > 0) { + setExportWarningContext({ type: "kb", pendingCount: pendingDocuments.length }); + setExportWarningOpen(true); + return; + } + + setIsExportingKB(true); + try { + await doExport( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/export`, + "knowledge-base.zip" + ); + toast.success("Knowledge base exported"); + } catch (err) { + console.error("KB export failed:", err); + toast.error(err instanceof Error ? err.message : "Export failed"); + } finally { + setIsExportingKB(false); + } + }, [searchSpaceId, isExportingKB, pendingDocuments.length, doExport]); + + const handleExportWarningConfirm = useCallback(async () => { + setExportWarningOpen(false); + const ctx = exportWarningContext; + if (!ctx) return; + + if (ctx.type === "kb") { + setIsExportingKB(true); + try { + await doExport( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/export`, + "knowledge-base.zip" + ); + toast.success("Knowledge base exported"); + } catch (err) { + console.error("KB export failed:", err); + toast.error(err instanceof Error ? err.message : "Export failed"); + } finally { + setIsExportingKB(false); + } + } else if (ctx.type === "folder" && ctx.folder) { + setIsExportingKB(true); + try { + const safeName = + ctx.folder.name + .replace(/[^a-zA-Z0-9 _-]/g, "_") + .trim() + .slice(0, 80) || "folder"; + await doExport( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/export?folder_id=${ctx.folder.id}`, + `${safeName}.zip` + ); + toast.success(`Folder "${ctx.folder.name}" exported`); + } catch (err) { + console.error("Folder export failed:", err); + toast.error(err instanceof Error ? 
err.message : "Export failed"); + } finally { + setIsExportingKB(false); + } + } + setExportWarningContext(null); + }, [exportWarningContext, searchSpaceId, doExport]); + + const getPendingCountInSubtree = useCallback( + (folderId: number): number => { + const subtreeIds = new Set(); + function collect(id: number) { + subtreeIds.add(id); + for (const child of foldersByParent[String(id)] ?? []) { + collect(child.id); + } + } + collect(folderId); + return treeDocuments.filter( + (d) => + subtreeIds.has(d.folderId ?? -1) && + (d.status?.state === "pending" || d.status?.state === "processing") + ).length; + }, + [foldersByParent, treeDocuments] + ); + + const handleExportFolder = useCallback( + async (folder: FolderDisplay) => { + const folderPendingCount = getPendingCountInSubtree(folder.id); + if (folderPendingCount > 0) { + setExportWarningContext({ + type: "folder", + folder, + pendingCount: folderPendingCount, + }); + setExportWarningOpen(true); + return; + } + + setIsExportingKB(true); + try { + const safeName = + folder.name + .replace(/[^a-zA-Z0-9 _-]/g, "_") + .trim() + .slice(0, 80) || "folder"; + await doExport( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/export?folder_id=${folder.id}`, + `${safeName}.zip` + ); + toast.success(`Folder "${folder.name}" exported`); + } catch (err) { + console.error("Folder export failed:", err); + toast.error(err instanceof Error ? err.message : "Export failed"); + } finally { + setIsExportingKB(false); + } + }, + [searchSpaceId, getPendingCountInSubtree, doExport] + ); + const handleExportDocument = useCallback( async (doc: DocumentNodeDoc, format: string) => { const safeTitle = @@ -800,6 +954,8 @@ export function DocumentsSidebar({ onToggleType={onToggleType} activeTypes={activeTypes} onCreateFolder={() => handleCreateFolder(null)} + onExportKB={handleExportKB} + isExporting={isExportingKB} /> @@ -855,6 +1011,7 @@ export function DocumentsSidebar({ watchedFolderIds={watchedFolderIds} onRescanFolder={handleRescanFolder} onStopWatchingFolder={handleStopWatching} + onExportFolder={handleExportFolder} /> @@ -933,6 +1090,33 @@ export function DocumentsSidebar({ + + { + if (!open) { + setExportWarningOpen(false); + setExportWarningContext(null); + } + }} + > + + + Some documents are still processing + + {exportWarningContext?.pendingCount} document + {exportWarningContext?.pendingCount !== 1 ? "s are" : " is"} currently being processed + and will be excluded from the export. Do you want to continue? 
+ + + + Cancel + + Export anyway + + + + ); diff --git a/surfsense_web/components/sources/DocumentUploadTab.tsx b/surfsense_web/components/sources/DocumentUploadTab.tsx index 124354a49..e7f4451b8 100644 --- a/surfsense_web/components/sources/DocumentUploadTab.tsx +++ b/surfsense_web/components/sources/DocumentUploadTab.tsx @@ -26,6 +26,7 @@ import { Progress } from "@/components/ui/progress"; import { Spinner } from "@/components/ui/spinner"; import { Switch } from "@/components/ui/switch"; import { useElectronAPI } from "@/hooks/use-platform"; +import { documentsApiService } from "@/lib/apis/documents-api.service"; import { trackDocumentUploadFailure, trackDocumentUploadStarted, @@ -48,6 +49,77 @@ interface FileWithId { file: File; } +interface FolderEntry { + id: string; + file: File; + relativePath: string; +} + +interface FolderUploadData { + folderName: string; + entries: FolderEntry[]; +} + +interface FolderTreeNode { + name: string; + isFolder: boolean; + size?: number; + children: FolderTreeNode[]; +} + +function buildFolderTree(entries: FolderEntry[]): FolderTreeNode[] { + const root: FolderTreeNode = { name: "", isFolder: true, children: [] }; + + for (const entry of entries) { + const parts = entry.relativePath.split("/"); + let current = root; + + for (let i = 0; i < parts.length - 1; i++) { + let child = current.children.find((c) => c.name === parts[i] && c.isFolder); + if (!child) { + child = { name: parts[i], isFolder: true, children: [] }; + current.children.push(child); + } + current = child; + } + + current.children.push({ + name: parts[parts.length - 1], + isFolder: false, + size: entry.file.size, + children: [], + }); + } + + function sortNodes(node: FolderTreeNode) { + node.children.sort((a, b) => { + if (a.isFolder !== b.isFolder) return a.isFolder ? 
-1 : 1; + return a.name.localeCompare(b.name); + }); + for (const child of node.children) sortNodes(child); + } + sortNodes(root); + + return root.children; +} + +function flattenTree( + nodes: FolderTreeNode[], + depth = 0 +): { name: string; isFolder: boolean; depth: number; size?: number }[] { + const items: { name: string; isFolder: boolean; depth: number; size?: number }[] = []; + for (const node of nodes) { + items.push({ name: node.name, isFolder: node.isFolder, depth, size: node.size }); + if (node.isFolder && node.children.length > 0) { + items.push(...flattenTree(node.children, depth + 1)); + } + } + return items; +} + +const FOLDER_BATCH_SIZE_BYTES = 20 * 1024 * 1024; +const FOLDER_BATCH_MAX_FILES = 10; + const MAX_FILE_SIZE_MB = 500; const MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024; @@ -64,11 +136,14 @@ export function DocumentUploadTab({ const [uploadProgress, setUploadProgress] = useState(0); const [accordionValue, setAccordionValue] = useState(""); const [shouldSummarize, setShouldSummarize] = useState(false); + const [useVisionLlm, setUseVisionLlm] = useState(false); const [uploadDocumentMutation] = useAtom(uploadDocumentMutationAtom); const { mutate: uploadDocuments, isPending: isUploading } = uploadDocumentMutation; const fileInputRef = useRef(null); const folderInputRef = useRef(null); const progressIntervalRef = useRef | null>(null); + const [folderUpload, setFolderUpload] = useState(null); + const [isFolderUploading, setIsFolderUploading] = useState(false); useEffect(() => { return () => { @@ -105,6 +180,7 @@ export function DocumentUploadTab({ const valid = incoming.filter((f) => f.size <= MAX_FILE_SIZE_BYTES); if (valid.length === 0) return; + setFolderUpload(null); setFiles((prev) => { const newEntries = valid.map((f) => ({ id: crypto.randomUUID?.() ?? `file-${Date.now()}-${Math.random().toString(36)}`, @@ -159,6 +235,7 @@ export function DocumentUploadTab({ file: new File([fd.data], fd.name, { type: fd.mimeType }), }) ); + setFolderUpload(null); setFiles((prev) => [...prev, ...newFiles]); }, [electronAPI, supportedExtensionsSet, t]); @@ -167,18 +244,35 @@ export function DocumentUploadTab({ const fileList = e.target.files; if (!fileList || fileList.length === 0) return; - const folderFiles = Array.from(fileList).filter((f) => { - const ext = f.name.includes(".") ? `.${f.name.split(".").pop()?.toLowerCase()}` : ""; - return ext !== "" && supportedExtensionsSet.has(ext); - }); + const allFiles = Array.from(fileList); + const firstPath = allFiles[0]?.webkitRelativePath || ""; + const folderName = firstPath.split("/")[0]; - if (folderFiles.length === 0) { + if (!folderName) { + addFiles(allFiles); + e.target.value = ""; + return; + } + + const entries: FolderEntry[] = allFiles + .filter((f) => { + const ext = f.name.includes(".") ? `.${f.name.split(".").pop()?.toLowerCase()}` : ""; + return ext !== "" && supportedExtensionsSet.has(ext); + }) + .map((f) => ({ + id: crypto.randomUUID?.() ?? 
`file-${Date.now()}-${Math.random().toString(36)}`, + file: f, + relativePath: f.webkitRelativePath.substring(folderName.length + 1), + })); + + if (entries.length === 0) { toast.error(t("no_supported_files_in_folder")); e.target.value = ""; return; } - addFiles(folderFiles); + setFiles([]); + setFolderUpload({ folderName, entries }); e.target.value = ""; }, [addFiles, supportedExtensionsSet, t] @@ -192,9 +286,18 @@ export function DocumentUploadTab({ return `${parseFloat((bytes / k ** i).toFixed(2))} ${sizes[i]}`; }; - const totalFileSize = files.reduce((total, entry) => total + entry.file.size, 0); + const totalFileSize = folderUpload + ? folderUpload.entries.reduce((total, entry) => total + entry.file.size, 0) + : files.reduce((total, entry) => total + entry.file.size, 0); - const hasContent = files.length > 0; + const fileCount = folderUpload ? folderUpload.entries.length : files.length; + const hasContent = files.length > 0 || folderUpload !== null; + const isAnyUploading = isUploading || isFolderUploading; + + const folderTreeItems = useMemo(() => { + if (!folderUpload) return []; + return flattenTree(buildFolderTree(folderUpload.entries)); + }, [folderUpload]); const handleAccordionChange = useCallback( (value: string) => { @@ -204,7 +307,95 @@ export function DocumentUploadTab({ [onAccordionStateChange] ); + const handleFolderUpload = async () => { + if (!folderUpload) return; + + setUploadProgress(0); + setIsFolderUploading(true); + const total = folderUpload.entries.length; + trackDocumentUploadStarted(Number(searchSpaceId), total, totalFileSize); + + try { + const batches: FolderEntry[][] = []; + let currentBatch: FolderEntry[] = []; + let currentSize = 0; + + for (const entry of folderUpload.entries) { + const size = entry.file.size; + + if (size >= FOLDER_BATCH_SIZE_BYTES) { + if (currentBatch.length > 0) { + batches.push(currentBatch); + currentBatch = []; + currentSize = 0; + } + batches.push([entry]); + continue; + } + + if ( + currentBatch.length >= FOLDER_BATCH_MAX_FILES || + currentSize + size > FOLDER_BATCH_SIZE_BYTES + ) { + batches.push(currentBatch); + currentBatch = []; + currentSize = 0; + } + + currentBatch.push(entry); + currentSize += size; + } + + if (currentBatch.length > 0) { + batches.push(currentBatch); + } + + let rootFolderId: number | null = null; + let uploaded = 0; + + for (const batch of batches) { + const result = await documentsApiService.folderUploadFiles( + batch.map((e) => e.file), + { + folder_name: folderUpload.folderName, + search_space_id: Number(searchSpaceId), + relative_paths: batch.map((e) => e.relativePath), + root_folder_id: rootFolderId, + enable_summary: shouldSummarize, + use_vision_llm: useVisionLlm, + } + ); + + if (result.root_folder_id && !rootFolderId) { + rootFolderId = result.root_folder_id; + } + + uploaded += batch.length; + setUploadProgress(Math.round((uploaded / total) * 100)); + } + + trackDocumentUploadSuccess(Number(searchSpaceId), total); + toast(t("upload_initiated"), { description: t("upload_initiated_desc") }); + setFolderUpload(null); + onSuccess?.(); + } catch (error) { + const message = error instanceof Error ? 
error.message : "Upload failed"; + trackDocumentUploadFailure(Number(searchSpaceId), message); + toast(t("upload_error"), { + description: `${t("upload_error_desc")}: ${message}`, + }); + } finally { + setIsFolderUploading(false); + setUploadProgress(0); + } + }; + const handleUpload = async () => { + if (folderUpload) { + await handleFolderUpload(); + return; + } + setUploadProgress(0); trackDocumentUploadStarted(Number(searchSpaceId), files.length, totalFileSize); @@ -218,6 +409,7 @@ export function DocumentUploadTab({ files: rawFiles, search_space_id: Number(searchSpaceId), should_summarize: shouldSummarize, + use_vision_llm: useVisionLlm, }, { onSuccess: () => { @@ -341,28 +533,35 @@ export function DocumentUploadTab({ ) ) : ( - + {renderBrowseButton({ fullWidth: true })} + + )} @@ -398,55 +597,92 @@ export function DocumentUploadTab({ {/* FILES SELECTED */} - {files.length > 0 && ( + {hasContent && (

- {t("selected_files", { count: files.length })} - - {formatFileSize(totalFileSize)} + {folderUpload ? ( + <> + + {folderUpload.folderName} + + {folderUpload.entries.length}{" "} + {folderUpload.entries.length === 1 ? "file" : "files"} + + {formatFileSize(totalFileSize)} + + ) : ( + <> + {t("selected_files", { count: files.length })} + + {formatFileSize(totalFileSize)} + + )}

- {files.map((entry) => ( -
- - {entry.file.name.split(".").pop() || "?"} - - {entry.file.name} - - {formatFileSize(entry.file.size)} - - -
- ))} + {folderUpload + ? folderTreeItems.map((item, i) => ( +
+ {item.isFolder ? ( + + ) : ( + + )} + {item.name} + {!item.isFolder && item.size != null && ( + + {formatFileSize(item.size)} + + )} +
+ )) + : files.map((entry) => ( +
+ + {entry.file.name.split(".").pop() || "?"} + + {entry.file.name} + + {formatFileSize(entry.file.size)} + + +
+ ))}
- {isUploading && ( + {isAnyUploading && (
- {t("uploading_files")} + {folderUpload ? t("uploading_folder") : t("uploading_files")} {Math.round(uploadProgress)}%
@@ -463,19 +699,31 @@ export function DocumentUploadTab({
+					<div className="flex items-center justify-between gap-3 rounded-lg border p-3">
+						<div className="space-y-0.5">
+							<p className="text-sm font-medium">Enable Vision LLM</p>
+							<p className="text-xs text-muted-foreground">
+								Describes images using AI vision (costly, slower)
+							</p>
+						</div>
+						<Switch checked={useVisionLlm} onCheckedChange={setUseVisionLlm} />
+					</div>
+ diff --git a/surfsense_web/contracts/types/connector.types.ts b/surfsense_web/contracts/types/connector.types.ts index b83e05dcc..61d5ffc94 100644 --- a/surfsense_web/contracts/types/connector.types.ts +++ b/surfsense_web/contracts/types/connector.types.ts @@ -44,6 +44,7 @@ export const searchSourceConnector = z.object({ last_indexed_at: z.string().nullable(), config: z.record(z.string(), z.any()), enable_summary: z.boolean().default(false), + enable_vision_llm: z.boolean().default(false), periodic_indexing_enabled: z.boolean(), indexing_frequency_minutes: z.number().nullable(), next_scheduled_at: z.string().nullable(), @@ -98,6 +99,7 @@ export const createConnectorRequest = z.object({ last_indexed_at: true, config: true, enable_summary: true, + enable_vision_llm: true, periodic_indexing_enabled: true, indexing_frequency_minutes: true, next_scheduled_at: true, @@ -123,6 +125,7 @@ export const updateConnectorRequest = z.object({ last_indexed_at: true, config: true, enable_summary: true, + enable_vision_llm: true, periodic_indexing_enabled: true, indexing_frequency_minutes: true, next_scheduled_at: true, diff --git a/surfsense_web/contracts/types/document.types.ts b/surfsense_web/contracts/types/document.types.ts index a8a1a11d4..13175267c 100644 --- a/surfsense_web/contracts/types/document.types.ts +++ b/surfsense_web/contracts/types/document.types.ts @@ -148,6 +148,7 @@ export const uploadDocumentRequest = z.object({ files: z.array(z.instanceof(File)), search_space_id: z.number(), should_summarize: z.boolean().default(false), + use_vision_llm: z.boolean().default(false), }); export const uploadDocumentResponse = z.object({ diff --git a/surfsense_web/lib/apis/documents-api.service.ts b/surfsense_web/lib/apis/documents-api.service.ts index 584f2e212..407d8b644 100644 --- a/surfsense_web/lib/apis/documents-api.service.ts +++ b/surfsense_web/lib/apis/documents-api.service.ts @@ -127,7 +127,7 @@ class DocumentsApiService { throw new ValidationError(`Invalid request: ${errorMessage}`); } - const { files, search_space_id, should_summarize } = parsedRequest.data; + const { files, search_space_id, should_summarize, use_vision_llm } = parsedRequest.data; const UPLOAD_BATCH_SIZE = 5; const batches: File[][] = []; @@ -146,6 +146,7 @@ class DocumentsApiService { for (const file of batch) formData.append("files", file); formData.append("search_space_id", String(search_space_id)); formData.append("should_summarize", String(should_summarize)); + formData.append("use_vision_llm", String(use_vision_llm)); const controller = new AbortController(); const timeoutId = setTimeout(() => controller.abort(), 120_000); @@ -442,6 +443,7 @@ class DocumentsApiService { relative_paths: string[]; root_folder_id?: number | null; enable_summary?: boolean; + use_vision_llm?: boolean; }, signal?: AbortSignal ): Promise<{ message: string; status: string; root_folder_id: number; file_count: number }> => { @@ -456,6 +458,7 @@ class DocumentsApiService { formData.append("root_folder_id", String(metadata.root_folder_id)); } formData.append("enable_summary", String(metadata.enable_summary ?? false)); + formData.append("use_vision_llm", String(metadata.use_vision_llm ?? 
false)); const totalSize = files.reduce((acc, f) => acc + f.size, 0); const timeoutMs = Math.min(Math.max((totalSize / (1024 * 1024)) * 5000, 30_000), 600_000); diff --git a/surfsense_web/messages/en.json b/surfsense_web/messages/en.json index 4672fc5d4..ed38b8da3 100644 --- a/surfsense_web/messages/en.json +++ b/surfsense_web/messages/en.json @@ -396,7 +396,11 @@ "supported_file_types": "Supported File Types", "file_too_large": "File Too Large", "file_too_large_desc": "\"{name}\" exceeds the {maxMB}MB per-file limit.", - "no_supported_files_in_folder": "No supported file types found in the selected folder." + "no_supported_files_in_folder": "No supported file types found in the selected folder.", + "uploading_folder": "Uploading folder…", + "upload_folder_button": "Upload Folder ({count} {count, plural, one {file} other {files}})", + "select_files_or_folder": "Select files or folder", + "tap_select_files_or_folder": "Tap to select files or folder" }, "add_webpage": { "title": "Add Webpages for Crawling", diff --git a/surfsense_web/messages/es.json b/surfsense_web/messages/es.json index f41aba2cc..920642b9b 100644 --- a/surfsense_web/messages/es.json +++ b/surfsense_web/messages/es.json @@ -396,7 +396,11 @@ "supported_file_types": "Tipos de archivo soportados", "file_too_large": "Archivo demasiado grande", "file_too_large_desc": "\"{name}\" excede el límite de {maxMB} MB por archivo.", - "no_supported_files_in_folder": "No se encontraron tipos de archivo compatibles en la carpeta seleccionada." + "no_supported_files_in_folder": "No se encontraron tipos de archivo compatibles en la carpeta seleccionada.", + "uploading_folder": "Subiendo carpeta…", + "upload_folder_button": "Subir carpeta ({count} {count, plural, one {archivo} other {archivos}})", + "select_files_or_folder": "Seleccionar archivos o carpeta", + "tap_select_files_or_folder": "Toca para seleccionar archivos o carpeta" }, "add_webpage": { "title": "Agregar páginas web para rastreo", diff --git a/surfsense_web/messages/hi.json b/surfsense_web/messages/hi.json index 514acc06f..1e1069b0b 100644 --- a/surfsense_web/messages/hi.json +++ b/surfsense_web/messages/hi.json @@ -396,7 +396,11 @@ "supported_file_types": "समर्थित फ़ाइल प्रकार", "file_too_large": "फ़ाइल बहुत बड़ी है", "file_too_large_desc": "\"{name}\" प्रति फ़ाइल {maxMB}MB की सीमा से अधिक है।", - "no_supported_files_in_folder": "चयनित फ़ोल्डर में कोई समर्थित फ़ाइल प्रकार नहीं मिला।" + "no_supported_files_in_folder": "चयनित फ़ोल्डर में कोई समर्थित फ़ाइल प्रकार नहीं मिला।", + "uploading_folder": "फ़ोल्डर अपलोड हो रहा है…", + "upload_folder_button": "फ़ोल्डर अपलोड करें ({count} {count, plural, one {फ़ाइल} other {फ़ाइलें}})", + "select_files_or_folder": "फ़ाइलें या फ़ोल्डर चुनें", + "tap_select_files_or_folder": "फ़ाइलें या फ़ोल्डर चुनने के लिए टैप करें" }, "add_webpage": { "title": "क्रॉलिंग के लिए वेबपेज जोड़ें", diff --git a/surfsense_web/messages/pt.json b/surfsense_web/messages/pt.json index 421033810..d4f879486 100644 --- a/surfsense_web/messages/pt.json +++ b/surfsense_web/messages/pt.json @@ -396,7 +396,11 @@ "supported_file_types": "Tipos de arquivo suportados", "file_too_large": "Arquivo muito grande", "file_too_large_desc": "\"{name}\" excede o limite de {maxMB} MB por arquivo.", - "no_supported_files_in_folder": "Nenhum tipo de arquivo suportado encontrado na pasta selecionada." 
+ "no_supported_files_in_folder": "Nenhum tipo de arquivo suportado encontrado na pasta selecionada.", + "uploading_folder": "Enviando pasta…", + "upload_folder_button": "Enviar pasta ({count} {count, plural, one {arquivo} other {arquivos}})", + "select_files_or_folder": "Selecionar arquivos ou pasta", + "tap_select_files_or_folder": "Toque para selecionar arquivos ou pasta" }, "add_webpage": { "title": "Adicionar páginas web para rastreamento", diff --git a/surfsense_web/messages/zh.json b/surfsense_web/messages/zh.json index 1e931726b..ac3acc650 100644 --- a/surfsense_web/messages/zh.json +++ b/surfsense_web/messages/zh.json @@ -380,7 +380,11 @@ "supported_file_types": "支持的文件类型", "file_too_large": "文件过大", "file_too_large_desc": "\"{name}\" 超过了每个文件 {maxMB}MB 的限制。", - "no_supported_files_in_folder": "所选文件夹中没有找到支持的文件类型。" + "no_supported_files_in_folder": "所选文件夹中没有找到支持的文件类型。", + "uploading_folder": "正在上传文件夹…", + "upload_folder_button": "上传文件夹({count}个文件)", + "select_files_or_folder": "选择文件或文件夹", + "tap_select_files_or_folder": "点击选择文件或文件夹" }, "add_webpage": { "title": "添加网页爬取",