Merge pull request #1207 from CREDO23/feat/kb-export-and-folder-upload
Some checks are pending
Build and Push Docker Images / tag_release (push) Waiting to run
Build and Push Docker Images / build (./surfsense_backend, ./surfsense_backend/Dockerfile, backend, surfsense-backend, ubuntu-24.04-arm, linux/arm64, arm64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_backend, ./surfsense_backend/Dockerfile, backend, surfsense-backend, ubuntu-latest, linux/amd64, amd64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_web, ./surfsense_web/Dockerfile, web, surfsense-web, ubuntu-24.04-arm, linux/arm64, arm64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_web, ./surfsense_web/Dockerfile, web, surfsense-web, ubuntu-latest, linux/amd64, amd64) (push) Blocked by required conditions
Build and Push Docker Images / create_manifest (backend, surfsense-backend) (push) Blocked by required conditions
Build and Push Docker Images / create_manifest (web, surfsense-web) (push) Blocked by required conditions

[Feat] KB Export, Folder Upload & Vision LLM for Image Processing
This commit is contained in:
Rohan Verma 2026-04-11 13:56:57 -07:00 committed by GitHub
commit 61b3f0d7e3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
47 changed files with 1399 additions and 107 deletions

View file

@ -0,0 +1,45 @@
"""123_add_enable_vision_llm_to_connectors
Revision ID: 123
Revises: 122
Create Date: 2026-04-09
Adds enable_vision_llm boolean column to search_source_connectors.
Defaults to False so vision LLM image processing is opt-in.
"""
from __future__ import annotations
from collections.abc import Sequence
import sqlalchemy as sa
from alembic import op
# revision identifiers, used by Alembic.
revision: str = "123"
down_revision: str | None = "122"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
    """Add the enable_vision_llm column to search_source_connectors.

    The column-existence check makes the migration idempotent: if the
    column was already created out-of-band, the migration is a no-op.
    """
    bind = op.get_bind()
    current_columns = {
        column["name"]
        for column in sa.inspect(bind).get_columns("search_source_connectors")
    }
    if "enable_vision_llm" in current_columns:
        return
    op.add_column(
        "search_source_connectors",
        sa.Column(
            "enable_vision_llm",
            sa.Boolean(),
            nullable=False,
            # Opt-in: defaults to false so image processing never runs unasked.
            server_default=sa.text("false"),
        ),
    )
def downgrade() -> None:
    """Drop the enable_vision_llm column added in this revision."""
    op.drop_column("search_source_connectors", "enable_vision_llm")

View file

@ -44,6 +44,8 @@ async def _export_paper_content(
async def download_and_extract_content(
client: DropboxClient,
file: dict[str, Any],
*,
vision_llm=None,
) -> tuple[str | None, dict[str, Any], str | None]:
"""Download a Dropbox file and extract its content as markdown.
@ -91,7 +93,7 @@ async def download_and_extract_content(
from app.etl_pipeline.etl_document import EtlRequest
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
result = await EtlPipelineService().extract(
result = await EtlPipelineService(vision_llm=vision_llm).extract(
EtlRequest(file_path=temp_file_path, filename=file_name)
)
markdown = result.markdown_content

View file

@ -27,6 +27,8 @@ logger = logging.getLogger(__name__)
async def download_and_extract_content(
client: GoogleDriveClient,
file: dict[str, Any],
*,
vision_llm=None,
) -> tuple[str | None, dict[str, Any], str | None]:
"""Download a Google Drive file and extract its content as markdown.
@ -103,7 +105,9 @@ async def download_and_extract_content(
etl_filename = (
file_name + extension if is_google_workspace_file(mime_type) else file_name
)
markdown = await _parse_file_to_markdown(temp_file_path, etl_filename)
markdown = await _parse_file_to_markdown(
temp_file_path, etl_filename, vision_llm=vision_llm
)
return markdown, drive_metadata, None
except Exception as e:
@ -115,12 +119,14 @@ async def download_and_extract_content(
os.unlink(temp_file_path)
async def _parse_file_to_markdown(
    file_path: str, filename: str, *, vision_llm=None
) -> str:
    """Parse a local file to markdown using the unified ETL pipeline.

    Args:
        file_path: Path to the downloaded file on local disk.
        filename: Original filename, used by the pipeline for format detection.
        vision_llm: Optional vision language model forwarded to the pipeline
            so image files can be described instead of parsed as documents.

    Returns:
        The extracted markdown content.
    """
    # Function-scope imports match the file's existing lazy-import pattern
    # for the ETL pipeline.
    from app.etl_pipeline.etl_document import EtlRequest
    from app.etl_pipeline.etl_pipeline_service import EtlPipelineService

    result = await EtlPipelineService(vision_llm=vision_llm).extract(
        EtlRequest(file_path=file_path, filename=filename)
    )
    return result.markdown_content

View file

@ -16,6 +16,8 @@ logger = logging.getLogger(__name__)
async def download_and_extract_content(
client: OneDriveClient,
file: dict[str, Any],
*,
vision_llm=None,
) -> tuple[str | None, dict[str, Any], str | None]:
"""Download a OneDrive file and extract its content as markdown.
@ -65,7 +67,9 @@ async def download_and_extract_content(
if error:
return None, metadata, error
markdown = await _parse_file_to_markdown(temp_file_path, file_name)
markdown = await _parse_file_to_markdown(
temp_file_path, file_name, vision_llm=vision_llm
)
return markdown, metadata, None
except Exception as e:
@ -77,12 +81,14 @@ async def download_and_extract_content(
os.unlink(temp_file_path)
async def _parse_file_to_markdown(
    file_path: str, filename: str, *, vision_llm=None
) -> str:
    """Parse a local file to markdown using the unified ETL pipeline.

    Args:
        file_path: Path to the downloaded file on local disk.
        filename: Original filename, used by the pipeline for format detection.
        vision_llm: Optional vision language model forwarded to the pipeline
            so image files can be described instead of parsed as documents.

    Returns:
        The extracted markdown content.
    """
    # Function-scope imports match the file's existing lazy-import pattern
    # for the ETL pipeline.
    from app.etl_pipeline.etl_document import EtlRequest
    from app.etl_pipeline.etl_pipeline_service import EtlPipelineService

    result = await EtlPipelineService(vision_llm=vision_llm).extract(
        EtlRequest(file_path=file_path, filename=filename)
    )
    return result.markdown_content

View file

@ -1450,6 +1450,13 @@ class SearchSourceConnector(BaseModel, TimestampMixin):
Boolean, nullable=False, default=False, server_default="false"
)
# Vision LLM for image files - disabled by default to save cost/time.
# When enabled, images are described via a vision language model instead
# of falling back to the document parser.
enable_vision_llm = Column(
Boolean, nullable=False, default=False, server_default="false"
)
# Periodic indexing fields
periodic_indexing_enabled = Column(Boolean, nullable=False, default=False)
indexing_frequency_minutes = Column(Integer, nullable=True)

View file

@ -15,6 +15,9 @@ from app.etl_pipeline.parsers.plaintext import read_plaintext
class EtlPipelineService:
"""Single pipeline for extracting markdown from files. All callers use this."""
def __init__(self, *, vision_llm=None):
    """Store the optional vision LLM used when extracting image files."""
    self._vision_llm = vision_llm
async def extract(self, request: EtlRequest) -> EtlResult:
category = classify_file(request.filename)
@ -47,8 +50,45 @@ class EtlPipelineService:
content_type="audio",
)
if category == FileCategory.IMAGE:
return await self._extract_image(request)
return await self._extract_document(request)
async def _extract_image(self, request: EtlRequest) -> EtlResult:
    """Extract markdown for an image file, preferring the vision LLM.

    Falls back to the document parser when no vision LLM is configured or
    when the vision call raises; if the fallback also cannot handle the
    file, raises EtlUnsupportedFileError.
    """
    if not self._vision_llm:
        logging.info(
            "No vision LLM provided, falling back to document parser for %s",
            request.filename,
        )
    else:
        try:
            from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm

            description = await parse_with_vision_llm(
                request.file_path, request.filename, self._vision_llm
            )
            return EtlResult(
                markdown_content=description,
                etl_service="VISION_LLM",
                content_type="image",
            )
        except Exception:
            # Any vision failure degrades to the document-parser path below.
            logging.warning(
                "Vision LLM failed for %s, falling back to document parser",
                request.filename,
                exc_info=True,
            )
    try:
        return await self._extract_document(request)
    except (EtlUnsupportedFileError, EtlServiceUnavailableError):
        raise EtlUnsupportedFileError(
            f"Cannot process image {request.filename}: vision LLM "
            f"{'failed' if self._vision_llm else 'not configured'} and "
            f"document parser does not support this format"
        ) from None
async def _extract_document(self, request: EtlRequest) -> EtlResult:
from pathlib import PurePosixPath

View file

@ -3,6 +3,7 @@ from pathlib import PurePosixPath
from app.utils.file_extensions import (
DOCUMENT_EXTENSIONS,
IMAGE_EXTENSIONS,
get_document_extensions_for_service,
)
@ -105,6 +106,7 @@ class FileCategory(Enum):
PLAINTEXT = "plaintext"
AUDIO = "audio"
DIRECT_CONVERT = "direct_convert"
IMAGE = "image"
UNSUPPORTED = "unsupported"
DOCUMENT = "document"
@ -117,6 +119,8 @@ def classify_file(filename: str) -> FileCategory:
return FileCategory.AUDIO
if suffix in DIRECT_CONVERT_EXTENSIONS:
return FileCategory.DIRECT_CONVERT
if suffix in IMAGE_EXTENSIONS:
return FileCategory.IMAGE
if suffix in DOCUMENT_EXTENSIONS:
return FileCategory.DOCUMENT
return FileCategory.UNSUPPORTED
def should_skip_for_service(filename: str, etl_service: str | None) -> bool:
    """Return True if *filename* cannot be processed by *etl_service*.

    Plaintext, audio, and direct-convert files are parser-agnostic and never
    skipped. Image and document files are checked against the per-parser
    extension set (images fall back to the document parser when no vision LLM
    is available, so the same service constraint applies).
    """
    category = classify_file(filename)
    if category == FileCategory.UNSUPPORTED:
        return True
    if category in (FileCategory.DOCUMENT, FileCategory.IMAGE):
        suffix = PurePosixPath(filename).suffix.lower()
        return suffix not in get_document_extensions_for_service(etl_service)
    return False

View file

@ -0,0 +1,64 @@
import asyncio
import base64
import os
from langchain_core.messages import HumanMessage
# Prompt sent alongside every image; asks for a markdown description with
# verbatim transcription of visible text.
_PROMPT = (
    "Describe this image in markdown. "
    "Transcribe any visible text verbatim. "
    "Be concise but complete — let the image content guide the level of detail."
)

_MAX_IMAGE_BYTES = (
    5 * 1024 * 1024
)  # 5 MB (Anthropic Claude's limit, the most restrictive)

# Overall cap on a single vision-model invocation.
_INVOKE_TIMEOUT_SECONDS = 120

# Supported image extensions mapped to their MIME types for the data URL.
_EXT_TO_MIME: dict[str, str] = {
    ".png": "image/png",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".gif": "image/gif",
    ".bmp": "image/bmp",
    ".tiff": "image/tiff",
    ".tif": "image/tiff",
    ".webp": "image/webp",
    ".svg": "image/svg+xml",
    ".heic": "image/heic",
    ".heif": "image/heif",
}


def _image_to_data_url(file_path: str) -> str:
    """Encode the image at *file_path* as a base64 ``data:`` URL.

    Raises ValueError when the file exceeds the size limit or carries an
    extension with no known MIME type.
    """
    size_bytes = os.path.getsize(file_path)
    if size_bytes > _MAX_IMAGE_BYTES:
        raise ValueError(
            f"Image too large for vision LLM ({size_bytes / (1024 * 1024):.1f} MB, "
            f"limit {_MAX_IMAGE_BYTES // (1024 * 1024)} MB): {file_path}"
        )
    extension = os.path.splitext(file_path)[1].lower()
    mime_type = _EXT_TO_MIME.get(extension)
    if mime_type is None:
        raise ValueError(f"Unsupported image extension {extension!r}: {file_path}")
    with open(file_path, "rb") as image_file:
        encoded = base64.b64encode(image_file.read()).decode("ascii")
    return f"data:{mime_type};base64,{encoded}"
async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
    """Describe the image at *file_path* in markdown via the vision model.

    Args:
        file_path: Local path of the image file to encode and send.
        filename: Original filename, used only in error messages.
        llm: A chat model exposing an async ``ainvoke`` method that accepts
            multimodal (text + image_url) message content.

    Returns:
        The stripped markdown description produced by the model.

    Raises:
        ValueError: If the image cannot be encoded or the model returns
            empty content.
        asyncio.TimeoutError: If the invocation exceeds the timeout.
    """
    data_url = _image_to_data_url(file_path)
    message = HumanMessage(
        content=[
            {"type": "text", "text": _PROMPT},
            {"type": "image_url", "image_url": {"url": data_url}},
        ]
    )
    # Guard against hung providers with an overall invocation timeout.
    response = await asyncio.wait_for(
        llm.ainvoke([message]), timeout=_INVOKE_TIMEOUT_SECONDS
    )
    text = response.content if hasattr(response, "content") else str(response)
    if not text or not text.strip():
        # Bug fix: interpolate the actual filename — the original f-string
        # contained a literal "(unknown)" and never used the parameter,
        # leaving the offending file unidentified in logs.
        raise ValueError(f"Vision LLM returned empty content for {filename}")
    return text.strip()

View file

@ -13,6 +13,7 @@ from .discord_add_connector_route import router as discord_add_connector_router
from .documents_routes import router as documents_router
from .dropbox_add_connector_route import router as dropbox_add_connector_router
from .editor_routes import router as editor_router
from .export_routes import router as export_router
from .folders_routes import router as folders_router
from .google_calendar_add_connector_route import (
router as google_calendar_add_connector_router,
@ -58,6 +59,7 @@ router = APIRouter()
router.include_router(search_spaces_router)
router.include_router(rbac_router) # RBAC routes for roles, members, invites
router.include_router(editor_router)
router.include_router(export_router)
router.include_router(documents_router)
router.include_router(folders_router)
router.include_router(notes_router)

View file

@ -2,7 +2,7 @@
import asyncio
from fastapi import APIRouter, Depends, Form, HTTPException, Query, UploadFile
from pydantic import BaseModel as PydanticBaseModel
from pydantic import BaseModel as PydanticBaseModel, Field
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from sqlalchemy.orm import selectinload
@ -123,6 +123,7 @@ async def create_documents_file_upload(
files: list[UploadFile],
search_space_id: int = Form(...),
should_summarize: bool = Form(False),
use_vision_llm: bool = Form(False),
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
dispatcher: TaskDispatcher = Depends(get_task_dispatcher),
@ -272,6 +273,7 @@ async def create_documents_file_upload(
search_space_id=search_space_id,
user_id=str(user.id),
should_summarize=should_summarize,
use_vision_llm=use_vision_llm,
)
return {
@ -1395,10 +1397,13 @@ class FolderMtimeCheckFile(PydanticBaseModel):
mtime: float
_MAX_MTIME_CHECK_FILES = 10_000
class FolderMtimeCheckRequest(PydanticBaseModel):
    """Request payload for checking folder file mtimes against indexed state."""

    folder_name: str
    search_space_id: int
    # Bounded to keep a single request payload from growing unreasonably large.
    files: list[FolderMtimeCheckFile] = Field(max_length=_MAX_MTIME_CHECK_FILES)
class FolderUnlinkRequest(PydanticBaseModel):
@ -1487,6 +1492,7 @@ async def folder_upload(
relative_paths: str = Form(...),
root_folder_id: int | None = Form(None),
enable_summary: bool = Form(False),
use_vision_llm: bool = Form(False),
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
@ -1531,6 +1537,23 @@ async def folder_upload(
f"exceeds the {MAX_FILE_SIZE_BYTES // (1024 * 1024)} MB per-file limit.",
)
from app.services.folder_service import MAX_FOLDER_DEPTH
max_subfolder_depth = max((p.count("/") for p in rel_paths if "/" in p), default=0)
if 1 + max_subfolder_depth > MAX_FOLDER_DEPTH:
raise HTTPException(
status_code=400,
detail=f"Folder structure too deep: {1 + max_subfolder_depth} levels "
f"exceeds the maximum of {MAX_FOLDER_DEPTH}.",
)
if root_folder_id:
root_folder = await session.get(Folder, root_folder_id)
if not root_folder or root_folder.search_space_id != search_space_id:
raise HTTPException(
status_code=404, detail="Root folder not found in this search space"
)
if not root_folder_id:
watched_metadata = {
"watched": True,
@ -1565,7 +1588,8 @@ async def folder_upload(
async def _read_and_save(file: UploadFile, idx: int) -> dict:
content = await file.read()
filename = file.filename or rel_paths[idx].split("/")[-1]
raw_name = file.filename or rel_paths[idx]
filename = raw_name.split("/")[-1]
def _write_temp() -> str:
with tempfile.NamedTemporaryFile(
@ -1595,6 +1619,7 @@ async def folder_upload(
folder_name=folder_name,
root_folder_id=root_folder_id,
enable_summary=enable_summary,
use_vision_llm=use_vision_llm,
file_mappings=list(file_mappings),
)

View file

@ -0,0 +1,61 @@
"""Routes for exporting knowledge base content as ZIP."""
import logging
import os
from fastapi import APIRouter, Depends, HTTPException, Query
from fastapi.responses import StreamingResponse
from sqlalchemy.ext.asyncio import AsyncSession
from app.db import Permission, User, get_async_session
from app.services.export_service import build_export_zip
from app.users import current_active_user
from app.utils.rbac import check_permission
logger = logging.getLogger(__name__)
router = APIRouter()
@router.get("/search-spaces/{search_space_id}/export")
async def export_knowledge_base(
    search_space_id: int,
    folder_id: int | None = Query(None, description="Export only this folder's subtree"),
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """Export documents as a ZIP of markdown files preserving folder structure."""
    await check_permission(
        session,
        user,
        search_space_id,
        Permission.DOCUMENTS_READ.value,
        "You don't have permission to export documents in this search space",
    )
    try:
        result = await build_export_zip(session, search_space_id, folder_id)
    except ValueError as e:
        # A missing/foreign folder surfaces as a plain 404.
        raise HTTPException(status_code=404, detail=str(e)) from None

    def stream_and_cleanup():
        # Stream the archive in fixed-size pieces and always delete the
        # temp file once the generator finishes (or is closed early).
        try:
            with open(result.zip_path, "rb") as zip_file:
                while piece := zip_file.read(8192):
                    yield piece
        finally:
            os.unlink(result.zip_path)

    headers = {
        "Content-Disposition": f'attachment; filename="{result.export_name}.zip"',
        "Content-Length": str(result.zip_size),
    }
    if result.skipped_docs:
        headers["X-Skipped-Documents"] = str(len(result.skipped_docs))
    return StreamingResponse(
        stream_and_cleanup(),
        media_type="application/zip",
        headers=headers,
    )

View file

@ -17,6 +17,7 @@ class SearchSourceConnectorBase(BaseModel):
last_indexed_at: datetime | None = None
config: dict[str, Any]
enable_summary: bool = False
enable_vision_llm: bool = False
periodic_indexing_enabled: bool = False
indexing_frequency_minutes: int | None = None
next_scheduled_at: datetime | None = None
@ -67,6 +68,7 @@ class SearchSourceConnectorUpdate(BaseModel):
last_indexed_at: datetime | None = None
config: dict[str, Any] | None = None
enable_summary: bool | None = None
enable_vision_llm: bool | None = None
periodic_indexing_enabled: bool | None = None
indexing_frequency_minutes: int | None = None
next_scheduled_at: datetime | None = None

View file

@ -0,0 +1,200 @@
"""Service for exporting knowledge base content as a ZIP archive."""
import asyncio
import logging
import os
import tempfile
import zipfile
from dataclasses import dataclass, field
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from app.db import Chunk, Document, Folder
from app.services.folder_service import get_folder_subtree_ids
logger = logging.getLogger(__name__)
def _sanitize_filename(title: str) -> str:
    """Map *title* to a filesystem-safe name.

    Alphanumerics plus space, hyphen, underscore, and dot survive; every
    other character becomes "_". The result is stripped, capped at 80
    characters, and falls back to "document" when empty.
    """
    allowed_punctuation = " -_."
    cleaned = "".join(
        ch if ch.isalnum() or ch in allowed_punctuation else "_" for ch in title
    ).strip()
    return cleaned[:80] or "document"
def _build_folder_path_map(folders: "list[Folder]") -> dict[int, str]:
    """Build a mapping of folder_id -> full path string (e.g. 'Research/AI').

    A folder whose parent is absent from *folders* is treated as a root,
    so a subtree export is naturally rooted at the subtree's top folder.
    """
    by_id = {folder.id: folder for folder in folders}
    paths: dict[int, str] = {}

    def path_of(fid: int) -> str:
        # Memoized recursive resolution up the parent chain.
        if fid not in paths:
            node = by_id[fid]
            safe_name = _sanitize_filename(node.name)
            if node.parent_id is None or node.parent_id not in by_id:
                paths[fid] = safe_name
            else:
                paths[fid] = f"{path_of(node.parent_id)}/{safe_name}"
        return paths[fid]

    for folder in folders:
        path_of(folder.id)
    return paths
async def _get_document_markdown(
    session: AsyncSession, document: Document
) -> str | None:
    """Resolve markdown content using the 3-tier fallback:

    1. source_markdown 2. blocknote_document conversion 3. chunk concatenation

    Returns None when no tier yields content.
    """
    # Tier 1: verbatim source markdown, if stored.
    if document.source_markdown is not None:
        return document.source_markdown

    # Tier 2: convert the BlockNote editor document, if present.
    if document.blocknote_document:
        from app.utils.blocknote_to_markdown import blocknote_to_markdown

        converted = blocknote_to_markdown(document.blocknote_document)
        if converted:
            return converted

    # Tier 3: stitch indexed chunks back together in id order.
    rows = await session.execute(
        select(Chunk.content)
        .filter(Chunk.document_id == document.id)
        .order_by(Chunk.id)
    )
    chunk_texts = rows.scalars().all()
    return "\n\n".join(chunk_texts) if chunk_texts else None
@dataclass
class ExportResult:
    """Outcome of a knowledge-base export.

    Carries the temp ZIP path (the caller streams then deletes it), the
    suggested download name, the archive size in bytes, and the titles of
    documents skipped because they were still pending/processing.
    """

    zip_path: str
    export_name: str
    zip_size: int
    skipped_docs: list[str] = field(default_factory=list)
async def build_export_zip(
    session: AsyncSession,
    search_space_id: int,
    folder_id: int | None = None,
) -> ExportResult:
    """Build a ZIP archive of markdown documents preserving folder structure.

    Returns an ExportResult with the path to the temp ZIP file.
    The caller is responsible for streaming and cleaning up the file.
    Raises ValueError if folder_id is provided but not found.
    """
    # Scope: the whole search space, or one folder's subtree.
    if folder_id is not None:
        folder = await session.get(Folder, folder_id)
        if not folder or folder.search_space_id != search_space_id:
            raise ValueError("Folder not found")
        target_folder_ids = set(await get_folder_subtree_ids(session, folder_id))
    else:
        target_folder_ids = None

    folder_query = select(Folder).where(Folder.search_space_id == search_space_id)
    if target_folder_ids is not None:
        folder_query = folder_query.where(Folder.id.in_(target_folder_ids))
    folders = list((await session.execute(folder_query)).scalars().all())
    folder_path_map = _build_folder_path_map(folders)

    # Documents are paged so a large space never sits in memory all at once.
    batch_size = 100
    base_doc_query = select(Document).where(Document.search_space_id == search_space_id)
    if target_folder_ids is not None:
        base_doc_query = base_doc_query.where(Document.folder_id.in_(target_folder_ids))
    base_doc_query = base_doc_query.order_by(Document.id)

    # The ZIP is assembled in a temp file; on success its ownership passes
    # to the caller, who must delete it after streaming.
    fd, tmp_path = tempfile.mkstemp(suffix=".zip")
    os.close(fd)

    used_paths: dict[str, int] = {}
    skipped_docs: list[str] = []
    is_first_batch = True
    try:
        offset = 0
        while True:
            page = await session.execute(
                base_doc_query.limit(batch_size).offset(offset)
            )
            documents = list(page.scalars().all())
            if not documents:
                break

            entries: list[tuple[str, str]] = []
            for doc in documents:
                status = doc.status or {}
                state = (
                    status.get("state", "ready")
                    if isinstance(status, dict)
                    else "ready"
                )
                # In-flight documents have no stable content yet — skip them
                # and report their titles to the caller.
                if state in ("pending", "processing"):
                    skipped_docs.append(doc.title or "Untitled")
                    continue

                markdown = await _get_document_markdown(session, doc)
                if not markdown or not markdown.strip():
                    continue

                if doc.folder_id and doc.folder_id in folder_path_map:
                    dir_path = folder_path_map[doc.folder_id]
                else:
                    dir_path = ""
                base_name = _sanitize_filename(doc.title or "Untitled")
                file_path = (
                    f"{dir_path}/{base_name}.md" if dir_path else f"{base_name}.md"
                )
                # De-duplicate colliding archive paths with a numeric suffix.
                if file_path in used_paths:
                    used_paths[file_path] += 1
                    suffix = used_paths[file_path]
                    file_path = (
                        f"{dir_path}/{base_name}_{suffix}.md"
                        if dir_path
                        else f"{base_name}_{suffix}.md"
                    )
                used_paths[file_path] = used_paths.get(file_path, 0) + 1
                entries.append((file_path, markdown))

            if entries:
                # Open in append mode after the first write so earlier
                # members survive subsequent batches.
                zip_mode = "w" if is_first_batch else "a"
                batch_entries = entries

                def _write_batch(m: str = zip_mode, e: list = batch_entries) -> None:
                    with zipfile.ZipFile(tmp_path, m, zipfile.ZIP_DEFLATED) as zf:
                        for member_path, member_content in e:
                            zf.writestr(member_path, member_content)

                # Offload the blocking zipfile I/O so the event loop stays free.
                await asyncio.to_thread(_write_batch)
                is_first_batch = False

            offset += batch_size

        export_name = "knowledge-base"
        if folder_id is not None and folder_id in folder_path_map:
            # A subtree export is named after its root folder (the subtree
            # root resolves to a single path segment in the map).
            export_name = _sanitize_filename(folder_path_map[folder_id].split("/")[0])
        return ExportResult(
            zip_path=tmp_path,
            export_name=export_name,
            zip_size=os.path.getsize(tmp_path),
            skipped_docs=skipped_docs,
        )
    except Exception:
        # Never leak the temp file on failure.
        if os.path.exists(tmp_path):
            os.unlink(tmp_path)
        raise

View file

@ -19,6 +19,7 @@ class TaskDispatcher(Protocol):
search_space_id: int,
user_id: str,
should_summarize: bool = False,
use_vision_llm: bool = False,
) -> None: ...
@ -34,6 +35,7 @@ class CeleryTaskDispatcher:
search_space_id: int,
user_id: str,
should_summarize: bool = False,
use_vision_llm: bool = False,
) -> None:
from app.tasks.celery_tasks.document_tasks import (
process_file_upload_with_document_task,
@ -46,6 +48,7 @@ class CeleryTaskDispatcher:
search_space_id=search_space_id,
user_id=user_id,
should_summarize=should_summarize,
use_vision_llm=use_vision_llm,
)

View file

@ -778,6 +778,7 @@ def process_file_upload_with_document_task(
search_space_id: int,
user_id: str,
should_summarize: bool = False,
use_vision_llm: bool = False,
):
"""
Celery task to process uploaded file with existing pending document.
@ -833,6 +834,7 @@ def process_file_upload_with_document_task(
search_space_id,
user_id,
should_summarize=should_summarize,
use_vision_llm=use_vision_llm,
)
)
logger.info(
@ -869,6 +871,7 @@ async def _process_file_with_document(
search_space_id: int,
user_id: str,
should_summarize: bool = False,
use_vision_llm: bool = False,
):
"""
Process file and update existing pending document status.
@ -971,6 +974,7 @@ async def _process_file_with_document(
log_entry=log_entry,
notification=notification,
should_summarize=should_summarize,
use_vision_llm=use_vision_llm,
)
# Update notification on success
@ -1428,6 +1432,7 @@ def index_uploaded_folder_files_task(
root_folder_id: int,
enable_summary: bool,
file_mappings: list[dict],
use_vision_llm: bool = False,
):
"""Celery task to index files uploaded from the desktop app."""
loop = asyncio.new_event_loop()
@ -1441,6 +1446,7 @@ def index_uploaded_folder_files_task(
root_folder_id=root_folder_id,
enable_summary=enable_summary,
file_mappings=file_mappings,
use_vision_llm=use_vision_llm,
)
)
finally:
@ -1454,6 +1460,7 @@ async def _index_uploaded_folder_files_async(
root_folder_id: int,
enable_summary: bool,
file_mappings: list[dict],
use_vision_llm: bool = False,
):
"""Run upload-based folder indexing with notification + heartbeat."""
file_count = len(file_mappings)
@ -1503,6 +1510,7 @@ async def _index_uploaded_folder_files_async(
enable_summary=enable_summary,
file_mappings=file_mappings,
on_heartbeat_callback=_heartbeat_progress,
use_vision_llm=use_vision_llm,
)
if notification:

View file

@ -164,6 +164,7 @@ async def _download_files_parallel(
enable_summary: bool,
max_concurrency: int = 3,
on_heartbeat: HeartbeatCallbackType | None = None,
vision_llm=None,
) -> tuple[list[ConnectorDocument], int]:
"""Download and ETL files in parallel. Returns (docs, failed_count)."""
results: list[ConnectorDocument] = []
@ -176,7 +177,7 @@ async def _download_files_parallel(
nonlocal last_heartbeat, completed_count
async with sem:
markdown, db_metadata, error = await download_and_extract_content(
dropbox_client, file
dropbox_client, file, vision_llm=vision_llm
)
if error or not markdown:
file_name = file.get("name", "Unknown")
@ -224,6 +225,7 @@ async def _download_and_index(
user_id: str,
enable_summary: bool,
on_heartbeat: HeartbeatCallbackType | None = None,
vision_llm=None,
) -> tuple[int, int]:
"""Parallel download then parallel indexing. Returns (batch_indexed, total_failed)."""
connector_docs, download_failed = await _download_files_parallel(
@ -234,6 +236,7 @@ async def _download_and_index(
user_id=user_id,
enable_summary=enable_summary,
on_heartbeat=on_heartbeat,
vision_llm=vision_llm,
)
batch_indexed = 0
@ -287,6 +290,7 @@ async def _index_with_delta_sync(
max_files: int,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
enable_summary: bool = True,
vision_llm=None,
) -> tuple[int, int, int, str]:
"""Delta sync using Dropbox cursor-based change tracking.
@ -359,6 +363,7 @@ async def _index_with_delta_sync(
user_id=user_id,
enable_summary=enable_summary,
on_heartbeat=on_heartbeat_callback,
vision_llm=vision_llm,
)
indexed = renamed_count + batch_indexed
@ -384,6 +389,7 @@ async def _index_full_scan(
incremental_sync: bool = True,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
enable_summary: bool = True,
vision_llm=None,
) -> tuple[int, int, int]:
"""Full scan indexing of a folder.
@ -469,6 +475,7 @@ async def _index_full_scan(
user_id=user_id,
enable_summary=enable_summary,
on_heartbeat=on_heartbeat_callback,
vision_llm=vision_llm,
)
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
@ -498,6 +505,7 @@ async def _index_selected_files(
enable_summary: bool,
incremental_sync: bool = True,
on_heartbeat: HeartbeatCallbackType | None = None,
vision_llm=None,
) -> tuple[int, int, int, list[str]]:
"""Index user-selected files using the parallel pipeline."""
page_limit_service = PageLimitService(session)
@ -557,6 +565,7 @@ async def _index_selected_files(
user_id=user_id,
enable_summary=enable_summary,
on_heartbeat=on_heartbeat,
vision_llm=vision_llm,
)
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
@ -621,6 +630,13 @@ async def index_dropbox_files(
return 0, 0, error_msg, 0
connector_enable_summary = getattr(connector, "enable_summary", True)
connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False)
vision_llm = None
if connector_enable_vision_llm:
from app.services.llm_service import get_vision_llm
vision_llm = await get_vision_llm(session, search_space_id)
dropbox_client = DropboxClient(session, connector_id)
indexing_options = items_dict.get("indexing_options", {})
@ -650,6 +666,7 @@ async def index_dropbox_files(
user_id=user_id,
enable_summary=connector_enable_summary,
incremental_sync=incremental_sync,
vision_llm=vision_llm,
)
total_indexed += indexed
total_skipped += skipped
@ -684,6 +701,7 @@ async def index_dropbox_files(
log_entry,
max_files,
enable_summary=connector_enable_summary,
vision_llm=vision_llm,
)
folder_cursors[folder_path] = new_cursor
total_unsupported += unsup
@ -703,6 +721,7 @@ async def index_dropbox_files(
include_subfolders,
incremental_sync=incremental_sync,
enable_summary=connector_enable_summary,
vision_llm=vision_llm,
)
total_unsupported += unsup

View file

@ -261,6 +261,7 @@ async def _download_files_parallel(
enable_summary: bool,
max_concurrency: int = 3,
on_heartbeat: HeartbeatCallbackType | None = None,
vision_llm=None,
) -> tuple[list[ConnectorDocument], int]:
"""Download and ETL files in parallel, returning ConnectorDocuments.
@ -276,7 +277,7 @@ async def _download_files_parallel(
nonlocal last_heartbeat, completed_count
async with sem:
markdown, drive_metadata, error = await download_and_extract_content(
drive_client, file
drive_client, file, vision_llm=vision_llm
)
if error or not markdown:
file_name = file.get("name", "Unknown")
@ -322,6 +323,7 @@ async def _process_single_file(
search_space_id: int,
user_id: str,
enable_summary: bool = True,
vision_llm=None,
) -> tuple[int, int, int]:
"""Download, extract, and index a single Drive file via the pipeline.
@ -343,7 +345,7 @@ async def _process_single_file(
await page_limit_service.check_page_limit(user_id, estimated_pages)
markdown, drive_metadata, error = await download_and_extract_content(
drive_client, file
drive_client, file, vision_llm=vision_llm
)
if error or not markdown:
logger.warning(f"ETL failed for {file_name}: {error}")
@ -433,6 +435,7 @@ async def _download_and_index(
user_id: str,
enable_summary: bool,
on_heartbeat: HeartbeatCallbackType | None = None,
vision_llm=None,
) -> tuple[int, int]:
"""Phase 2+3: parallel download then parallel indexing.
@ -446,6 +449,7 @@ async def _download_and_index(
user_id=user_id,
enable_summary=enable_summary,
on_heartbeat=on_heartbeat,
vision_llm=vision_llm,
)
batch_indexed = 0
@ -476,6 +480,7 @@ async def _index_selected_files(
user_id: str,
enable_summary: bool,
on_heartbeat: HeartbeatCallbackType | None = None,
vision_llm=None,
) -> tuple[int, int, int, list[str]]:
"""Index user-selected files using the parallel pipeline.
@ -540,6 +545,7 @@ async def _index_selected_files(
user_id=user_id,
enable_summary=enable_summary,
on_heartbeat=on_heartbeat,
vision_llm=vision_llm,
)
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
@ -573,6 +579,7 @@ async def _index_full_scan(
include_subfolders: bool = False,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
enable_summary: bool = True,
vision_llm=None,
) -> tuple[int, int, int]:
"""Full scan indexing of a folder.
@ -703,6 +710,7 @@ async def _index_full_scan(
user_id=user_id,
enable_summary=enable_summary,
on_heartbeat=on_heartbeat_callback,
vision_llm=vision_llm,
)
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
@ -736,6 +744,7 @@ async def _index_with_delta_sync(
include_subfolders: bool = False,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
enable_summary: bool = True,
vision_llm=None,
) -> tuple[int, int, int]:
"""Delta sync using change tracking.
@ -844,6 +853,7 @@ async def _index_with_delta_sync(
user_id=user_id,
enable_summary=enable_summary,
on_heartbeat=on_heartbeat_callback,
vision_llm=vision_llm,
)
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
@ -947,6 +957,11 @@ async def index_google_drive_files(
)
connector_enable_summary = getattr(connector, "enable_summary", True)
connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False)
vision_llm = None
if connector_enable_vision_llm:
from app.services.llm_service import get_vision_llm
vision_llm = await get_vision_llm(session, search_space_id)
drive_client = GoogleDriveClient(
session, connector_id, credentials=pre_built_credentials
)
@ -986,6 +1001,7 @@ async def index_google_drive_files(
include_subfolders,
on_heartbeat_callback,
connector_enable_summary,
vision_llm=vision_llm,
)
documents_unsupported += du
logger.info("Running reconciliation scan after delta sync")
@ -1004,6 +1020,7 @@ async def index_google_drive_files(
include_subfolders,
on_heartbeat_callback,
connector_enable_summary,
vision_llm=vision_llm,
)
documents_indexed += ri
documents_skipped += rs
@ -1029,6 +1046,7 @@ async def index_google_drive_files(
include_subfolders,
on_heartbeat_callback,
connector_enable_summary,
vision_llm=vision_llm,
)
if documents_indexed > 0 or can_use_delta:
@ -1146,6 +1164,11 @@ async def index_google_drive_single_file(
)
connector_enable_summary = getattr(connector, "enable_summary", True)
connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False)
vision_llm = None
if connector_enable_vision_llm:
from app.services.llm_service import get_vision_llm
vision_llm = await get_vision_llm(session, search_space_id)
drive_client = GoogleDriveClient(
session, connector_id, credentials=pre_built_credentials
)
@ -1168,6 +1191,7 @@ async def index_google_drive_single_file(
search_space_id,
user_id,
connector_enable_summary,
vision_llm=vision_llm,
)
await session.commit()
@ -1278,6 +1302,11 @@ async def index_google_drive_selected_files(
return 0, 0, [error_msg]
connector_enable_summary = getattr(connector, "enable_summary", True)
connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False)
vision_llm = None
if connector_enable_vision_llm:
from app.services.llm_service import get_vision_llm
vision_llm = await get_vision_llm(session, search_space_id)
drive_client = GoogleDriveClient(
session, connector_id, credentials=pre_built_credentials
)
@ -1291,6 +1320,7 @@ async def index_google_drive_selected_files(
user_id=user_id,
enable_summary=connector_enable_summary,
on_heartbeat=on_heartbeat_callback,
vision_llm=vision_llm,
)
if unsupported > 0:

View file

@ -153,16 +153,16 @@ def scan_folder(
return files
async def _read_file_content(file_path: str, filename: str) -> str:
async def _read_file_content(file_path: str, filename: str, *, vision_llm=None) -> str:
"""Read file content via the unified ETL pipeline.
All file types (plaintext, audio, direct-convert, document) are handled
by ``EtlPipelineService``.
All file types (plaintext, audio, direct-convert, document, image) are
handled by ``EtlPipelineService``.
"""
from app.etl_pipeline.etl_document import EtlRequest
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
result = await EtlPipelineService().extract(
result = await EtlPipelineService(vision_llm=vision_llm).extract(
EtlRequest(file_path=file_path, filename=filename)
)
return result.markdown_content
@ -199,12 +199,14 @@ async def _compute_file_content_hash(
file_path: str,
filename: str,
search_space_id: int,
*,
vision_llm=None,
) -> tuple[str, str]:
"""Read a file (via ETL if needed) and compute its content hash.
Returns (content_text, content_hash).
"""
content = await _read_file_content(file_path, filename)
content = await _read_file_content(file_path, filename, vision_llm=vision_llm)
return content, _content_hash(content, search_space_id)
@ -704,7 +706,9 @@ async def index_local_folder(
try:
content, content_hash = await _compute_file_content_hash(
file_path_abs, file_info["relative_path"], search_space_id
file_path_abs,
file_info["relative_path"],
search_space_id,
)
except Exception as read_err:
logger.warning(f"Could not read {file_path_abs}: {read_err}")
@ -738,7 +742,9 @@ async def index_local_folder(
try:
content, content_hash = await _compute_file_content_hash(
file_path_abs, file_info["relative_path"], search_space_id
file_path_abs,
file_info["relative_path"],
search_space_id,
)
except Exception as read_err:
logger.warning(f"Could not read {file_path_abs}: {read_err}")
@ -1264,6 +1270,7 @@ async def index_uploaded_files(
enable_summary: bool,
file_mappings: list[dict],
on_heartbeat_callback: HeartbeatCallbackType | None = None,
use_vision_llm: bool = False,
) -> tuple[int, int, str | None]:
"""Index files uploaded from the desktop app via temp paths.
@ -1300,6 +1307,12 @@ async def index_uploaded_files(
pipeline = IndexingPipelineService(session)
llm = await get_user_long_context_llm(session, user_id, search_space_id)
vision_llm_instance = None
if use_vision_llm:
from app.services.llm_service import get_vision_llm
vision_llm_instance = await get_vision_llm(session, search_space_id)
indexed_count = 0
failed_count = 0
errors: list[str] = []
@ -1347,7 +1360,8 @@ async def index_uploaded_files(
try:
content, content_hash = await _compute_file_content_hash(
temp_path, filename, search_space_id
temp_path, filename, search_space_id,
vision_llm=vision_llm_instance,
)
except Exception as e:
logger.warning(f"Could not read {relative_path}: {e}")

View file

@ -171,6 +171,7 @@ async def _download_files_parallel(
enable_summary: bool,
max_concurrency: int = 3,
on_heartbeat: HeartbeatCallbackType | None = None,
vision_llm=None,
) -> tuple[list[ConnectorDocument], int]:
"""Download and ETL files in parallel. Returns (docs, failed_count)."""
results: list[ConnectorDocument] = []
@ -183,7 +184,7 @@ async def _download_files_parallel(
nonlocal last_heartbeat, completed_count
async with sem:
markdown, od_metadata, error = await download_and_extract_content(
onedrive_client, file
onedrive_client, file, vision_llm=vision_llm
)
if error or not markdown:
file_name = file.get("name", "Unknown")
@ -231,6 +232,7 @@ async def _download_and_index(
user_id: str,
enable_summary: bool,
on_heartbeat: HeartbeatCallbackType | None = None,
vision_llm=None,
) -> tuple[int, int]:
"""Parallel download then parallel indexing. Returns (batch_indexed, total_failed)."""
connector_docs, download_failed = await _download_files_parallel(
@ -241,6 +243,7 @@ async def _download_and_index(
user_id=user_id,
enable_summary=enable_summary,
on_heartbeat=on_heartbeat,
vision_llm=vision_llm,
)
batch_indexed = 0
@ -293,6 +296,7 @@ async def _index_selected_files(
user_id: str,
enable_summary: bool,
on_heartbeat: HeartbeatCallbackType | None = None,
vision_llm=None,
) -> tuple[int, int, int, list[str]]:
"""Index user-selected files using the parallel pipeline."""
page_limit_service = PageLimitService(session)
@ -343,6 +347,7 @@ async def _index_selected_files(
user_id=user_id,
enable_summary=enable_summary,
on_heartbeat=on_heartbeat,
vision_llm=vision_llm,
)
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
@ -375,6 +380,7 @@ async def _index_full_scan(
include_subfolders: bool = True,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
enable_summary: bool = True,
vision_llm=None,
) -> tuple[int, int, int]:
"""Full scan indexing of a folder.
@ -450,6 +456,7 @@ async def _index_full_scan(
user_id=user_id,
enable_summary=enable_summary,
on_heartbeat=on_heartbeat_callback,
vision_llm=vision_llm,
)
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
@ -481,6 +488,7 @@ async def _index_with_delta_sync(
max_files: int,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
enable_summary: bool = True,
vision_llm=None,
) -> tuple[int, int, int, str | None]:
"""Delta sync using OneDrive change tracking.
@ -573,6 +581,7 @@ async def _index_with_delta_sync(
user_id=user_id,
enable_summary=enable_summary,
on_heartbeat=on_heartbeat_callback,
vision_llm=vision_llm,
)
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
@ -643,6 +652,12 @@ async def index_onedrive_files(
return 0, 0, error_msg, 0
connector_enable_summary = getattr(connector, "enable_summary", True)
connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False)
vision_llm = None
if connector_enable_vision_llm:
from app.services.llm_service import get_vision_llm
vision_llm = await get_vision_llm(session, search_space_id)
onedrive_client = OneDriveClient(session, connector_id)
indexing_options = items_dict.get("indexing_options", {})
@ -666,6 +681,7 @@ async def index_onedrive_files(
search_space_id=search_space_id,
user_id=user_id,
enable_summary=connector_enable_summary,
vision_llm=vision_llm,
)
total_indexed += indexed
total_skipped += skipped
@ -695,6 +711,7 @@ async def index_onedrive_files(
log_entry,
max_files,
enable_summary=connector_enable_summary,
vision_llm=vision_llm,
)
total_indexed += indexed
total_skipped += skipped
@ -721,6 +738,7 @@ async def index_onedrive_files(
max_files,
include_subfolders,
enable_summary=connector_enable_summary,
vision_llm=vision_llm,
)
total_indexed += ri
total_skipped += rs
@ -740,6 +758,7 @@ async def index_onedrive_files(
max_files,
include_subfolders,
enable_summary=connector_enable_summary,
vision_llm=vision_llm,
)
total_indexed += indexed
total_skipped += skipped

View file

@ -46,6 +46,7 @@ class _ProcessingContext:
log_entry: Log
connector: dict | None = None
notification: Notification | None = None
use_vision_llm: bool = False
enable_summary: bool = field(init=False)
def __post_init__(self) -> None:
@ -118,9 +119,13 @@ async def _log_page_divergence(
async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | None:
"""Extract content from a non-document file (plaintext/direct_convert/audio) via the unified ETL pipeline."""
"""Extract content from a non-document file (plaintext/direct_convert/audio/image) via the unified ETL pipeline."""
from app.etl_pipeline.etl_document import EtlRequest
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
from app.etl_pipeline.file_classifier import (
FileCategory,
classify_file as etl_classify,
)
await _notify(ctx, "parsing", "Processing file")
await ctx.task_logger.log_task_progress(
@ -129,7 +134,13 @@ async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | No
{"processing_stage": "extracting"},
)
etl_result = await EtlPipelineService().extract(
vision_llm = None
if ctx.use_vision_llm and etl_classify(ctx.filename) == FileCategory.IMAGE:
from app.services.llm_service import get_vision_llm
vision_llm = await get_vision_llm(ctx.session, ctx.search_space_id)
etl_result = await EtlPipelineService(vision_llm=vision_llm).extract(
EtlRequest(file_path=ctx.file_path, filename=ctx.filename)
)
@ -278,6 +289,7 @@ async def process_file_in_background(
log_entry: Log,
connector: dict | None = None,
notification: Notification | None = None,
use_vision_llm: bool = False,
) -> Document | None:
ctx = _ProcessingContext(
session=session,
@ -289,6 +301,7 @@ async def process_file_in_background(
log_entry=log_entry,
connector=connector,
notification=notification,
use_vision_llm=use_vision_llm,
)
try:
@ -333,11 +346,13 @@ async def process_file_in_background(
async def _extract_file_content(
file_path: str,
filename: str,
search_space_id: int,
session: AsyncSession,
user_id: str,
task_logger: TaskLoggingService,
log_entry: Log,
notification: Notification | None,
use_vision_llm: bool = False,
) -> tuple[str, str]:
"""
Extract markdown content from a file regardless of type.
@ -360,6 +375,7 @@ async def _extract_file_content(
FileCategory.PLAINTEXT: "Reading file",
FileCategory.DIRECT_CONVERT: "Converting file",
FileCategory.AUDIO: "Transcribing audio",
FileCategory.IMAGE: "Analyzing image",
FileCategory.UNSUPPORTED: "Unsupported file type",
FileCategory.DOCUMENT: "Extracting content",
}
@ -383,7 +399,13 @@ async def _extract_file_content(
estimated_pages = _estimate_pages_safe(page_limit_service, file_path)
await page_limit_service.check_page_limit(user_id, estimated_pages)
result = await EtlPipelineService().extract(
vision_llm = None
if use_vision_llm and category == FileCategory.IMAGE:
from app.services.llm_service import get_vision_llm
vision_llm = await get_vision_llm(session, search_space_id)
result = await EtlPipelineService(vision_llm=vision_llm).extract(
EtlRequest(
file_path=file_path,
filename=filename,
@ -417,6 +439,7 @@ async def process_file_in_background_with_document(
connector: dict | None = None,
notification: Notification | None = None,
should_summarize: bool = False,
use_vision_llm: bool = False,
) -> Document | None:
"""
Process file and update existing pending document (2-phase pattern).
@ -439,11 +462,13 @@ async def process_file_in_background_with_document(
markdown_content, etl_service = await _extract_file_content(
file_path,
filename,
search_space_id,
session,
user_id,
task_logger,
log_entry,
notification,
use_vision_llm=use_vision_llm,
)
if not markdown_content:

View file

@ -7,10 +7,33 @@ Extensions already covered by PLAINTEXT_EXTENSIONS, AUDIO_EXTENSIONS, or
DIRECT_CONVERT_EXTENSIONS in file_classifier are NOT repeated here -- these
sets are exclusively for the "document" ETL path (Docling / LlamaParse /
Unstructured).
Image extensions intentionally remain in the per-parser sets for fallback
compatibility. IMAGE_EXTENSIONS is used only for routing classification.
"""
from pathlib import PurePosixPath
# ---------------------------------------------------------------------------
# Image extensions (used by file_classifier for routing to vision LLM)
# ---------------------------------------------------------------------------
# Suffixes the ETL file classifier routes to the vision-LLM image path.
# Kept separate from the per-parser document sets so that, when no vision
# LLM is configured, these files can still fall back to a document parser.
IMAGE_EXTENSIONS: frozenset[str] = frozenset(
    (
        ".png", ".jpg", ".jpeg", ".gif",
        ".bmp", ".tiff", ".tif", ".webp",
        ".svg", ".heic", ".heif",
    )
)
# ---------------------------------------------------------------------------
# Per-parser document extension sets (from official documentation)
# ---------------------------------------------------------------------------

View file

@ -69,6 +69,7 @@ class InlineTaskDispatcher:
search_space_id: int,
user_id: str,
should_summarize: bool = False,
use_vision_llm: bool = False,
) -> None:
from app.tasks.celery_tasks.document_tasks import (
_process_file_with_document,
@ -82,6 +83,7 @@ class InlineTaskDispatcher:
search_space_id,
user_id,
should_summarize=should_summarize,
use_vision_llm=use_vision_llm,
)

View file

@ -168,7 +168,7 @@ async def test_concurrency_bounded_by_semaphore(
active = 0
peak = 0
async def _slow_extract(client, file):
async def _slow_extract(client, file, **kwargs):
nonlocal active, peak
async with lock:
active += 1
@ -209,7 +209,7 @@ async def test_heartbeat_fires_during_parallel_downloads(
monkeypatch.setattr(_mod, "HEARTBEAT_INTERVAL_SECONDS", 0)
async def _slow_extract(client, file):
async def _slow_extract(client, file, **kwargs):
await asyncio.sleep(0.05)
return _mock_extract_ok(file["id"], file["name"])

View file

@ -162,7 +162,7 @@ async def test_concurrency_bounded_by_semaphore(
active = 0
peak = 0
async def _slow_extract(client, file):
async def _slow_extract(client, file, **kwargs):
nonlocal active, peak
async with lock:
active += 1
@ -204,7 +204,7 @@ async def test_heartbeat_fires_during_parallel_downloads(
monkeypatch.setattr(_mod, "HEARTBEAT_INTERVAL_SECONDS", 0)
async def _slow_extract(client, file):
async def _slow_extract(client, file, **kwargs):
await asyncio.sleep(0.05)
return _mock_extract_ok(file["id"], file["name"])

View file

@ -162,7 +162,7 @@ async def test_concurrency_bounded_by_semaphore(
active = 0
peak = 0
async def _slow_extract(client, file):
async def _slow_extract(client, file, **kwargs):
nonlocal active, peak
async with lock:
active += 1
@ -203,7 +203,7 @@ async def test_heartbeat_fires_during_parallel_downloads(
monkeypatch.setattr(_mod, "HEARTBEAT_INTERVAL_SECONDS", 0)
async def _slow_extract(client, file):
async def _slow_extract(client, file, **kwargs):
await asyncio.sleep(0.05)
return _mock_extract_ok(file["id"], file["name"])

View file

@ -431,7 +431,7 @@ async def test_llamacloud_heif_accepted_only_with_azure_di(tmp_path, mocker):
mocker.patch("app.config.config.AZURE_DI_ENDPOINT", None, create=True)
mocker.patch("app.config.config.AZURE_DI_KEY", None, create=True)
with pytest.raises(EtlUnsupportedFileError, match="not supported by LLAMACLOUD"):
with pytest.raises(EtlUnsupportedFileError, match="document parser does not support this format"):
await EtlPipelineService().extract(
EtlRequest(file_path=str(heif_file), filename="photo.heif")
)
@ -549,8 +549,11 @@ def test_unsupported_extensions_classified_correctly(filename):
("doc.docx", "document"),
("slides.pptx", "document"),
("sheet.xlsx", "document"),
("photo.png", "document"),
("photo.jpg", "document"),
("photo.png", "image"),
("photo.jpg", "image"),
("photo.webp", "image"),
("photo.gif", "image"),
("photo.heic", "image"),
("book.epub", "document"),
("letter.odt", "document"),
("readme.md", "plaintext"),
@ -680,3 +683,57 @@ async def test_extract_eml_with_docling_raises_unsupported(tmp_path, mocker):
await EtlPipelineService().extract(
EtlRequest(file_path=str(eml_file), filename="mail.eml")
)
# ---------------------------------------------------------------------------
# Image extraction via vision LLM
# ---------------------------------------------------------------------------
async def test_extract_image_with_vision_llm(tmp_path):
    """An image file is analyzed by the vision LLM when provided."""
    from unittest.mock import AsyncMock, MagicMock

    # Minimal PNG-like payload: magic header followed by padding bytes.
    image_path = tmp_path / "photo.png"
    image_path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 50)

    # Stub vision LLM whose ainvoke returns a fixed markdown description.
    llm_reply = MagicMock()
    llm_reply.content = "# A photo of a sunset over the ocean"
    stub_llm = AsyncMock()
    stub_llm.ainvoke.return_value = llm_reply

    outcome = await EtlPipelineService(vision_llm=stub_llm).extract(
        EtlRequest(file_path=str(image_path), filename="photo.png")
    )

    # The pipeline should surface the LLM's description verbatim and tag
    # the result as coming from the vision-LLM image path.
    assert outcome.markdown_content == "# A photo of a sunset over the ocean"
    assert outcome.etl_service == "VISION_LLM"
    assert outcome.content_type == "image"
    stub_llm.ainvoke.assert_called_once()
async def test_extract_image_falls_back_to_document_without_vision_llm(
    tmp_path, mocker
):
    """Without a vision LLM, image files fall back to the document parser."""
    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")

    # Stub the Docling service so no real document parsing happens.
    docling_stub = mocker.AsyncMock()
    docling_stub.process_document.return_value = {"content": "# OCR text from image"}
    mocker.patch(
        "app.services.docling_service.create_docling_service",
        return_value=docling_stub,
    )

    image_path = tmp_path / "scan.png"
    image_path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 50)

    outcome = await EtlPipelineService().extract(
        EtlRequest(file_path=str(image_path), filename="scan.png")
    )

    # With no vision LLM configured the image is treated as a document.
    assert outcome.markdown_content == "# OCR text from image"
    assert outcome.etl_service == "DOCLING"
    assert outcome.content_type == "document"

View file

@ -154,3 +154,40 @@ def test_get_extensions_for_none_returns_union():
)
assert get_document_extensions_for_service(None) == DOCUMENT_EXTENSIONS
# ---------------------------------------------------------------------------
# IMAGE_EXTENSIONS
# ---------------------------------------------------------------------------
@pytest.mark.parametrize(
    "ext",
    [
        ".png",
        ".jpg",
        ".jpeg",
        ".gif",
        ".bmp",
        ".tiff",
        ".tif",
        ".webp",
        ".svg",
        ".heic",
        ".heif",
    ],
)
def test_image_extensions_contains_expected(ext):
    """Every known image suffix must be present in the routing set."""
    from app.utils.file_extensions import IMAGE_EXTENSIONS

    assert ext in IMAGE_EXTENSIONS
def test_image_extensions_are_subset_of_document_extensions():
    """Image extensions used for routing should also be in DOCUMENT_EXTENSIONS for fallback."""
    from app.utils.file_extensions import DOCUMENT_EXTENSIONS, IMAGE_EXTENSIONS

    # Any image suffix absent from the document sets would have no parser
    # to fall back to when a vision LLM is not configured.
    missing = IMAGE_EXTENSIONS.difference(DOCUMENT_EXTENSIONS)
    assert not missing, (
        f"Image extensions missing from document sets (breaks fallback): {missing}"
    )

View file

@ -0,0 +1,10 @@
# Electron-specific build-time configuration.
# Set before running pnpm dist:mac / dist:win / dist:linux.
# The hosted web frontend URL. Used to intercept OAuth redirects and keep them
# inside the desktop app. Set to your production frontend domain.
HOSTED_FRONTEND_URL=https://surfsense.net
# PostHog analytics (leave empty to disable)
POSTHOG_KEY=
POSTHOG_HOST=https://assets.surfsense.com

View file

@ -1,3 +1,4 @@
node_modules/
dist/
release/
.env

View file

@ -98,6 +98,7 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
periodicEnabled,
frequencyMinutes,
enableSummary,
enableVisionLlm,
allConnectors,
viewingAccountsType,
viewingMCPList,
@ -109,6 +110,7 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
setPeriodicEnabled,
setFrequencyMinutes,
setEnableSummary,
setEnableVisionLlm,
handleOpenChange,
handleTabChange,
handleScroll,
@ -279,6 +281,7 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
periodicEnabled={periodicEnabled}
frequencyMinutes={frequencyMinutes}
enableSummary={enableSummary}
enableVisionLlm={enableVisionLlm}
isSaving={isSaving}
isDisconnecting={isDisconnecting}
isIndexing={indexingConnectorIds.has(editingConnector.id)}
@ -288,6 +291,7 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
onPeriodicEnabledChange={setPeriodicEnabled}
onFrequencyChange={setFrequencyMinutes}
onEnableSummaryChange={setEnableSummary}
onEnableVisionLlmChange={setEnableVisionLlm}
onSave={() => {
startIndexing(editingConnector.id);
handleSaveConnector(() => refreshConnectors());
@ -336,6 +340,7 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
periodicEnabled={periodicEnabled}
frequencyMinutes={frequencyMinutes}
enableSummary={enableSummary}
enableVisionLlm={enableVisionLlm}
isStartingIndexing={isStartingIndexing}
isFromOAuth={isFromOAuth}
onStartDateChange={setStartDate}
@ -343,6 +348,7 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
onPeriodicEnabledChange={setPeriodicEnabled}
onFrequencyChange={setFrequencyMinutes}
onEnableSummaryChange={setEnableSummary}
onEnableVisionLlmChange={setEnableVisionLlm}
onConfigChange={setIndexingConnectorConfig}
onStartIndexing={() => {
if (indexingConfig.connectorId) {

View file

@ -0,0 +1,25 @@
"use client";
import type { FC } from "react";
import { Switch } from "@/components/ui/switch";
interface VisionLLMConfigProps {
enabled: boolean;
onEnabledChange: (enabled: boolean) => void;
}
export const VisionLLMConfig: FC<VisionLLMConfigProps> = ({ enabled, onEnabledChange }) => {
return (
<div className="rounded-xl bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6">
<div className="flex items-center justify-between">
<div className="space-y-1">
<h3 className="font-medium text-sm sm:text-base">Enable Vision LLM</h3>
<p className="text-xs sm:text-sm text-muted-foreground">
Describes images using AI vision (costly, slower)
</p>
</div>
<Switch checked={enabled} onCheckedChange={onEnabledChange} />
</div>
</div>
);
};

View file

@ -15,6 +15,7 @@ import { cn } from "@/lib/utils";
import { DateRangeSelector } from "../../components/date-range-selector";
import { PeriodicSyncConfig } from "../../components/periodic-sync-config";
import { SummaryConfig } from "../../components/summary-config";
import { VisionLLMConfig } from "../../components/vision-llm-config";
import { getConnectorDisplayName } from "../../tabs/all-connectors-tab";
import { getConnectorConfigComponent } from "../index";
@ -38,6 +39,7 @@ interface ConnectorEditViewProps {
periodicEnabled: boolean;
frequencyMinutes: string;
enableSummary: boolean;
enableVisionLlm: boolean;
isSaving: boolean;
isDisconnecting: boolean;
isIndexing?: boolean;
@ -47,6 +49,7 @@ interface ConnectorEditViewProps {
onPeriodicEnabledChange: (enabled: boolean) => void;
onFrequencyChange: (frequency: string) => void;
onEnableSummaryChange: (enabled: boolean) => void;
onEnableVisionLlmChange: (enabled: boolean) => void;
onSave: () => void;
onDisconnect: () => void;
onBack: () => void;
@ -62,6 +65,7 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
periodicEnabled,
frequencyMinutes,
enableSummary,
enableVisionLlm,
isSaving,
isDisconnecting,
isIndexing = false,
@ -71,6 +75,7 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
onPeriodicEnabledChange,
onFrequencyChange,
onEnableSummaryChange,
onEnableVisionLlmChange,
onSave,
onDisconnect,
onBack,
@ -272,6 +277,14 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
{/* AI Summary toggle */}
<SummaryConfig enabled={enableSummary} onEnabledChange={onEnableSummaryChange} />
{/* Vision LLM toggle - only for file-based connectors */}
{(connector.connector_type === "GOOGLE_DRIVE_CONNECTOR" ||
connector.connector_type === "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" ||
connector.connector_type === "DROPBOX_CONNECTOR" ||
connector.connector_type === "ONEDRIVE_CONNECTOR") && (
<VisionLLMConfig enabled={enableVisionLlm} onEnabledChange={onEnableVisionLlmChange} />
)}
{/* Date range selector - not shown for file-based connectors (Drive, Dropbox, OneDrive), Webcrawler, GitHub, or Local Folder */}
{connector.connector_type !== "GOOGLE_DRIVE_CONNECTOR" &&
connector.connector_type !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" &&

View file

@ -10,6 +10,7 @@ import { cn } from "@/lib/utils";
import { DateRangeSelector } from "../../components/date-range-selector";
import { PeriodicSyncConfig } from "../../components/periodic-sync-config";
import { SummaryConfig } from "../../components/summary-config";
import { VisionLLMConfig } from "../../components/vision-llm-config";
import type { IndexingConfigState } from "../../constants/connector-constants";
import { getConnectorDisplayName } from "../../tabs/all-connectors-tab";
import { getConnectorConfigComponent } from "../index";
@ -22,6 +23,7 @@ interface IndexingConfigurationViewProps {
periodicEnabled: boolean;
frequencyMinutes: string;
enableSummary: boolean;
enableVisionLlm: boolean;
isStartingIndexing: boolean;
isFromOAuth?: boolean;
onStartDateChange: (date: Date | undefined) => void;
@ -29,6 +31,7 @@ interface IndexingConfigurationViewProps {
onPeriodicEnabledChange: (enabled: boolean) => void;
onFrequencyChange: (frequency: string) => void;
onEnableSummaryChange: (enabled: boolean) => void;
onEnableVisionLlmChange: (enabled: boolean) => void;
onConfigChange?: (config: Record<string, unknown>) => void;
onStartIndexing: () => void;
onSkip: () => void;
@ -42,6 +45,7 @@ export const IndexingConfigurationView: FC<IndexingConfigurationViewProps> = ({
periodicEnabled,
frequencyMinutes,
enableSummary,
enableVisionLlm,
isStartingIndexing,
isFromOAuth = false,
onStartDateChange,
@ -49,6 +53,7 @@ export const IndexingConfigurationView: FC<IndexingConfigurationViewProps> = ({
onPeriodicEnabledChange,
onFrequencyChange,
onEnableSummaryChange,
onEnableVisionLlmChange,
onConfigChange,
onStartIndexing,
onSkip,
@ -158,6 +163,14 @@ export const IndexingConfigurationView: FC<IndexingConfigurationViewProps> = ({
{/* AI Summary toggle */}
<SummaryConfig enabled={enableSummary} onEnabledChange={onEnableSummaryChange} />
{/* Vision LLM toggle - only for file-based connectors */}
{(config.connectorType === "GOOGLE_DRIVE_CONNECTOR" ||
config.connectorType === "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" ||
config.connectorType === "DROPBOX_CONNECTOR" ||
config.connectorType === "ONEDRIVE_CONNECTOR") && (
<VisionLLMConfig enabled={enableVisionLlm} onEnabledChange={onEnableVisionLlmChange} />
)}
{/* Date range selector - not shown for file-based connectors (Drive, Dropbox, OneDrive), Webcrawler, GitHub, or Local Folder */}
{config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" &&
config.connectorType !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" &&

View file

@ -80,6 +80,7 @@ export const useConnectorDialog = () => {
const [periodicEnabled, setPeriodicEnabled] = useState(false);
const [frequencyMinutes, setFrequencyMinutes] = useState("1440");
const [enableSummary, setEnableSummary] = useState(false);
const [enableVisionLlm, setEnableVisionLlm] = useState(false);
// Edit mode state
const [editingConnector, setEditingConnector] = useState<SearchSourceConnector | null>(null);
@ -621,6 +622,7 @@ export const useConnectorDialog = () => {
setPeriodicEnabled(false);
setFrequencyMinutes("1440");
setEnableSummary(connector.enable_summary ?? false);
setEnableVisionLlm(connector.enable_vision_llm ?? false);
setStartDate(undefined);
setEndDate(undefined);
@ -763,12 +765,13 @@ export const useConnectorDialog = () => {
const endDateStr = endDate ? format(endDate, "yyyy-MM-dd") : undefined;
// Update connector with summary, periodic sync settings, and config changes
if (enableSummary || periodicEnabled || indexingConnectorConfig) {
if (enableSummary || enableVisionLlm || periodicEnabled || indexingConnectorConfig) {
const frequency = periodicEnabled ? parseInt(frequencyMinutes, 10) : undefined;
await updateConnector({
id: indexingConfig.connectorId,
data: {
enable_summary: enableSummary,
enable_vision_llm: enableVisionLlm,
...(periodicEnabled && {
periodic_indexing_enabled: true,
indexing_frequency_minutes: frequency,
@ -896,6 +899,7 @@ export const useConnectorDialog = () => {
periodicEnabled,
frequencyMinutes,
enableSummary,
enableVisionLlm,
indexingConnectorConfig,
setIsOpen,
]
@ -960,6 +964,7 @@ export const useConnectorDialog = () => {
setPeriodicEnabled(!connector.is_indexable ? false : connector.periodic_indexing_enabled);
setFrequencyMinutes(connector.indexing_frequency_minutes?.toString() || "1440");
setEnableSummary(connector.enable_summary ?? false);
setEnableVisionLlm(connector.enable_vision_llm ?? false);
setStartDate(undefined);
setEndDate(undefined);
},
@ -1038,6 +1043,7 @@ export const useConnectorDialog = () => {
data: {
name: connectorName || editingConnector.name,
enable_summary: enableSummary,
enable_vision_llm: enableVisionLlm,
periodic_indexing_enabled: !editingConnector.is_indexable ? false : periodicEnabled,
indexing_frequency_minutes: !editingConnector.is_indexable ? null : frequency,
config: connectorConfig || editingConnector.config,
@ -1172,6 +1178,7 @@ export const useConnectorDialog = () => {
periodicEnabled,
frequencyMinutes,
enableSummary,
enableVisionLlm,
getFrequencyLabel,
connectorConfig,
connectorName,
@ -1332,6 +1339,7 @@ export const useConnectorDialog = () => {
setPeriodicEnabled(false);
setFrequencyMinutes("1440");
setEnableSummary(false);
setEnableVisionLlm(false);
}
}
},
@ -1368,6 +1376,7 @@ export const useConnectorDialog = () => {
periodicEnabled,
frequencyMinutes,
enableSummary,
enableVisionLlm,
searchSpaceId,
allConnectors,
viewingAccountsType,
@ -1382,6 +1391,7 @@ export const useConnectorDialog = () => {
setPeriodicEnabled,
setFrequencyMinutes,
setEnableSummary,
setEnableVisionLlm,
setConnectorName,
// Handlers

View file

@ -1,6 +1,6 @@
"use client";
import { FolderPlus, ListFilter, Search, Upload, X } from "lucide-react";
import { Download, FolderPlus, ListFilter, Loader2, Search, Upload, X } from "lucide-react";
import { useTranslations } from "next-intl";
import React, { useCallback, useMemo, useRef, useState } from "react";
import { useDocumentUploadDialog } from "@/components/assistant-ui/document-upload-popup";
@ -20,6 +20,8 @@ export function DocumentsFilters({
onToggleType,
activeTypes,
onCreateFolder,
onExportKB,
isExporting,
}: {
typeCounts: Partial<Record<DocumentTypeEnum, number>>;
onSearch: (v: string) => void;
@ -27,6 +29,8 @@ export function DocumentsFilters({
onToggleType: (type: DocumentTypeEnum, checked: boolean) => void;
activeTypes: DocumentTypeEnum[];
onCreateFolder?: () => void;
onExportKB?: () => void;
isExporting?: boolean;
}) {
const t = useTranslations("documents");
const id = React.useId();
@ -84,6 +88,31 @@ export function DocumentsFilters({
</Tooltip>
)}
{onExportKB && (
<Tooltip>
<TooltipTrigger asChild>
<ToggleGroupItem
value="export"
disabled={isExporting}
className="h-9 w-9 shrink-0 border-sidebar-border text-sidebar-foreground/60 hover:text-sidebar-foreground hover:border-sidebar-border bg-sidebar"
onClick={(e) => {
e.preventDefault();
onExportKB();
}}
>
{isExporting ? (
<Loader2 size={14} className="animate-spin" />
) : (
<Download size={14} />
)}
</ToggleGroupItem>
</TooltipTrigger>
<TooltipContent>
{isExporting ? "Exporting…" : "Export knowledge base"}
</TooltipContent>
</Tooltip>
)}
<Popover>
<Tooltip>
<TooltipTrigger asChild>

View file

@ -4,6 +4,7 @@ import {
AlertCircle,
ChevronDown,
ChevronRight,
Download,
Eye,
EyeOff,
Folder,
@ -80,6 +81,7 @@ interface FolderNodeProps {
isWatched?: boolean;
onRescan?: (folder: FolderDisplay) => void | Promise<void>;
onStopWatching?: (folder: FolderDisplay) => void;
onExportFolder?: (folder: FolderDisplay) => void;
}
function getDropZone(
@ -120,6 +122,7 @@ export const FolderNode = React.memo(function FolderNode({
isWatched,
onRescan,
onStopWatching,
onExportFolder,
}: FolderNodeProps) {
const [renameValue, setRenameValue] = useState(folder.name);
const inputRef = useRef<HTMLInputElement>(null);
@ -408,6 +411,17 @@ export const FolderNode = React.memo(function FolderNode({
<Move className="mr-2 h-4 w-4" />
Move to...
</DropdownMenuItem>
{onExportFolder && (
<DropdownMenuItem
onClick={(e) => {
e.stopPropagation();
onExportFolder(folder);
}}
>
<Download className="mr-2 h-4 w-4" />
Export folder
</DropdownMenuItem>
)}
<DropdownMenuItem
onClick={(e) => {
e.stopPropagation();
@ -449,6 +463,12 @@ export const FolderNode = React.memo(function FolderNode({
<Move className="mr-2 h-4 w-4" />
Move to...
</ContextMenuItem>
{onExportFolder && (
<ContextMenuItem onClick={() => onExportFolder(folder)}>
<Download className="mr-2 h-4 w-4" />
Export folder
</ContextMenuItem>
)}
<ContextMenuItem onClick={() => onDelete(folder)}>
<Trash2 className="mr-2 h-4 w-4" />
Delete

View file

@ -44,6 +44,7 @@ interface FolderTreeViewProps {
watchedFolderIds?: Set<number>;
onRescanFolder?: (folder: FolderDisplay) => void;
onStopWatchingFolder?: (folder: FolderDisplay) => void;
onExportFolder?: (folder: FolderDisplay) => void;
}
function groupBy<T>(items: T[], keyFn: (item: T) => string | number): Record<string | number, T[]> {
@ -81,6 +82,7 @@ export function FolderTreeView({
watchedFolderIds,
onRescanFolder,
onStopWatchingFolder,
onExportFolder,
}: FolderTreeViewProps) {
const foldersByParent = useMemo(() => groupBy(folders, (f) => f.parentId ?? "root"), [folders]);
@ -259,6 +261,7 @@ export function FolderTreeView({
isWatched={watchedFolderIds?.has(f.id)}
onRescan={onRescanFolder}
onStopWatching={onStopWatchingFolder}
onExportFolder={onExportFolder}
/>
);

View file

@ -406,6 +406,160 @@ export function DocumentsSidebar({
setFolderPickerOpen(true);
}, []);
// True while any export (whole-KB or single-folder) is in flight; both flows share it.
const [isExportingKB, setIsExportingKB] = useState(false);
// Confirmation dialog shown when an export would exclude still-processing documents.
const [exportWarningOpen, setExportWarningOpen] = useState(false);
// What the user was trying to export when the warning appeared; null when no export is pending confirmation.
const [exportWarningContext, setExportWarningContext] = useState<{
	type: "kb" | "folder";
	folder?: FolderDisplay;
	pendingCount: number;
} | null>(null);

// Documents not yet fully ingested — these would be missing from an export archive.
const pendingDocuments = useMemo(
	() =>
		treeDocuments.filter(
			(d) => d.status?.state === "pending" || d.status?.state === "processing"
		),
	[treeDocuments]
);
// Fetch a server-generated archive and hand it to the browser as a download.
// Throws with the server-provided detail message when the request fails.
const doExport = useCallback(async (url: string, downloadName: string) => {
	const response = await authenticatedFetch(url, { method: "GET" });
	if (!response.ok) {
		const payload = await response.json().catch(() => ({ detail: "Export failed" }));
		throw new Error(payload.detail || "Export failed");
	}
	// Materialize the body as an object URL and click a throwaway anchor to save it.
	const archive = await response.blob();
	const objectUrl = URL.createObjectURL(archive);
	const anchor = document.createElement("a");
	anchor.href = objectUrl;
	anchor.download = downloadName;
	document.body.appendChild(anchor);
	anchor.click();
	anchor.remove();
	URL.revokeObjectURL(objectUrl);
}, []);
// Export the whole knowledge base. If any documents are still being processed,
// route through the confirmation dialog instead of exporting immediately.
const handleExportKB = useCallback(async () => {
	if (isExportingKB) return;
	if (pendingDocuments.length > 0) {
		setExportWarningContext({ type: "kb", pendingCount: pendingDocuments.length });
		setExportWarningOpen(true);
		return;
	}
	setIsExportingKB(true);
	try {
		const exportUrl = `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/export`;
		await doExport(exportUrl, "knowledge-base.zip");
		toast.success("Knowledge base exported");
	} catch (err) {
		console.error("KB export failed:", err);
		toast.error(err instanceof Error ? err.message : "Export failed");
	} finally {
		setIsExportingKB(false);
	}
}, [searchSpaceId, isExportingKB, pendingDocuments.length, doExport]);
// Runs after the user confirms exporting despite pending documents.
// The original duplicated the KB- and folder-export bodies (URL, filename,
// toasts, spinner try/finally) in both branches; here the branch only resolves
// url/name/messages and a single download path performs the export.
const handleExportWarningConfirm = useCallback(async () => {
	setExportWarningOpen(false);
	const ctx = exportWarningContext;
	if (!ctx) return;
	// A folder context without its folder reference is invalid — nothing to export.
	if (ctx.type === "folder" && !ctx.folder) {
		setExportWarningContext(null);
		return;
	}
	const baseUrl = `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/export`;
	let url = baseUrl;
	let downloadName = "knowledge-base.zip";
	let successMessage = "Knowledge base exported";
	let errorLabel = "KB export failed:";
	if (ctx.type === "folder" && ctx.folder) {
		// Make the folder name filesystem-safe (cap at 80 chars) for the archive name.
		const safeName =
			ctx.folder.name
				.replace(/[^a-zA-Z0-9 _-]/g, "_")
				.trim()
				.slice(0, 80) || "folder";
		url = `${baseUrl}?folder_id=${ctx.folder.id}`;
		downloadName = `${safeName}.zip`;
		successMessage = `Folder "${ctx.folder.name}" exported`;
		errorLabel = "Folder export failed:";
	}
	setIsExportingKB(true);
	try {
		await doExport(url, downloadName);
		toast.success(successMessage);
	} catch (err) {
		console.error(errorLabel, err);
		toast.error(err instanceof Error ? err.message : "Export failed");
	} finally {
		setIsExportingKB(false);
	}
	setExportWarningContext(null);
}, [exportWarningContext, searchSpaceId, doExport]);
// Count pending/processing documents inside a folder and all of its descendants.
const getPendingCountInSubtree = useCallback(
	(folderId: number): number => {
		// Collect the folder id plus every descendant id via an explicit stack
		// walk over the parent→children index.
		const subtreeIds = new Set<number>();
		const stack: number[] = [folderId];
		while (stack.length > 0) {
			const id = stack.pop() as number;
			if (subtreeIds.has(id)) continue;
			subtreeIds.add(id);
			for (const child of foldersByParent[String(id)] ?? []) {
				stack.push(child.id);
			}
		}
		// Documents without a folder (folderId null) can never match: -1 is not a folder id.
		let count = 0;
		for (const d of treeDocuments) {
			const inSubtree = subtreeIds.has(d.folderId ?? -1);
			const stillProcessing =
				d.status?.state === "pending" || d.status?.state === "processing";
			if (inSubtree && stillProcessing) count += 1;
		}
		return count;
	},
	[foldersByParent, treeDocuments]
);
// Export a single folder subtree as a zip; if the subtree contains documents
// still being processed, show the confirmation dialog instead.
const handleExportFolder = useCallback(
	async (folder: FolderDisplay) => {
		const pending = getPendingCountInSubtree(folder.id);
		if (pending > 0) {
			setExportWarningContext({ type: "folder", folder, pendingCount: pending });
			setExportWarningOpen(true);
			return;
		}
		setIsExportingKB(true);
		try {
			// Make the folder name filesystem-safe (cap at 80 chars) for the archive name.
			const safeName =
				folder.name
					.replace(/[^a-zA-Z0-9 _-]/g, "_")
					.trim()
					.slice(0, 80) || "folder";
			const exportUrl = `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/export?folder_id=${folder.id}`;
			await doExport(exportUrl, `${safeName}.zip`);
			toast.success(`Folder "${folder.name}" exported`);
		} catch (err) {
			console.error("Folder export failed:", err);
			toast.error(err instanceof Error ? err.message : "Export failed");
		} finally {
			setIsExportingKB(false);
		}
	},
	[searchSpaceId, getPendingCountInSubtree, doExport]
);
const handleExportDocument = useCallback(
async (doc: DocumentNodeDoc, format: string) => {
const safeTitle =
@ -800,6 +954,8 @@ export function DocumentsSidebar({
onToggleType={onToggleType}
activeTypes={activeTypes}
onCreateFolder={() => handleCreateFolder(null)}
onExportKB={handleExportKB}
isExporting={isExportingKB}
/>
</div>
@ -855,6 +1011,7 @@ export function DocumentsSidebar({
watchedFolderIds={watchedFolderIds}
onRescanFolder={handleRescanFolder}
onStopWatchingFolder={handleStopWatching}
onExportFolder={handleExportFolder}
/>
</div>
</div>
@ -933,6 +1090,33 @@ export function DocumentsSidebar({
</AlertDialogFooter>
</AlertDialogContent>
</AlertDialog>
<AlertDialog
open={exportWarningOpen}
onOpenChange={(open) => {
if (!open) {
setExportWarningOpen(false);
setExportWarningContext(null);
}
}}
>
<AlertDialogContent>
<AlertDialogHeader>
<AlertDialogTitle>Some documents are still processing</AlertDialogTitle>
<AlertDialogDescription>
{exportWarningContext?.pendingCount} document
{exportWarningContext?.pendingCount !== 1 ? "s are" : " is"} currently being processed
and will be excluded from the export. Do you want to continue?
</AlertDialogDescription>
</AlertDialogHeader>
<AlertDialogFooter>
<AlertDialogCancel>Cancel</AlertDialogCancel>
<AlertDialogAction onClick={handleExportWarningConfirm}>
Export anyway
</AlertDialogAction>
</AlertDialogFooter>
</AlertDialogContent>
</AlertDialog>
</>
);

View file

@ -26,6 +26,7 @@ import { Progress } from "@/components/ui/progress";
import { Spinner } from "@/components/ui/spinner";
import { Switch } from "@/components/ui/switch";
import { useElectronAPI } from "@/hooks/use-platform";
import { documentsApiService } from "@/lib/apis/documents-api.service";
import {
trackDocumentUploadFailure,
trackDocumentUploadStarted,
@ -48,6 +49,77 @@ interface FileWithId {
file: File;
}
// One file selected through folder upload: a client-side key for React lists,
// the File itself, and its path relative to the chosen folder's root.
interface FolderEntry {
	id: string;
	file: File;
	relativePath: string;
}

// A pending folder upload: the top-level folder name plus every supported file in it.
interface FolderUploadData {
	folderName: string;
	entries: FolderEntry[];
}

// Node of the pre-upload preview tree; `size` is only populated for file nodes.
interface FolderTreeNode {
	name: string;
	isFolder: boolean;
	size?: number;
	children: FolderTreeNode[];
}
/**
 * Build a nested preview tree from flat folder-upload entries.
 * At every level, directories sort before files and siblings sort by
 * locale-aware name comparison.
 */
function buildFolderTree(entries: FolderEntry[]): FolderTreeNode[] {
	const root: FolderTreeNode = { name: "", isFolder: true, children: [] };

	// Insert each entry, creating intermediate directory nodes on demand.
	for (const entry of entries) {
		const segments = entry.relativePath.split("/");
		const fileName = segments[segments.length - 1];
		let cursor = root;
		for (const segment of segments.slice(0, -1)) {
			let dir = cursor.children.find((c) => c.isFolder && c.name === segment);
			if (!dir) {
				dir = { name: segment, isFolder: true, children: [] };
				cursor.children.push(dir);
			}
			cursor = dir;
		}
		cursor.children.push({
			name: fileName,
			isFolder: false,
			size: entry.file.size,
			children: [],
		});
	}

	// Iterative depth-first sort of every directory's children.
	const stack: FolderTreeNode[] = [root];
	while (stack.length > 0) {
		const node = stack.pop() as FolderTreeNode;
		node.children.sort((a, b) =>
			a.isFolder !== b.isFolder ? (a.isFolder ? -1 : 1) : a.name.localeCompare(b.name)
		);
		stack.push(...node.children.filter((c) => c.isFolder));
	}
	return root.children;
}
/**
 * Flatten the preview tree into a pre-order render list, annotating each row
 * with its depth so the UI can indent it.
 */
function flattenTree(
	nodes: FolderTreeNode[],
	depth = 0
): { name: string; isFolder: boolean; depth: number; size?: number }[] {
	return nodes.flatMap((node) => {
		const row = { name: node.name, isFolder: node.isFolder, depth, size: node.size };
		const descendants =
			node.isFolder && node.children.length > 0
				? flattenTree(node.children, depth + 1)
				: [];
		return [row, ...descendants];
	});
}
// Folder uploads are split into batches capped by total payload size and file count
// so one request never grows unbounded.
const FOLDER_BATCH_SIZE_BYTES = 20 * 1024 * 1024; // 20 MB per upload batch
const FOLDER_BATCH_MAX_FILES = 10; // at most 10 files per upload batch
// Individual files above this size are rejected client-side before upload.
const MAX_FILE_SIZE_MB = 500;
const MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024;
@ -64,11 +136,14 @@ export function DocumentUploadTab({
const [uploadProgress, setUploadProgress] = useState(0);
const [accordionValue, setAccordionValue] = useState<string>("");
const [shouldSummarize, setShouldSummarize] = useState(false);
const [useVisionLlm, setUseVisionLlm] = useState(false);
const [uploadDocumentMutation] = useAtom(uploadDocumentMutationAtom);
const { mutate: uploadDocuments, isPending: isUploading } = uploadDocumentMutation;
const fileInputRef = useRef<HTMLInputElement>(null);
const folderInputRef = useRef<HTMLInputElement>(null);
const progressIntervalRef = useRef<ReturnType<typeof setInterval> | null>(null);
const [folderUpload, setFolderUpload] = useState<FolderUploadData | null>(null);
const [isFolderUploading, setIsFolderUploading] = useState(false);
useEffect(() => {
return () => {
@ -105,6 +180,7 @@ export function DocumentUploadTab({
const valid = incoming.filter((f) => f.size <= MAX_FILE_SIZE_BYTES);
if (valid.length === 0) return;
setFolderUpload(null);
setFiles((prev) => {
const newEntries = valid.map((f) => ({
id: crypto.randomUUID?.() ?? `file-${Date.now()}-${Math.random().toString(36)}`,
@ -159,6 +235,7 @@ export function DocumentUploadTab({
file: new File([fd.data], fd.name, { type: fd.mimeType }),
})
);
setFolderUpload(null);
setFiles((prev) => [...prev, ...newFiles]);
}, [electronAPI, supportedExtensionsSet, t]);
@ -167,18 +244,35 @@ export function DocumentUploadTab({
const fileList = e.target.files;
if (!fileList || fileList.length === 0) return;
const folderFiles = Array.from(fileList).filter((f) => {
const allFiles = Array.from(fileList);
const firstPath = allFiles[0]?.webkitRelativePath || "";
const folderName = firstPath.split("/")[0];
if (!folderName) {
addFiles(allFiles);
e.target.value = "";
return;
}
const entries: FolderEntry[] = allFiles
.filter((f) => {
const ext = f.name.includes(".") ? `.${f.name.split(".").pop()?.toLowerCase()}` : "";
return ext !== "" && supportedExtensionsSet.has(ext);
});
})
.map((f) => ({
id: crypto.randomUUID?.() ?? `file-${Date.now()}-${Math.random().toString(36)}`,
file: f,
relativePath: f.webkitRelativePath.substring(folderName.length + 1),
}));
if (folderFiles.length === 0) {
if (entries.length === 0) {
toast.error(t("no_supported_files_in_folder"));
e.target.value = "";
return;
}
addFiles(folderFiles);
setFiles([]);
setFolderUpload({ folderName, entries });
e.target.value = "";
},
[addFiles, supportedExtensionsSet, t]
@ -192,9 +286,18 @@ export function DocumentUploadTab({
return `${parseFloat((bytes / k ** i).toFixed(2))} ${sizes[i]}`;
};
const totalFileSize = files.reduce((total, entry) => total + entry.file.size, 0);
const totalFileSize = folderUpload
? folderUpload.entries.reduce((total, entry) => total + entry.file.size, 0)
: files.reduce((total, entry) => total + entry.file.size, 0);
const hasContent = files.length > 0;
const fileCount = folderUpload ? folderUpload.entries.length : files.length;
const hasContent = files.length > 0 || folderUpload !== null;
const isAnyUploading = isUploading || isFolderUploading;
const folderTreeItems = useMemo(() => {
if (!folderUpload) return [];
return flattenTree(buildFolderTree(folderUpload.entries));
}, [folderUpload]);
const handleAccordionChange = useCallback(
(value: string) => {
@ -204,7 +307,95 @@ export function DocumentUploadTab({
[onAccordionStateChange]
);
// Upload the selected folder in size/count-capped batches so a single request
// cannot grow unbounded. The first successful batch creates the folder
// server-side; its id is threaded into every subsequent batch so all files
// land under the same root. Progress is reported per completed batch.
const handleFolderUpload = async () => {
	if (!folderUpload) return;
	setUploadProgress(0);
	setIsFolderUploading(true);
	const total = folderUpload.entries.length;
	trackDocumentUploadStarted(Number(searchSpaceId), total, totalFileSize);
	try {
		// Greedily pack entries into batches of at most FOLDER_BATCH_MAX_FILES
		// files and FOLDER_BATCH_SIZE_BYTES bytes, preserving entry order.
		const batches: FolderEntry[][] = [];
		let currentBatch: FolderEntry[] = [];
		let currentSize = 0;
		for (const entry of folderUpload.entries) {
			const size = entry.file.size;
			if (size >= FOLDER_BATCH_SIZE_BYTES) {
				// A file at/over the byte cap ships alone; flush the in-progress batch first.
				if (currentBatch.length > 0) {
					batches.push(currentBatch);
					currentBatch = [];
					currentSize = 0;
				}
				batches.push([entry]);
				continue;
			}
			// Start a new batch when adding this file would exceed either cap.
			if (
				currentBatch.length >= FOLDER_BATCH_MAX_FILES ||
				currentSize + size > FOLDER_BATCH_SIZE_BYTES
			) {
				batches.push(currentBatch);
				currentBatch = [];
				currentSize = 0;
			}
			currentBatch.push(entry);
			currentSize += size;
		}
		if (currentBatch.length > 0) {
			batches.push(currentBatch);
		}

		// Upload sequentially: later batches reuse the root folder id returned
		// by the first batch so the server groups everything together.
		let rootFolderId: number | null = null;
		let uploaded = 0;
		for (const batch of batches) {
			const result = await documentsApiService.folderUploadFiles(
				batch.map((e) => e.file),
				{
					folder_name: folderUpload.folderName,
					search_space_id: Number(searchSpaceId),
					relative_paths: batch.map((e) => e.relativePath),
					root_folder_id: rootFolderId,
					enable_summary: shouldSummarize,
					use_vision_llm: useVisionLlm,
				}
			);
			if (result.root_folder_id && !rootFolderId) {
				rootFolderId = result.root_folder_id;
			}
			uploaded += batch.length;
			setUploadProgress(Math.round((uploaded / total) * 100));
		}

		trackDocumentUploadSuccess(Number(searchSpaceId), total);
		toast(t("upload_initiated"), { description: t("upload_initiated_desc") });
		setFolderUpload(null);
		onSuccess?.();
	} catch (error) {
		const message = error instanceof Error ? error.message : "Upload failed";
		trackDocumentUploadFailure(Number(searchSpaceId), message);
		toast(t("upload_error"), {
			description: `${t("upload_error_desc")}: ${message}`,
		});
	} finally {
		// Always clear the busy flag and reset the bar, on success or failure.
		setIsFolderUploading(false);
		setUploadProgress(0);
	}
};
const handleUpload = async () => {
if (folderUpload) {
await handleFolderUpload();
return;
}
setUploadProgress(0);
trackDocumentUploadStarted(Number(searchSpaceId), files.length, totalFileSize);
@ -218,6 +409,7 @@ export function DocumentUploadTab({
files: rawFiles,
search_space_id: Number(searchSpaceId),
should_summarize: shouldSummarize,
use_vision_llm: useVisionLlm,
},
{
onSuccess: () => {
@ -341,17 +533,24 @@ export function DocumentUploadTab({
</button>
)
) : (
<button
type="button"
className="flex flex-col items-center gap-4 py-12 px-4 cursor-pointer w-full bg-transparent border-none"
<div
role="button"
tabIndex={0}
className="flex flex-col items-center gap-4 py-12 px-4 cursor-pointer w-full bg-transparent outline-none select-none"
onClick={() => {
if (!isElectron) fileInputRef.current?.click();
}}
onKeyDown={(e) => {
if (e.key === "Enter" || e.key === " ") {
e.preventDefault();
if (!isElectron) fileInputRef.current?.click();
}
}}
>
<Upload className="h-10 w-10 text-muted-foreground" />
<div className="text-center space-y-1.5">
<p className="text-base font-medium">
{isElectron ? "Select files or folder" : "Tap to select files or folder"}
{isElectron ? t("select_files_or_folder") : t("tap_select_files_or_folder")}
</p>
<p className="text-sm text-muted-foreground">{t("file_size_limit")}</p>
</div>
@ -362,7 +561,7 @@ export function DocumentUploadTab({
>
{renderBrowseButton({ fullWidth: true })}
</fieldset>
</button>
</div>
)}
</div>
@ -398,27 +597,64 @@ export function DocumentUploadTab({
</div>
{/* FILES SELECTED */}
{files.length > 0 && (
{hasContent && (
<div className="rounded-lg border border-border p-3 space-y-2">
<div className="flex items-center justify-between">
<p className="text-sm font-medium">
{folderUpload ? (
<>
<FolderOpen className="inline h-4 w-4 mr-1 -mt-0.5" />
{folderUpload.folderName}
<Dot className="inline h-4 w-4" />
{folderUpload.entries.length}{" "}
{folderUpload.entries.length === 1 ? "file" : "files"}
<Dot className="inline h-4 w-4" />
{formatFileSize(totalFileSize)}
</>
) : (
<>
{t("selected_files", { count: files.length })}
<Dot className="inline h-4 w-4" />
{formatFileSize(totalFileSize)}
</>
)}
</p>
<Button
variant="ghost"
size="sm"
className="h-7 text-xs text-muted-foreground hover:text-foreground"
onClick={() => setFiles([])}
disabled={isUploading}
onClick={() => {
setFiles([]);
setFolderUpload(null);
}}
disabled={isAnyUploading}
>
{t("clear_all")}
</Button>
</div>
<div className="max-h-[160px] sm:max-h-[200px] overflow-y-auto -mx-1">
{files.map((entry) => (
{folderUpload
? folderTreeItems.map((item, i) => (
<div
key={`${item.depth}-${i}-${item.name}`}
className="flex items-center gap-1.5 py-0.5 px-2"
style={{ paddingLeft: `${item.depth * 16 + 8}px` }}
>
{item.isFolder ? (
<FolderOpen className="h-3.5 w-3.5 text-blue-400 shrink-0" />
) : (
<FileIcon className="h-3.5 w-3.5 text-muted-foreground shrink-0" />
)}
<span className="text-sm truncate flex-1 min-w-0">{item.name}</span>
{!item.isFolder && item.size != null && (
<span className="text-xs text-muted-foreground shrink-0">
{formatFileSize(item.size)}
</span>
)}
</div>
))
: files.map((entry) => (
<div
key={entry.id}
className="flex items-center gap-2 py-1.5 px-2 rounded-md hover:bg-slate-400/5 dark:hover:bg-white/5 group"
@ -435,7 +671,7 @@ export function DocumentUploadTab({
size="icon"
className="h-6 w-6 shrink-0"
onClick={() => setFiles((prev) => prev.filter((e) => e.id !== entry.id))}
disabled={isUploading}
disabled={isAnyUploading}
>
<X className="h-3 w-3" />
</Button>
@ -443,10 +679,10 @@ export function DocumentUploadTab({
))}
</div>
{isUploading && (
{isAnyUploading && (
<div className="space-y-1">
<div className="flex items-center justify-between text-xs">
<span>{t("uploading_files")}</span>
<span>{folderUpload ? t("uploading_folder") : t("uploading_files")}</span>
<span>{Math.round(uploadProgress)}%</span>
</div>
<Progress value={uploadProgress} className="h-1.5" />
@ -463,19 +699,31 @@ export function DocumentUploadTab({
<Switch checked={shouldSummarize} onCheckedChange={setShouldSummarize} />
</div>
<div className={toggleRowClass}>
<div className="space-y-0.5">
<p className="font-medium text-sm">Enable Vision LLM</p>
<p className="text-xs text-muted-foreground">
Describes images using AI vision (costly, slower)
</p>
</div>
<Switch checked={useVisionLlm} onCheckedChange={setUseVisionLlm} />
</div>
<Button
className="w-full"
onClick={handleUpload}
disabled={isUploading || files.length === 0}
disabled={isAnyUploading || fileCount === 0}
>
{isUploading ? (
{isAnyUploading ? (
<span className="flex items-center gap-2">
<Spinner size="sm" />
{t("uploading")}
</span>
) : (
<span className="flex items-center gap-2">
{t("upload_button", { count: files.length })}
{folderUpload
? t("upload_folder_button", { count: fileCount })
: t("upload_button", { count: fileCount })}
</span>
)}
</Button>

View file

@ -44,6 +44,7 @@ export const searchSourceConnector = z.object({
last_indexed_at: z.string().nullable(),
config: z.record(z.string(), z.any()),
enable_summary: z.boolean().default(false),
enable_vision_llm: z.boolean().default(false),
periodic_indexing_enabled: z.boolean(),
indexing_frequency_minutes: z.number().nullable(),
next_scheduled_at: z.string().nullable(),
@ -98,6 +99,7 @@ export const createConnectorRequest = z.object({
last_indexed_at: true,
config: true,
enable_summary: true,
enable_vision_llm: true,
periodic_indexing_enabled: true,
indexing_frequency_minutes: true,
next_scheduled_at: true,
@ -123,6 +125,7 @@ export const updateConnectorRequest = z.object({
last_indexed_at: true,
config: true,
enable_summary: true,
enable_vision_llm: true,
periodic_indexing_enabled: true,
indexing_frequency_minutes: true,
next_scheduled_at: true,

View file

@ -148,6 +148,7 @@ export const uploadDocumentRequest = z.object({
files: z.array(z.instanceof(File)),
search_space_id: z.number(),
should_summarize: z.boolean().default(false),
use_vision_llm: z.boolean().default(false),
});
export const uploadDocumentResponse = z.object({

View file

@ -127,7 +127,7 @@ class DocumentsApiService {
throw new ValidationError(`Invalid request: ${errorMessage}`);
}
const { files, search_space_id, should_summarize } = parsedRequest.data;
const { files, search_space_id, should_summarize, use_vision_llm } = parsedRequest.data;
const UPLOAD_BATCH_SIZE = 5;
const batches: File[][] = [];
@ -146,6 +146,7 @@ class DocumentsApiService {
for (const file of batch) formData.append("files", file);
formData.append("search_space_id", String(search_space_id));
formData.append("should_summarize", String(should_summarize));
formData.append("use_vision_llm", String(use_vision_llm));
const controller = new AbortController();
const timeoutId = setTimeout(() => controller.abort(), 120_000);
@ -442,6 +443,7 @@ class DocumentsApiService {
relative_paths: string[];
root_folder_id?: number | null;
enable_summary?: boolean;
use_vision_llm?: boolean;
},
signal?: AbortSignal
): Promise<{ message: string; status: string; root_folder_id: number; file_count: number }> => {
@ -456,6 +458,7 @@ class DocumentsApiService {
formData.append("root_folder_id", String(metadata.root_folder_id));
}
formData.append("enable_summary", String(metadata.enable_summary ?? false));
formData.append("use_vision_llm", String(metadata.use_vision_llm ?? false));
const totalSize = files.reduce((acc, f) => acc + f.size, 0);
const timeoutMs = Math.min(Math.max((totalSize / (1024 * 1024)) * 5000, 30_000), 600_000);

View file

@ -396,7 +396,11 @@
"supported_file_types": "Supported File Types",
"file_too_large": "File Too Large",
"file_too_large_desc": "\"{name}\" exceeds the {maxMB}MB per-file limit.",
"no_supported_files_in_folder": "No supported file types found in the selected folder."
"no_supported_files_in_folder": "No supported file types found in the selected folder.",
"uploading_folder": "Uploading folder…",
"upload_folder_button": "Upload Folder ({count} {count, plural, one {file} other {files}})",
"select_files_or_folder": "Select files or folder",
"tap_select_files_or_folder": "Tap to select files or folder"
},
"add_webpage": {
"title": "Add Webpages for Crawling",

View file

@ -396,7 +396,11 @@
"supported_file_types": "Tipos de archivo soportados",
"file_too_large": "Archivo demasiado grande",
"file_too_large_desc": "\"{name}\" excede el límite de {maxMB} MB por archivo.",
"no_supported_files_in_folder": "No se encontraron tipos de archivo compatibles en la carpeta seleccionada."
"no_supported_files_in_folder": "No se encontraron tipos de archivo compatibles en la carpeta seleccionada.",
"uploading_folder": "Subiendo carpeta…",
"upload_folder_button": "Subir carpeta ({count} {count, plural, one {archivo} other {archivos}})",
"select_files_or_folder": "Seleccionar archivos o carpeta",
"tap_select_files_or_folder": "Toca para seleccionar archivos o carpeta"
},
"add_webpage": {
"title": "Agregar páginas web para rastreo",

View file

@ -396,7 +396,11 @@
"supported_file_types": "समर्थित फ़ाइल प्रकार",
"file_too_large": "फ़ाइल बहुत बड़ी है",
"file_too_large_desc": "\"{name}\" प्रति फ़ाइल {maxMB}MB की सीमा से अधिक है।",
"no_supported_files_in_folder": "चयनित फ़ोल्डर में कोई समर्थित फ़ाइल प्रकार नहीं मिला।"
"no_supported_files_in_folder": "चयनित फ़ोल्डर में कोई समर्थित फ़ाइल प्रकार नहीं मिला।",
"uploading_folder": "फ़ोल्डर अपलोड हो रहा है…",
"upload_folder_button": "फ़ोल्डर अपलोड करें ({count} {count, plural, one {फ़ाइल} other {फ़ाइलें}})",
"select_files_or_folder": "फ़ाइलें या फ़ोल्डर चुनें",
"tap_select_files_or_folder": "फ़ाइलें या फ़ोल्डर चुनने के लिए टैप करें"
},
"add_webpage": {
"title": "क्रॉलिंग के लिए वेबपेज जोड़ें",

View file

@ -396,7 +396,11 @@
"supported_file_types": "Tipos de arquivo suportados",
"file_too_large": "Arquivo muito grande",
"file_too_large_desc": "\"{name}\" excede o limite de {maxMB} MB por arquivo.",
"no_supported_files_in_folder": "Nenhum tipo de arquivo suportado encontrado na pasta selecionada."
"no_supported_files_in_folder": "Nenhum tipo de arquivo suportado encontrado na pasta selecionada.",
"uploading_folder": "Enviando pasta…",
"upload_folder_button": "Enviar pasta ({count} {count, plural, one {arquivo} other {arquivos}})",
"select_files_or_folder": "Selecionar arquivos ou pasta",
"tap_select_files_or_folder": "Toque para selecionar arquivos ou pasta"
},
"add_webpage": {
"title": "Adicionar páginas web para rastreamento",

View file

@ -380,7 +380,11 @@
"supported_file_types": "支持的文件类型",
"file_too_large": "文件过大",
"file_too_large_desc": "\"{name}\" 超过了每个文件 {maxMB}MB 的限制。",
"no_supported_files_in_folder": "所选文件夹中没有找到支持的文件类型。"
"no_supported_files_in_folder": "所选文件夹中没有找到支持的文件类型。",
"uploading_folder": "正在上传文件夹…",
"upload_folder_button": "上传文件夹({count}个文件)",
"select_files_or_folder": "选择文件或文件夹",
"tap_select_files_or_folder": "点击选择文件或文件夹"
},
"add_webpage": {
"title": "添加网页爬取",