diff --git a/surfsense_backend/app/connectors/dropbox/client.py b/surfsense_backend/app/connectors/dropbox/client.py index dfae38f66..e89800191 100644 --- a/surfsense_backend/app/connectors/dropbox/client.py +++ b/surfsense_backend/app/connectors/dropbox/client.py @@ -225,6 +225,55 @@ class DropboxClient: return all_items, None + async def get_latest_cursor(self, path: str = "") -> tuple[str | None, str | None]: + """Get a cursor representing the current state of a folder. + + Uses /2/files/list_folder/get_latest_cursor so we can later call + get_changes to receive only incremental updates. + """ + resp = await self._request( + "/2/files/list_folder/get_latest_cursor", + {"path": path, "recursive": False, "include_non_downloadable_files": True}, + ) + if resp.status_code != 200: + return None, f"Failed to get cursor: {resp.status_code} - {resp.text}" + return resp.json().get("cursor"), None + + async def get_changes( + self, cursor: str + ) -> tuple[list[dict[str, Any]], str | None, str | None]: + """Fetch incremental changes since the given cursor. + + Calls /2/files/list_folder/continue and handles pagination. + Returns (entries, new_cursor, error). 
+ """ + all_entries: list[dict[str, Any]] = [] + + resp = await self._request("/2/files/list_folder/continue", {"cursor": cursor}) + if resp.status_code == 401: + return [], None, "Dropbox authentication expired (401)" + if resp.status_code != 200: + return [], None, f"Failed to get changes: {resp.status_code} - {resp.text}" + + data = resp.json() + all_entries.extend(data.get("entries", [])) + + while data.get("has_more"): + cursor = data["cursor"] + resp = await self._request( + "/2/files/list_folder/continue", {"cursor": cursor} + ) + if resp.status_code != 200: + return ( + all_entries, + data.get("cursor"), + f"Pagination failed: {resp.status_code}", + ) + data = resp.json() + all_entries.extend(data.get("entries", [])) + + return all_entries, data.get("cursor"), None + async def get_metadata(self, path: str) -> tuple[dict[str, Any] | None, str | None]: resp = await self._request("/2/files/get_metadata", {"path": path}) if resp.status_code != 200: diff --git a/surfsense_backend/app/connectors/dropbox/content_extractor.py b/surfsense_backend/app/connectors/dropbox/content_extractor.py index e89893b14..8cbc3e417 100644 --- a/surfsense_backend/app/connectors/dropbox/content_extractor.py +++ b/surfsense_backend/app/connectors/dropbox/content_extractor.py @@ -53,7 +53,8 @@ async def download_and_extract_content( file_name = file.get("name", "Unknown") file_id = file.get("id", "") - if should_skip_file(file): + skip, _unsup_ext = should_skip_file(file) + if skip: return None, {}, "Skipping non-indexable item" logger.info(f"Downloading file for content extraction: {file_name}") @@ -87,9 +88,13 @@ async def download_and_extract_content( if error: return None, metadata, error - from app.connectors.onedrive.content_extractor import _parse_file_to_markdown + from app.etl_pipeline.etl_document import EtlRequest + from app.etl_pipeline.etl_pipeline_service import EtlPipelineService - markdown = await _parse_file_to_markdown(temp_file_path, file_name) + result = await 
EtlPipelineService().extract( + EtlRequest(file_path=temp_file_path, filename=file_name) + ) + markdown = result.markdown_content return markdown, metadata, None except Exception as e: diff --git a/surfsense_backend/app/connectors/dropbox/file_types.py b/surfsense_backend/app/connectors/dropbox/file_types.py index e6d772a1c..d26306665 100644 --- a/surfsense_backend/app/connectors/dropbox/file_types.py +++ b/surfsense_backend/app/connectors/dropbox/file_types.py @@ -1,8 +1,8 @@ """File type handlers for Dropbox.""" -PAPER_EXTENSION = ".paper" +from app.etl_pipeline.file_classifier import should_skip_for_service -SKIP_EXTENSIONS: frozenset[str] = frozenset() +PAPER_EXTENSION = ".paper" MIME_TO_EXTENSION: dict[str, str] = { "application/pdf": ".pdf", @@ -42,17 +42,25 @@ def is_paper_file(item: dict) -> bool: return ext == PAPER_EXTENSION -def should_skip_file(item: dict) -> bool: +def should_skip_file(item: dict) -> tuple[bool, str | None]: """Skip folders and truly non-indexable files. Paper docs are non-downloadable but exportable, so they are NOT skipped. + Returns (should_skip, unsupported_extension_or_None). 
""" if is_folder(item): - return True + return True, None if is_paper_file(item): - return False + return False, None if not item.get("is_downloadable", True): - return True + return True, None + + from pathlib import PurePosixPath + + from app.config import config as app_config + name = item.get("name", "") - ext = get_extension_from_name(name).lower() - return ext in SKIP_EXTENSIONS + if should_skip_for_service(name, app_config.ETL_SERVICE): + ext = PurePosixPath(name).suffix.lower() + return True, ext + return False, None diff --git a/surfsense_backend/app/connectors/dropbox/folder_manager.py b/surfsense_backend/app/connectors/dropbox/folder_manager.py index 5453c8785..f9aa78873 100644 --- a/surfsense_backend/app/connectors/dropbox/folder_manager.py +++ b/surfsense_backend/app/connectors/dropbox/folder_manager.py @@ -64,8 +64,10 @@ async def get_files_in_folder( ) continue files.extend(sub_files) - elif not should_skip_file(item): - files.append(item) + else: + skip, _unsup_ext = should_skip_file(item) + if not skip: + files.append(item) return files, None diff --git a/surfsense_backend/app/connectors/google_drive/content_extractor.py b/surfsense_backend/app/connectors/google_drive/content_extractor.py index 1e94133b4..83ff32e82 100644 --- a/surfsense_backend/app/connectors/google_drive/content_extractor.py +++ b/surfsense_backend/app/connectors/google_drive/content_extractor.py @@ -1,12 +1,9 @@ """Content extraction for Google Drive files.""" -import asyncio import contextlib import logging import os import tempfile -import threading -import time from pathlib import Path from typing import Any @@ -20,6 +17,7 @@ from .file_types import ( get_export_mime_type, get_extension_from_mime, is_google_workspace_file, + should_skip_by_extension, should_skip_file, ) @@ -45,6 +43,11 @@ async def download_and_extract_content( if should_skip_file(mime_type): return None, {}, f"Skipping {mime_type}" + if not is_google_workspace_file(mime_type): + ext_skip, _unsup_ext = 
should_skip_by_extension(file_name) + if ext_skip: + return None, {}, f"Skipping unsupported extension: {file_name}" + logger.info(f"Downloading file for content extraction: {file_name} ({mime_type})") drive_metadata: dict[str, Any] = { @@ -97,7 +100,10 @@ async def download_and_extract_content( if error: return None, drive_metadata, error - markdown = await _parse_file_to_markdown(temp_file_path, file_name) + etl_filename = ( + file_name + extension if is_google_workspace_file(mime_type) else file_name + ) + markdown = await _parse_file_to_markdown(temp_file_path, etl_filename) return markdown, drive_metadata, None except Exception as e: @@ -110,99 +116,14 @@ async def download_and_extract_content( async def _parse_file_to_markdown(file_path: str, filename: str) -> str: - """Parse a local file to markdown using the configured ETL service.""" - lower = filename.lower() + """Parse a local file to markdown using the unified ETL pipeline.""" + from app.etl_pipeline.etl_document import EtlRequest + from app.etl_pipeline.etl_pipeline_service import EtlPipelineService - if lower.endswith((".md", ".markdown", ".txt")): - with open(file_path, encoding="utf-8") as f: - return f.read() - - if lower.endswith((".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm")): - from litellm import atranscription - - from app.config import config as app_config - - stt_service_type = ( - "local" - if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/") - else "external" - ) - if stt_service_type == "local": - from app.services.stt_service import stt_service - - t0 = time.monotonic() - logger.info( - f"[local-stt] START file={filename} thread={threading.current_thread().name}" - ) - result = await asyncio.to_thread(stt_service.transcribe_file, file_path) - logger.info( - f"[local-stt] END file={filename} elapsed={time.monotonic() - t0:.2f}s" - ) - text = result.get("text", "") - else: - with open(file_path, "rb") as audio_file: - kwargs: dict[str, Any] = { - "model": 
app_config.STT_SERVICE, - "file": audio_file, - "api_key": app_config.STT_SERVICE_API_KEY, - } - if app_config.STT_SERVICE_API_BASE: - kwargs["api_base"] = app_config.STT_SERVICE_API_BASE - resp = await atranscription(**kwargs) - text = resp.get("text", "") - - if not text: - raise ValueError("Transcription returned empty text") - return f"# Transcription of {filename}\n\n{text}" - - # Document files -- use configured ETL service - from app.config import config as app_config - - if app_config.ETL_SERVICE == "UNSTRUCTURED": - from langchain_unstructured import UnstructuredLoader - - from app.utils.document_converters import convert_document_to_markdown - - loader = UnstructuredLoader( - file_path, - mode="elements", - post_processors=[], - languages=["eng"], - include_orig_elements=False, - include_metadata=False, - strategy="auto", - ) - docs = await loader.aload() - return await convert_document_to_markdown(docs) - - if app_config.ETL_SERVICE == "LLAMACLOUD": - from app.tasks.document_processors.file_processors import ( - parse_with_llamacloud_retry, - ) - - result = await parse_with_llamacloud_retry( - file_path=file_path, estimated_pages=50 - ) - markdown_documents = await result.aget_markdown_documents(split_by_page=False) - if not markdown_documents: - raise RuntimeError(f"LlamaCloud returned no documents for {filename}") - return markdown_documents[0].text - - if app_config.ETL_SERVICE == "DOCLING": - from docling.document_converter import DocumentConverter - - converter = DocumentConverter() - t0 = time.monotonic() - logger.info( - f"[docling] START file={filename} thread={threading.current_thread().name}" - ) - result = await asyncio.to_thread(converter.convert, file_path) - logger.info( - f"[docling] END file={filename} elapsed={time.monotonic() - t0:.2f}s" - ) - return result.document.export_to_markdown() - - raise RuntimeError(f"Unknown ETL_SERVICE: {app_config.ETL_SERVICE}") + result = await EtlPipelineService().extract( + 
EtlRequest(file_path=file_path, filename=filename) + ) + return result.markdown_content async def download_and_process_file( @@ -236,10 +157,14 @@ async def download_and_process_file( file_name = file.get("name", "Unknown") mime_type = file.get("mimeType", "") - # Skip folders and shortcuts if should_skip_file(mime_type): return None, f"Skipping {mime_type}", None + if not is_google_workspace_file(mime_type): + ext_skip, _unsup_ext = should_skip_by_extension(file_name) + if ext_skip: + return None, f"Skipping unsupported extension: {file_name}", None + logger.info(f"Downloading file: {file_name} ({mime_type})") temp_file_path = None @@ -310,10 +235,13 @@ async def download_and_process_file( "." )[-1] + etl_filename = ( + file_name + extension if is_google_workspace_file(mime_type) else file_name + ) logger.info(f"Processing {file_name} with Surfsense's file processor") await process_file_in_background( file_path=temp_file_path, - filename=file_name, + filename=etl_filename, search_space_id=search_space_id, user_id=user_id, session=session, diff --git a/surfsense_backend/app/connectors/google_drive/file_types.py b/surfsense_backend/app/connectors/google_drive/file_types.py index dd6aff4d7..75dc1d4b3 100644 --- a/surfsense_backend/app/connectors/google_drive/file_types.py +++ b/surfsense_backend/app/connectors/google_drive/file_types.py @@ -1,5 +1,7 @@ """File type handlers for Google Drive.""" +from app.etl_pipeline.file_classifier import should_skip_for_service + GOOGLE_DOC = "application/vnd.google-apps.document" GOOGLE_SHEET = "application/vnd.google-apps.spreadsheet" GOOGLE_SLIDE = "application/vnd.google-apps.presentation" @@ -46,6 +48,21 @@ def should_skip_file(mime_type: str) -> bool: return mime_type in [GOOGLE_FOLDER, GOOGLE_SHORTCUT] +def should_skip_by_extension(filename: str) -> tuple[bool, str | None]: + """Check if the file extension is not parseable by the configured ETL service. + + Returns (should_skip, unsupported_extension_or_None). 
+ """ + from pathlib import PurePosixPath + + from app.config import config as app_config + + if should_skip_for_service(filename, app_config.ETL_SERVICE): + ext = PurePosixPath(filename).suffix.lower() + return True, ext + return False, None + + def get_export_mime_type(mime_type: str) -> str | None: """Get export MIME type for Google Workspace files.""" return EXPORT_FORMATS.get(mime_type) diff --git a/surfsense_backend/app/connectors/onedrive/content_extractor.py b/surfsense_backend/app/connectors/onedrive/content_extractor.py index 8917ba1fd..2238b8603 100644 --- a/surfsense_backend/app/connectors/onedrive/content_extractor.py +++ b/surfsense_backend/app/connectors/onedrive/content_extractor.py @@ -1,16 +1,9 @@ -"""Content extraction for OneDrive files. +"""Content extraction for OneDrive files.""" -Reuses the same ETL parsing logic as Google Drive since file parsing is -extension-based, not provider-specific. -""" - -import asyncio import contextlib import logging import os import tempfile -import threading -import time from pathlib import Path from typing import Any @@ -31,7 +24,8 @@ async def download_and_extract_content( item_id = file.get("id") file_name = file.get("name", "Unknown") - if should_skip_file(file): + skip, _unsup_ext = should_skip_file(file) + if skip: return None, {}, "Skipping non-indexable item" file_info = file.get("file", {}) @@ -84,98 +78,11 @@ async def download_and_extract_content( async def _parse_file_to_markdown(file_path: str, filename: str) -> str: - """Parse a local file to markdown using the configured ETL service. + """Parse a local file to markdown using the unified ETL pipeline.""" + from app.etl_pipeline.etl_document import EtlRequest + from app.etl_pipeline.etl_pipeline_service import EtlPipelineService - Same logic as Google Drive -- file parsing is extension-based. 
- """ - lower = filename.lower() - - if lower.endswith((".md", ".markdown", ".txt")): - with open(file_path, encoding="utf-8") as f: - return f.read() - - if lower.endswith((".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm")): - from litellm import atranscription - - from app.config import config as app_config - - stt_service_type = ( - "local" - if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/") - else "external" - ) - if stt_service_type == "local": - from app.services.stt_service import stt_service - - t0 = time.monotonic() - logger.info( - f"[local-stt] START file={filename} thread={threading.current_thread().name}" - ) - result = await asyncio.to_thread(stt_service.transcribe_file, file_path) - logger.info( - f"[local-stt] END file={filename} elapsed={time.monotonic() - t0:.2f}s" - ) - text = result.get("text", "") - else: - with open(file_path, "rb") as audio_file: - kwargs: dict[str, Any] = { - "model": app_config.STT_SERVICE, - "file": audio_file, - "api_key": app_config.STT_SERVICE_API_KEY, - } - if app_config.STT_SERVICE_API_BASE: - kwargs["api_base"] = app_config.STT_SERVICE_API_BASE - resp = await atranscription(**kwargs) - text = resp.get("text", "") - - if not text: - raise ValueError("Transcription returned empty text") - return f"# Transcription of {filename}\n\n{text}" - - from app.config import config as app_config - - if app_config.ETL_SERVICE == "UNSTRUCTURED": - from langchain_unstructured import UnstructuredLoader - - from app.utils.document_converters import convert_document_to_markdown - - loader = UnstructuredLoader( - file_path, - mode="elements", - post_processors=[], - languages=["eng"], - include_orig_elements=False, - include_metadata=False, - strategy="auto", - ) - docs = await loader.aload() - return await convert_document_to_markdown(docs) - - if app_config.ETL_SERVICE == "LLAMACLOUD": - from app.tasks.document_processors.file_processors import ( - parse_with_llamacloud_retry, - ) - - result = await 
parse_with_llamacloud_retry( - file_path=file_path, estimated_pages=50 - ) - markdown_documents = await result.aget_markdown_documents(split_by_page=False) - if not markdown_documents: - raise RuntimeError(f"LlamaCloud returned no documents for {filename}") - return markdown_documents[0].text - - if app_config.ETL_SERVICE == "DOCLING": - from docling.document_converter import DocumentConverter - - converter = DocumentConverter() - t0 = time.monotonic() - logger.info( - f"[docling] START file={filename} thread={threading.current_thread().name}" - ) - result = await asyncio.to_thread(converter.convert, file_path) - logger.info( - f"[docling] END file={filename} elapsed={time.monotonic() - t0:.2f}s" - ) - return result.document.export_to_markdown() - - raise RuntimeError(f"Unknown ETL_SERVICE: {app_config.ETL_SERVICE}") + result = await EtlPipelineService().extract( + EtlRequest(file_path=file_path, filename=filename) + ) + return result.markdown_content diff --git a/surfsense_backend/app/connectors/onedrive/file_types.py b/surfsense_backend/app/connectors/onedrive/file_types.py index 403fdc337..942b0be73 100644 --- a/surfsense_backend/app/connectors/onedrive/file_types.py +++ b/surfsense_backend/app/connectors/onedrive/file_types.py @@ -1,5 +1,7 @@ """File type handlers for Microsoft OneDrive.""" +from app.etl_pipeline.file_classifier import should_skip_for_service + ONEDRIVE_FOLDER_FACET = "folder" ONENOTE_MIME = "application/msonenote" @@ -38,13 +40,28 @@ def is_folder(item: dict) -> bool: return ONEDRIVE_FOLDER_FACET in item -def should_skip_file(item: dict) -> bool: - """Skip folders, OneNote files, remote items (shared links), and packages.""" +def should_skip_file(item: dict) -> tuple[bool, str | None]: + """Skip folders, OneNote files, remote items, packages, and unsupported extensions. + + Returns (should_skip, unsupported_extension_or_None). + The second element is only set when the skip is due to an unsupported extension. 
+ """ if is_folder(item): - return True + return True, None if "remoteItem" in item: - return True + return True, None if "package" in item: - return True + return True, None mime = item.get("file", {}).get("mimeType", "") - return mime in SKIP_MIME_TYPES + if mime in SKIP_MIME_TYPES: + return True, None + + from pathlib import PurePosixPath + + from app.config import config as app_config + + name = item.get("name", "") + if should_skip_for_service(name, app_config.ETL_SERVICE): + ext = PurePosixPath(name).suffix.lower() + return True, ext + return False, None diff --git a/surfsense_backend/app/connectors/onedrive/folder_manager.py b/surfsense_backend/app/connectors/onedrive/folder_manager.py index 6fa725ca1..a5d7fa713 100644 --- a/surfsense_backend/app/connectors/onedrive/folder_manager.py +++ b/surfsense_backend/app/connectors/onedrive/folder_manager.py @@ -71,8 +71,10 @@ async def get_files_in_folder( ) continue files.extend(sub_files) - elif not should_skip_file(item): - files.append(item) + else: + skip, _unsup_ext = should_skip_file(item) + if not skip: + files.append(item) return files, None diff --git a/surfsense_backend/app/etl_pipeline/__init__.py b/surfsense_backend/app/etl_pipeline/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/surfsense_backend/app/etl_pipeline/constants.py b/surfsense_backend/app/etl_pipeline/constants.py new file mode 100644 index 000000000..f65759c13 --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/constants.py @@ -0,0 +1,39 @@ +import ssl + +import httpx + +LLAMACLOUD_MAX_RETRIES = 5 +LLAMACLOUD_BASE_DELAY = 10 +LLAMACLOUD_MAX_DELAY = 120 +LLAMACLOUD_RETRYABLE_EXCEPTIONS = ( + ssl.SSLError, + httpx.ConnectError, + httpx.ConnectTimeout, + httpx.ReadError, + httpx.ReadTimeout, + httpx.WriteError, + httpx.WriteTimeout, + httpx.RemoteProtocolError, + httpx.LocalProtocolError, + ConnectionError, + ConnectionResetError, + TimeoutError, + OSError, +) + +UPLOAD_BYTES_PER_SECOND_SLOW = 100 * 1024 
+MIN_UPLOAD_TIMEOUT = 120 +MAX_UPLOAD_TIMEOUT = 1800 +BASE_JOB_TIMEOUT = 600 +PER_PAGE_JOB_TIMEOUT = 60 + + +def calculate_upload_timeout(file_size_bytes: int) -> float: + estimated_time = (file_size_bytes / UPLOAD_BYTES_PER_SECOND_SLOW) * 1.5 + return max(MIN_UPLOAD_TIMEOUT, min(estimated_time, MAX_UPLOAD_TIMEOUT)) + + +def calculate_job_timeout(estimated_pages: int, file_size_bytes: int) -> float: + page_based_timeout = BASE_JOB_TIMEOUT + (estimated_pages * PER_PAGE_JOB_TIMEOUT) + size_based_timeout = BASE_JOB_TIMEOUT + (file_size_bytes / (10 * 1024 * 1024)) * 60 + return max(page_based_timeout, size_based_timeout) diff --git a/surfsense_backend/app/etl_pipeline/etl_document.py b/surfsense_backend/app/etl_pipeline/etl_document.py new file mode 100644 index 000000000..350c3299f --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/etl_document.py @@ -0,0 +1,21 @@ +from pydantic import BaseModel, field_validator + + +class EtlRequest(BaseModel): + file_path: str + filename: str + estimated_pages: int = 0 + + @field_validator("filename") + @classmethod + def filename_must_not_be_empty(cls, v: str) -> str: + if not v.strip(): + raise ValueError("filename must not be empty") + return v + + +class EtlResult(BaseModel): + markdown_content: str + etl_service: str + actual_pages: int = 0 + content_type: str diff --git a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py new file mode 100644 index 000000000..6e7ab3c4c --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py @@ -0,0 +1,90 @@ +from app.config import config as app_config +from app.etl_pipeline.etl_document import EtlRequest, EtlResult +from app.etl_pipeline.exceptions import ( + EtlServiceUnavailableError, + EtlUnsupportedFileError, +) +from app.etl_pipeline.file_classifier import FileCategory, classify_file +from app.etl_pipeline.parsers.audio import transcribe_audio +from app.etl_pipeline.parsers.direct_convert 
import convert_file_directly +from app.etl_pipeline.parsers.plaintext import read_plaintext + + +class EtlPipelineService: + """Single pipeline for extracting markdown from files. All callers use this.""" + + async def extract(self, request: EtlRequest) -> EtlResult: + category = classify_file(request.filename) + + if category == FileCategory.UNSUPPORTED: + raise EtlUnsupportedFileError( + f"File type not supported for parsing: {request.filename}" + ) + + if category == FileCategory.PLAINTEXT: + content = read_plaintext(request.file_path) + return EtlResult( + markdown_content=content, + etl_service="PLAINTEXT", + content_type="plaintext", + ) + + if category == FileCategory.DIRECT_CONVERT: + content = convert_file_directly(request.file_path, request.filename) + return EtlResult( + markdown_content=content, + etl_service="DIRECT_CONVERT", + content_type="direct_convert", + ) + + if category == FileCategory.AUDIO: + content = await transcribe_audio(request.file_path, request.filename) + return EtlResult( + markdown_content=content, + etl_service="AUDIO", + content_type="audio", + ) + + return await self._extract_document(request) + + async def _extract_document(self, request: EtlRequest) -> EtlResult: + from pathlib import PurePosixPath + + from app.utils.file_extensions import get_document_extensions_for_service + + etl_service = app_config.ETL_SERVICE + if not etl_service: + raise EtlServiceUnavailableError( + "No ETL_SERVICE configured. 
" + "Set ETL_SERVICE to UNSTRUCTURED, LLAMACLOUD, or DOCLING in your .env" + ) + + ext = PurePosixPath(request.filename).suffix.lower() + supported = get_document_extensions_for_service(etl_service) + if ext not in supported: + raise EtlUnsupportedFileError( + f"File type {ext} is not supported by {etl_service}" + ) + + if etl_service == "DOCLING": + from app.etl_pipeline.parsers.docling import parse_with_docling + + content = await parse_with_docling(request.file_path, request.filename) + elif etl_service == "UNSTRUCTURED": + from app.etl_pipeline.parsers.unstructured import parse_with_unstructured + + content = await parse_with_unstructured(request.file_path) + elif etl_service == "LLAMACLOUD": + from app.etl_pipeline.parsers.llamacloud import parse_with_llamacloud + + content = await parse_with_llamacloud( + request.file_path, request.estimated_pages + ) + else: + raise EtlServiceUnavailableError(f"Unknown ETL_SERVICE: {etl_service}") + + return EtlResult( + markdown_content=content, + etl_service=etl_service, + content_type="document", + ) diff --git a/surfsense_backend/app/etl_pipeline/exceptions.py b/surfsense_backend/app/etl_pipeline/exceptions.py new file mode 100644 index 000000000..26eecbef4 --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/exceptions.py @@ -0,0 +1,10 @@ +class EtlParseError(Exception): + """Raised when an ETL parser fails to produce content.""" + + +class EtlServiceUnavailableError(Exception): + """Raised when the configured ETL_SERVICE is not recognised.""" + + +class EtlUnsupportedFileError(Exception): + """Raised when a file type cannot be parsed by any ETL pipeline.""" diff --git a/surfsense_backend/app/etl_pipeline/file_classifier.py b/surfsense_backend/app/etl_pipeline/file_classifier.py new file mode 100644 index 000000000..4e690bcdc --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/file_classifier.py @@ -0,0 +1,137 @@ +from enum import Enum +from pathlib import PurePosixPath + +from app.utils.file_extensions import ( + 
DOCUMENT_EXTENSIONS, + get_document_extensions_for_service, +) + +PLAINTEXT_EXTENSIONS = frozenset( + { + ".md", + ".markdown", + ".txt", + ".text", + ".json", + ".jsonl", + ".yaml", + ".yml", + ".toml", + ".ini", + ".cfg", + ".conf", + ".xml", + ".css", + ".scss", + ".less", + ".sass", + ".py", + ".pyw", + ".pyi", + ".pyx", + ".js", + ".jsx", + ".ts", + ".tsx", + ".mjs", + ".cjs", + ".java", + ".kt", + ".kts", + ".scala", + ".groovy", + ".c", + ".h", + ".cpp", + ".cxx", + ".cc", + ".hpp", + ".hxx", + ".cs", + ".fs", + ".fsx", + ".go", + ".rs", + ".rb", + ".php", + ".pl", + ".pm", + ".lua", + ".swift", + ".m", + ".mm", + ".r", + ".jl", + ".sh", + ".bash", + ".zsh", + ".fish", + ".bat", + ".cmd", + ".ps1", + ".sql", + ".graphql", + ".gql", + ".env", + ".gitignore", + ".dockerignore", + ".editorconfig", + ".makefile", + ".cmake", + ".log", + ".rst", + ".tex", + ".bib", + ".org", + ".adoc", + ".asciidoc", + ".vue", + ".svelte", + ".astro", + ".tf", + ".hcl", + ".proto", + } +) + +AUDIO_EXTENSIONS = frozenset( + {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm"} +) + +DIRECT_CONVERT_EXTENSIONS = frozenset({".csv", ".tsv", ".html", ".htm", ".xhtml"}) + + +class FileCategory(Enum): + PLAINTEXT = "plaintext" + AUDIO = "audio" + DIRECT_CONVERT = "direct_convert" + UNSUPPORTED = "unsupported" + DOCUMENT = "document" + + +def classify_file(filename: str) -> FileCategory: + suffix = PurePosixPath(filename).suffix.lower() + if suffix in PLAINTEXT_EXTENSIONS: + return FileCategory.PLAINTEXT + if suffix in AUDIO_EXTENSIONS: + return FileCategory.AUDIO + if suffix in DIRECT_CONVERT_EXTENSIONS: + return FileCategory.DIRECT_CONVERT + if suffix in DOCUMENT_EXTENSIONS: + return FileCategory.DOCUMENT + return FileCategory.UNSUPPORTED + + +def should_skip_for_service(filename: str, etl_service: str | None) -> bool: + """Return True if *filename* cannot be processed by *etl_service*. + + Plaintext, audio, and direct-convert files are parser-agnostic and never + skipped. 
Document files are checked against the per-parser extension set. + """ + category = classify_file(filename) + if category == FileCategory.UNSUPPORTED: + return True + if category == FileCategory.DOCUMENT: + suffix = PurePosixPath(filename).suffix.lower() + return suffix not in get_document_extensions_for_service(etl_service) + return False diff --git a/surfsense_backend/app/etl_pipeline/parsers/__init__.py b/surfsense_backend/app/etl_pipeline/parsers/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/surfsense_backend/app/etl_pipeline/parsers/audio.py b/surfsense_backend/app/etl_pipeline/parsers/audio.py new file mode 100644 index 000000000..cd49bafde --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/parsers/audio.py @@ -0,0 +1,34 @@ +from litellm import atranscription + +from app.config import config as app_config + + +async def transcribe_audio(file_path: str, filename: str) -> str: + stt_service_type = ( + "local" + if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/") + else "external" + ) + + if stt_service_type == "local": + from app.services.stt_service import stt_service + + result = stt_service.transcribe_file(file_path) + text = result.get("text", "") + if not text: + raise ValueError("Transcription returned empty text") + else: + with open(file_path, "rb") as audio_file: + kwargs: dict = { + "model": app_config.STT_SERVICE, + "file": audio_file, + "api_key": app_config.STT_SERVICE_API_KEY, + } + if app_config.STT_SERVICE_API_BASE: + kwargs["api_base"] = app_config.STT_SERVICE_API_BASE + response = await atranscription(**kwargs) + text = response.get("text", "") + if not text: + raise ValueError("Transcription returned empty text") + + return f"# Transcription of {filename}\n\n{text}" diff --git a/surfsense_backend/app/etl_pipeline/parsers/direct_convert.py b/surfsense_backend/app/etl_pipeline/parsers/direct_convert.py new file mode 100644 index 000000000..c9e6e8647 --- /dev/null +++ 
b/surfsense_backend/app/etl_pipeline/parsers/direct_convert.py @@ -0,0 +1,3 @@ +from app.tasks.document_processors._direct_converters import convert_file_directly + +__all__ = ["convert_file_directly"] diff --git a/surfsense_backend/app/etl_pipeline/parsers/docling.py b/surfsense_backend/app/etl_pipeline/parsers/docling.py new file mode 100644 index 000000000..df0498148 --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/parsers/docling.py @@ -0,0 +1,26 @@ +import warnings +from logging import ERROR, getLogger + + +async def parse_with_docling(file_path: str, filename: str) -> str: + from app.services.docling_service import create_docling_service + + docling_service = create_docling_service() + + pdfminer_logger = getLogger("pdfminer") + original_level = pdfminer_logger.level + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=UserWarning, module="pdfminer") + warnings.filterwarnings( + "ignore", message=".*Cannot set gray non-stroke color.*" + ) + warnings.filterwarnings("ignore", message=".*invalid float value.*") + pdfminer_logger.setLevel(ERROR) + + try: + result = await docling_service.process_document(file_path, filename) + finally: + pdfminer_logger.setLevel(original_level) + + return result["content"] diff --git a/surfsense_backend/app/etl_pipeline/parsers/llamacloud.py b/surfsense_backend/app/etl_pipeline/parsers/llamacloud.py new file mode 100644 index 000000000..ae2a34234 --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/parsers/llamacloud.py @@ -0,0 +1,123 @@ +import asyncio +import logging +import os +import random + +import httpx + +from app.config import config as app_config +from app.etl_pipeline.constants import ( + LLAMACLOUD_BASE_DELAY, + LLAMACLOUD_MAX_DELAY, + LLAMACLOUD_MAX_RETRIES, + LLAMACLOUD_RETRYABLE_EXCEPTIONS, + PER_PAGE_JOB_TIMEOUT, + calculate_job_timeout, + calculate_upload_timeout, +) + + +async def parse_with_llamacloud(file_path: str, estimated_pages: int) -> str: + from llama_cloud_services 
import LlamaParse + from llama_cloud_services.parse.utils import ResultType + + file_size_bytes = os.path.getsize(file_path) + file_size_mb = file_size_bytes / (1024 * 1024) + + upload_timeout = calculate_upload_timeout(file_size_bytes) + job_timeout = calculate_job_timeout(estimated_pages, file_size_bytes) + + custom_timeout = httpx.Timeout( + connect=120.0, + read=upload_timeout, + write=upload_timeout, + pool=120.0, + ) + + logging.info( + f"LlamaCloud upload configured: file_size={file_size_mb:.1f}MB, " + f"pages={estimated_pages}, upload_timeout={upload_timeout:.0f}s, " + f"job_timeout={job_timeout:.0f}s" + ) + + last_exception = None + attempt_errors: list[str] = [] + + for attempt in range(1, LLAMACLOUD_MAX_RETRIES + 1): + try: + async with httpx.AsyncClient(timeout=custom_timeout) as custom_client: + parser = LlamaParse( + api_key=app_config.LLAMA_CLOUD_API_KEY, + num_workers=1, + verbose=True, + language="en", + result_type=ResultType.MD, + max_timeout=int(max(2000, job_timeout + upload_timeout)), + job_timeout_in_seconds=job_timeout, + job_timeout_extra_time_per_page_in_seconds=PER_PAGE_JOB_TIMEOUT, + custom_client=custom_client, + ) + result = await parser.aparse(file_path) + + if attempt > 1: + logging.info( + f"LlamaCloud upload succeeded on attempt {attempt} after " + f"{len(attempt_errors)} failures" + ) + + if hasattr(result, "get_markdown_documents"): + markdown_docs = result.get_markdown_documents(split_by_page=False) + if markdown_docs and hasattr(markdown_docs[0], "text"): + return markdown_docs[0].text + if hasattr(result, "pages") and result.pages: + return "\n\n".join( + p.md for p in result.pages if hasattr(p, "md") and p.md + ) + return str(result) + + if isinstance(result, list): + if result and hasattr(result[0], "text"): + return result[0].text + return "\n\n".join( + doc.page_content if hasattr(doc, "page_content") else str(doc) + for doc in result + ) + + return str(result) + + except LLAMACLOUD_RETRYABLE_EXCEPTIONS as e: + 
last_exception = e + error_type = type(e).__name__ + error_msg = str(e)[:200] + attempt_errors.append(f"Attempt {attempt}: {error_type} - {error_msg}") + + if attempt < LLAMACLOUD_MAX_RETRIES: + base_delay = min( + LLAMACLOUD_BASE_DELAY * (2 ** (attempt - 1)), + LLAMACLOUD_MAX_DELAY, + ) + jitter = base_delay * 0.25 * (2 * random.random() - 1) + delay = base_delay + jitter + + logging.warning( + f"LlamaCloud upload failed " + f"(attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}): " + f"{error_type}. File: {file_size_mb:.1f}MB. " + f"Retrying in {delay:.0f}s..." + ) + await asyncio.sleep(delay) + else: + logging.error( + f"LlamaCloud upload failed after {LLAMACLOUD_MAX_RETRIES} " + f"attempts. File size: {file_size_mb:.1f}MB, " + f"Pages: {estimated_pages}. " + f"Errors: {'; '.join(attempt_errors)}" + ) + + except Exception: + raise + + raise last_exception or RuntimeError( + f"LlamaCloud parsing failed after {LLAMACLOUD_MAX_RETRIES} retries. " + f"File size: {file_size_mb:.1f}MB" + ) diff --git a/surfsense_backend/app/etl_pipeline/parsers/plaintext.py b/surfsense_backend/app/etl_pipeline/parsers/plaintext.py new file mode 100644 index 000000000..24bfb71e5 --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/parsers/plaintext.py @@ -0,0 +1,8 @@ +def read_plaintext(file_path: str) -> str: + with open(file_path, encoding="utf-8", errors="replace") as f: + content = f.read() + if "\x00" in content: + raise ValueError( + f"File contains null bytes — likely a binary file opened as text: {file_path}" + ) + return content diff --git a/surfsense_backend/app/etl_pipeline/parsers/unstructured.py b/surfsense_backend/app/etl_pipeline/parsers/unstructured.py new file mode 100644 index 000000000..af8fb99b6 --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/parsers/unstructured.py @@ -0,0 +1,14 @@ +async def parse_with_unstructured(file_path: str) -> str: + from langchain_unstructured import UnstructuredLoader + + loader = UnstructuredLoader( + file_path, + mode="elements", + 
post_processors=[], + languages=["eng"], + include_orig_elements=False, + include_metadata=False, + strategy="auto", + ) + docs = await loader.aload() + return "\n\n".join(doc.page_content for doc in docs if doc.page_content) diff --git a/surfsense_backend/app/routes/autocomplete_routes.py b/surfsense_backend/app/routes/autocomplete_routes.py index bb56709cb..a11b7dbc1 100644 --- a/surfsense_backend/app/routes/autocomplete_routes.py +++ b/surfsense_backend/app/routes/autocomplete_routes.py @@ -1,4 +1,4 @@ -from fastapi import APIRouter, Depends, HTTPException +from fastapi import APIRouter, Depends from fastapi.responses import StreamingResponse from pydantic import BaseModel, Field from sqlalchemy.ext.asyncio import AsyncSession @@ -31,8 +31,11 @@ async def vision_autocomplete_stream( return StreamingResponse( stream_vision_autocomplete( - body.screenshot, body.search_space_id, session, - app_name=body.app_name, window_title=body.window_title, + body.screenshot, + body.search_space_id, + session, + app_name=body.app_name, + window_title=body.window_title, ), media_type="text/event-stream", headers={ diff --git a/surfsense_backend/app/routes/dropbox_add_connector_route.py b/surfsense_backend/app/routes/dropbox_add_connector_route.py index 941e5c00f..1dba64467 100644 --- a/surfsense_backend/app/routes/dropbox_add_connector_route.py +++ b/surfsense_backend/app/routes/dropbox_add_connector_route.py @@ -311,9 +311,11 @@ async def dropbox_callback( ) existing_cursor = db_connector.config.get("cursor") + existing_folder_cursors = db_connector.config.get("folder_cursors") db_connector.config = { **connector_config, "cursor": existing_cursor, + "folder_cursors": existing_folder_cursors, "auth_expired": False, } flag_modified(db_connector, "config") diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py index d208ff910..bb20da65d 100644 --- 
a/surfsense_backend/app/routes/search_source_connectors_routes.py +++ b/surfsense_backend/app/routes/search_source_connectors_routes.py @@ -2477,6 +2477,8 @@ async def run_google_drive_indexing( stage="fetching", ) + total_unsupported = 0 + # Index each folder with indexing options for folder in items.folders: try: @@ -2484,6 +2486,7 @@ async def run_google_drive_indexing( indexed_count, skipped_count, error_message, + unsupported_count, ) = await index_google_drive_files( session, connector_id, @@ -2497,6 +2500,7 @@ async def run_google_drive_indexing( include_subfolders=indexing_options.include_subfolders, ) total_skipped += skipped_count + total_unsupported += unsupported_count if error_message: errors.append(f"Folder '{folder.name}': {error_message}") else: @@ -2572,6 +2576,7 @@ async def run_google_drive_indexing( indexed_count=total_indexed, error_message=error_message, skipped_count=total_skipped, + unsupported_count=total_unsupported, ) except Exception as e: @@ -2642,7 +2647,12 @@ async def run_onedrive_indexing( stage="fetching", ) - total_indexed, total_skipped, error_message = await index_onedrive_files( + ( + total_indexed, + total_skipped, + error_message, + total_unsupported, + ) = await index_onedrive_files( session, connector_id, search_space_id, @@ -2683,6 +2693,7 @@ async def run_onedrive_indexing( indexed_count=total_indexed, error_message=error_message, skipped_count=total_skipped, + unsupported_count=total_unsupported, ) except Exception as e: @@ -2750,7 +2761,12 @@ async def run_dropbox_indexing( stage="fetching", ) - total_indexed, total_skipped, error_message = await index_dropbox_files( + ( + total_indexed, + total_skipped, + error_message, + total_unsupported, + ) = await index_dropbox_files( session, connector_id, search_space_id, @@ -2791,6 +2807,7 @@ async def run_dropbox_indexing( indexed_count=total_indexed, error_message=error_message, skipped_count=total_skipped, + unsupported_count=total_unsupported, ) except Exception as e: diff 
--git a/surfsense_backend/app/services/docling_service.py b/surfsense_backend/app/services/docling_service.py index 82eaf7f74..af9a7d2d5 100644 --- a/surfsense_backend/app/services/docling_service.py +++ b/surfsense_backend/app/services/docling_service.py @@ -111,9 +111,8 @@ class DoclingService: pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend ) - # Initialize DocumentConverter self.converter = DocumentConverter( - format_options={InputFormat.PDF: pdf_format_option} + format_options={InputFormat.PDF: pdf_format_option}, ) acceleration_type = "GPU (WSL2)" if self.use_gpu else "CPU" diff --git a/surfsense_backend/app/services/notification_service.py b/surfsense_backend/app/services/notification_service.py index 5e40a3b42..5ffee12d7 100644 --- a/surfsense_backend/app/services/notification_service.py +++ b/surfsense_backend/app/services/notification_service.py @@ -421,6 +421,7 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler): error_message: str | None = None, is_warning: bool = False, skipped_count: int | None = None, + unsupported_count: int | None = None, ) -> Notification: """ Update notification when connector indexing completes. 
@@ -428,10 +429,11 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler): Args: session: Database session notification: Notification to update - indexed_count: Total number of items indexed + indexed_count: Total number of files indexed error_message: Error message if indexing failed, or warning message (optional) is_warning: If True, treat error_message as a warning (success case) rather than an error - skipped_count: Number of items skipped (e.g., duplicates) - optional + skipped_count: Number of files skipped (e.g., unchanged) - optional + unsupported_count: Number of files skipped because the ETL parser doesn't support them Returns: Updated notification @@ -440,52 +442,45 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler): "connector_name", "Connector" ) - # Build the skipped text if there are skipped items - skipped_text = "" - if skipped_count and skipped_count > 0: - skipped_item_text = "item" if skipped_count == 1 else "items" - skipped_text = ( - f" ({skipped_count} {skipped_item_text} skipped - already indexed)" - ) + unsupported_text = "" + if unsupported_count and unsupported_count > 0: + file_word = "file was" if unsupported_count == 1 else "files were" + unsupported_text = f" {unsupported_count} {file_word} not supported." - # If there's an error message but items were indexed, treat it as a warning (partial success) - # If is_warning is True, treat it as success even with 0 items (e.g., duplicates found) - # Otherwise, treat it as a failure if error_message: if indexed_count > 0: - # Partial success with warnings (e.g., duplicate content from other connectors) title = f"Ready: {connector_name}" - item_text = "item" if indexed_count == 1 else "items" - message = f"Now searchable! {indexed_count} {item_text} synced{skipped_text}. Note: {error_message}" + file_text = "file" if indexed_count == 1 else "files" + message = f"Now searchable! 
{indexed_count} {file_text} synced.{unsupported_text} Note: {error_message}" status = "completed" elif is_warning: - # Warning case (e.g., duplicates found) - treat as success title = f"Ready: {connector_name}" - message = f"Sync completed{skipped_text}. {error_message}" + message = f"Sync complete.{unsupported_text} {error_message}" status = "completed" else: - # Complete failure title = f"Failed: {connector_name}" message = f"Sync failed: {error_message}" + if unsupported_text: + message += unsupported_text status = "failed" else: title = f"Ready: {connector_name}" if indexed_count == 0: - if skipped_count and skipped_count > 0: - skipped_item_text = "item" if skipped_count == 1 else "items" - message = f"Already up to date! {skipped_count} {skipped_item_text} skipped (already indexed)." + if unsupported_count and unsupported_count > 0: + message = f"Sync complete.{unsupported_text}" else: - message = "Already up to date! No new items to sync." + message = "Already up to date!" else: - item_text = "item" if indexed_count == 1 else "items" - message = ( - f"Now searchable! {indexed_count} {item_text} synced{skipped_text}." - ) + file_text = "file" if indexed_count == 1 else "files" + message = f"Now searchable! {indexed_count} {file_text} synced." 
+ if unsupported_text: + message += unsupported_text status = "completed" metadata_updates = { "indexed_count": indexed_count, "skipped_count": skipped_count or 0, + "unsupported_count": unsupported_count or 0, "sync_stage": "completed" if (not error_message or is_warning or indexed_count > 0) else "failed", diff --git a/surfsense_backend/app/services/vision_autocomplete_service.py b/surfsense_backend/app/services/vision_autocomplete_service.py index 7d16c5864..2c2cd65d2 100644 --- a/surfsense_backend/app/services/vision_autocomplete_service.py +++ b/surfsense_backend/app/services/vision_autocomplete_service.py @@ -8,7 +8,7 @@ Optimized pipeline: """ import logging -from typing import AsyncGenerator +from collections.abc import AsyncGenerator from langchain_core.messages import HumanMessage from sqlalchemy.ext.asyncio import AsyncSession diff --git a/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py b/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py index 87b3c55df..4a49944c2 100644 --- a/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py @@ -51,7 +51,10 @@ async def _should_skip_file( file_id = file.get("id", "") file_name = file.get("name", "Unknown") - if skip_item(file): + skip, unsup_ext = skip_item(file) + if skip: + if unsup_ext: + return True, f"unsupported:{unsup_ext}" return True, "folder/non-downloadable" if not file_id: return True, "missing file_id" @@ -251,6 +254,121 @@ async def _download_and_index( return batch_indexed, download_failed + batch_failed +async def _remove_document(session: AsyncSession, file_id: str, search_space_id: int): + """Remove a document that was deleted in Dropbox.""" + primary_hash = compute_identifier_hash( + DocumentType.DROPBOX_FILE.value, file_id, search_space_id + ) + existing = await check_document_by_unique_identifier(session, primary_hash) + + if not existing: + result = await session.execute( + 
select(Document).where( + Document.search_space_id == search_space_id, + Document.document_type == DocumentType.DROPBOX_FILE, + cast(Document.document_metadata["dropbox_file_id"], String) == file_id, + ) + ) + existing = result.scalar_one_or_none() + + if existing: + await session.delete(existing) + + +async def _index_with_delta_sync( + dropbox_client: DropboxClient, + session: AsyncSession, + connector_id: int, + search_space_id: int, + user_id: str, + cursor: str, + task_logger: TaskLoggingService, + log_entry: object, + max_files: int, + on_heartbeat_callback: HeartbeatCallbackType | None = None, + enable_summary: bool = True, +) -> tuple[int, int, int, str]: + """Delta sync using Dropbox cursor-based change tracking. + + Returns (indexed_count, skipped_count, unsupported_count, new_cursor). + """ + await task_logger.log_task_progress( + log_entry, + f"Starting delta sync from cursor: {cursor[:20]}...", + {"stage": "delta_sync", "cursor_prefix": cursor[:20]}, + ) + + entries, new_cursor, error = await dropbox_client.get_changes(cursor) + if error: + err_lower = error.lower() + if "401" in error or "authentication expired" in err_lower: + raise Exception( + f"Dropbox authentication failed. Please re-authenticate. 
(Error: {error})" + ) + raise Exception(f"Failed to fetch Dropbox changes: {error}") + + if not entries: + logger.info("No changes detected since last sync") + return 0, 0, 0, new_cursor or cursor + + logger.info(f"Processing {len(entries)} change entries") + + renamed_count = 0 + skipped = 0 + unsupported_count = 0 + files_to_download: list[dict] = [] + files_processed = 0 + + for entry in entries: + if files_processed >= max_files: + break + files_processed += 1 + + tag = entry.get(".tag") + + if tag == "deleted": + path_lower = entry.get("path_lower", "") + name = entry.get("name", "") + file_id = entry.get("id", "") + if file_id: + await _remove_document(session, file_id, search_space_id) + logger.debug(f"Processed deletion: {name or path_lower}") + continue + + if tag != "file": + continue + + skip, msg = await _should_skip_file(session, entry, search_space_id) + if skip: + if msg and msg.startswith("unsupported:"): + unsupported_count += 1 + elif msg and "renamed" in msg.lower(): + renamed_count += 1 + else: + skipped += 1 + continue + + files_to_download.append(entry) + + batch_indexed, failed = await _download_and_index( + dropbox_client, + session, + files_to_download, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + enable_summary=enable_summary, + on_heartbeat=on_heartbeat_callback, + ) + + indexed = renamed_count + batch_indexed + logger.info( + f"Delta sync complete: {indexed} indexed, {skipped} skipped, " + f"{unsupported_count} unsupported, {failed} failed" + ) + return indexed, skipped, unsupported_count, new_cursor or cursor + + async def _index_full_scan( dropbox_client: DropboxClient, session: AsyncSession, @@ -266,8 +384,11 @@ async def _index_full_scan( incremental_sync: bool = True, on_heartbeat_callback: HeartbeatCallbackType | None = None, enable_summary: bool = True, -) -> tuple[int, int]: - """Full scan indexing of a folder.""" +) -> tuple[int, int, int]: + """Full scan indexing of a folder. 
+ + Returns (indexed, skipped, unsupported_count). + """ await task_logger.log_task_progress( log_entry, f"Starting full scan of folder: {folder_name}", @@ -287,6 +408,7 @@ async def _index_full_scan( renamed_count = 0 skipped = 0 + unsupported_count = 0 files_to_download: list[dict] = [] all_files, error = await get_files_in_folder( @@ -306,14 +428,21 @@ async def _index_full_scan( if incremental_sync: skip, msg = await _should_skip_file(session, file, search_space_id) if skip: - if msg and "renamed" in msg.lower(): + if msg and msg.startswith("unsupported:"): + unsupported_count += 1 + elif msg and "renamed" in msg.lower(): renamed_count += 1 else: skipped += 1 continue - elif skip_item(file): - skipped += 1 - continue + else: + item_skip, item_unsup = skip_item(file) + if item_skip: + if item_unsup: + unsupported_count += 1 + else: + skipped += 1 + continue file_pages = PageLimitService.estimate_pages_from_metadata( file.get("name", ""), file.get("size") @@ -352,9 +481,10 @@ async def _index_full_scan( indexed = renamed_count + batch_indexed logger.info( - f"Full scan complete: {indexed} indexed, {skipped} skipped, {failed} failed" + f"Full scan complete: {indexed} indexed, {skipped} skipped, " + f"{unsupported_count} unsupported, {failed} failed" ) - return indexed, skipped + return indexed, skipped, unsupported_count async def _index_selected_files( @@ -368,7 +498,7 @@ async def _index_selected_files( enable_summary: bool, incremental_sync: bool = True, on_heartbeat: HeartbeatCallbackType | None = None, -) -> tuple[int, int, list[str]]: +) -> tuple[int, int, int, list[str]]: """Index user-selected files using the parallel pipeline.""" page_limit_service = PageLimitService(session) pages_used, pages_limit = await page_limit_service.get_page_usage(user_id) @@ -379,6 +509,7 @@ async def _index_selected_files( errors: list[str] = [] renamed_count = 0 skipped = 0 + unsupported_count = 0 for file_path, file_name in file_paths: file, error = await 
get_file_by_path(dropbox_client, file_path) @@ -390,14 +521,21 @@ async def _index_selected_files( if incremental_sync: skip, msg = await _should_skip_file(session, file, search_space_id) if skip: - if msg and "renamed" in msg.lower(): + if msg and msg.startswith("unsupported:"): + unsupported_count += 1 + elif msg and "renamed" in msg.lower(): renamed_count += 1 else: skipped += 1 continue - elif skip_item(file): - skipped += 1 - continue + else: + item_skip, item_unsup = skip_item(file) + if item_skip: + if item_unsup: + unsupported_count += 1 + else: + skipped += 1 + continue file_pages = PageLimitService.estimate_pages_from_metadata( file.get("name", ""), file.get("size") @@ -429,7 +567,7 @@ async def _index_selected_files( user_id, pages_to_deduct, allow_exceed=True ) - return renamed_count + batch_indexed, skipped, errors + return renamed_count + batch_indexed, skipped, unsupported_count, errors async def index_dropbox_files( @@ -438,7 +576,7 @@ async def index_dropbox_files( search_space_id: int, user_id: str, items_dict: dict, -) -> tuple[int, int, str | None]: +) -> tuple[int, int, str | None, int]: """Index Dropbox files for a specific connector. 
items_dict format: @@ -469,7 +607,7 @@ async def index_dropbox_files( await task_logger.log_task_failure( log_entry, error_msg, None, {"error_type": "ConnectorNotFound"} ) - return 0, 0, error_msg + return 0, 0, error_msg, 0 token_encrypted = connector.config.get("_token_encrypted", False) if token_encrypted and not config.SECRET_KEY: @@ -480,7 +618,7 @@ async def index_dropbox_files( "Missing SECRET_KEY", {"error_type": "MissingSecretKey"}, ) - return 0, 0, error_msg + return 0, 0, error_msg, 0 connector_enable_summary = getattr(connector, "enable_summary", True) dropbox_client = DropboxClient(session, connector_id) @@ -489,9 +627,13 @@ async def index_dropbox_files( max_files = indexing_options.get("max_files", 500) incremental_sync = indexing_options.get("incremental_sync", True) include_subfolders = indexing_options.get("include_subfolders", True) + use_delta_sync = indexing_options.get("use_delta_sync", True) + + folder_cursors: dict = connector.config.get("folder_cursors", {}) total_indexed = 0 total_skipped = 0 + total_unsupported = 0 selected_files = items_dict.get("files", []) if selected_files: @@ -499,7 +641,7 @@ async def index_dropbox_files( (f.get("path", f.get("path_lower", f.get("id", ""))), f.get("name")) for f in selected_files ] - indexed, skipped, file_errors = await _index_selected_files( + indexed, skipped, unsupported, file_errors = await _index_selected_files( dropbox_client, session, file_tuples, @@ -511,6 +653,7 @@ async def index_dropbox_files( ) total_indexed += indexed total_skipped += skipped + total_unsupported += unsupported if file_errors: logger.warning( f"File indexing errors for connector {connector_id}: {file_errors}" @@ -523,25 +666,66 @@ async def index_dropbox_files( ) folder_name = folder.get("name", "Root") - logger.info(f"Using full scan for folder {folder_name}") - indexed, skipped = await _index_full_scan( - dropbox_client, - session, - connector_id, - search_space_id, - user_id, - folder_path, - folder_name, - 
task_logger, - log_entry, - max_files, - include_subfolders, - incremental_sync=incremental_sync, - enable_summary=connector_enable_summary, + saved_cursor = folder_cursors.get(folder_path) + can_use_delta = ( + use_delta_sync and saved_cursor and connector.last_indexed_at ) + + if can_use_delta: + logger.info(f"Using delta sync for folder {folder_name}") + indexed, skipped, unsup, new_cursor = await _index_with_delta_sync( + dropbox_client, + session, + connector_id, + search_space_id, + user_id, + saved_cursor, + task_logger, + log_entry, + max_files, + enable_summary=connector_enable_summary, + ) + folder_cursors[folder_path] = new_cursor + total_unsupported += unsup + else: + logger.info(f"Using full scan for folder {folder_name}") + indexed, skipped, unsup = await _index_full_scan( + dropbox_client, + session, + connector_id, + search_space_id, + user_id, + folder_path, + folder_name, + task_logger, + log_entry, + max_files, + include_subfolders, + incremental_sync=incremental_sync, + enable_summary=connector_enable_summary, + ) + total_unsupported += unsup + total_indexed += indexed total_skipped += skipped + # Persist latest cursor for this folder + try: + latest_cursor, cursor_err = await dropbox_client.get_latest_cursor( + folder_path + ) + if latest_cursor and not cursor_err: + folder_cursors[folder_path] = latest_cursor + except Exception as e: + logger.warning(f"Failed to get latest cursor for {folder_path}: {e}") + + # Persist folder cursors to connector config + if folders: + cfg = dict(connector.config) + cfg["folder_cursors"] = folder_cursors + connector.config = cfg + flag_modified(connector, "config") + if total_indexed > 0 or folders: await update_connector_last_indexed(session, connector, True) @@ -550,12 +734,18 @@ async def index_dropbox_files( await task_logger.log_task_success( log_entry, f"Successfully completed Dropbox indexing for connector {connector_id}", - {"files_processed": total_indexed, "files_skipped": total_skipped}, + { + 
"files_processed": total_indexed, + "files_skipped": total_skipped, + "files_unsupported": total_unsupported, + }, ) logger.info( - f"Dropbox indexing completed: {total_indexed} indexed, {total_skipped} skipped" + f"Dropbox indexing completed: {total_indexed} indexed, " + f"{total_skipped} skipped, {total_unsupported} unsupported" ) - return total_indexed, total_skipped, None + + return total_indexed, total_skipped, None, total_unsupported except SQLAlchemyError as db_error: await session.rollback() @@ -566,7 +756,7 @@ async def index_dropbox_files( {"error_type": "SQLAlchemyError"}, ) logger.error(f"Database error: {db_error!s}", exc_info=True) - return 0, 0, f"Database error: {db_error!s}" + return 0, 0, f"Database error: {db_error!s}", 0 except Exception as e: await session.rollback() await task_logger.log_task_failure( @@ -576,4 +766,4 @@ async def index_dropbox_files( {"error_type": type(e).__name__}, ) logger.error(f"Failed to index Dropbox files: {e!s}", exc_info=True) - return 0, 0, f"Failed to index Dropbox files: {e!s}" + return 0, 0, f"Failed to index Dropbox files: {e!s}", 0 diff --git a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py index 5e9e0f62f..b11087fe6 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py @@ -25,7 +25,11 @@ from app.connectors.google_drive import ( get_files_in_folder, get_start_page_token, ) -from app.connectors.google_drive.file_types import should_skip_file as skip_mime +from app.connectors.google_drive.file_types import ( + is_google_workspace_file, + should_skip_by_extension, + should_skip_file as skip_mime, +) from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.indexing_pipeline.connector_document import ConnectorDocument from app.indexing_pipeline.document_hashing import 
compute_identifier_hash @@ -78,6 +82,10 @@ async def _should_skip_file( if skip_mime(mime_type): return True, "folder/shortcut" + if not is_google_workspace_file(mime_type): + ext_skip, unsup_ext = should_skip_by_extension(file_name) + if ext_skip: + return True, f"unsupported:{unsup_ext}" if not file_id: return True, "missing file_id" @@ -468,13 +476,13 @@ async def _index_selected_files( user_id: str, enable_summary: bool, on_heartbeat: HeartbeatCallbackType | None = None, -) -> tuple[int, int, list[str]]: +) -> tuple[int, int, int, list[str]]: """Index user-selected files using the parallel pipeline. Phase 1 (serial): fetch metadata + skip checks. Phase 2+3 (parallel): download, ETL, index via _download_and_index. - Returns (indexed_count, skipped_count, errors). + Returns (indexed_count, skipped_count, unsupported_count, errors). """ page_limit_service = PageLimitService(session) pages_used, pages_limit = await page_limit_service.get_page_usage(user_id) @@ -485,6 +493,7 @@ async def _index_selected_files( errors: list[str] = [] renamed_count = 0 skipped = 0 + unsupported_count = 0 for file_id, file_name in file_ids: file, error = await get_file_by_id(drive_client, file_id) @@ -495,7 +504,9 @@ async def _index_selected_files( skip, msg = await _should_skip_file(session, file, search_space_id) if skip: - if msg and "renamed" in msg.lower(): + if msg and msg.startswith("unsupported:"): + unsupported_count += 1 + elif msg and "renamed" in msg.lower(): renamed_count += 1 else: skipped += 1 @@ -539,7 +550,7 @@ async def _index_selected_files( user_id, pages_to_deduct, allow_exceed=True ) - return renamed_count + batch_indexed, skipped, errors + return renamed_count + batch_indexed, skipped, unsupported_count, errors # --------------------------------------------------------------------------- @@ -562,8 +573,11 @@ async def _index_full_scan( include_subfolders: bool = False, on_heartbeat_callback: HeartbeatCallbackType | None = None, enable_summary: bool = True, -) -> 
tuple[int, int]: - """Full scan indexing of a folder.""" +) -> tuple[int, int, int]: + """Full scan indexing of a folder. + + Returns (indexed, skipped, unsupported_count). + """ await task_logger.log_task_progress( log_entry, f"Starting full scan of folder: {folder_name} (include_subfolders={include_subfolders})", @@ -585,6 +599,7 @@ async def _index_full_scan( renamed_count = 0 skipped = 0 + unsupported_count = 0 files_processed = 0 files_to_download: list[dict] = [] folders_to_process = [(folder_id, folder_name)] @@ -625,7 +640,9 @@ async def _index_full_scan( skip, msg = await _should_skip_file(session, file, search_space_id) if skip: - if msg and "renamed" in msg.lower(): + if msg and msg.startswith("unsupported:"): + unsupported_count += 1 + elif msg and "renamed" in msg.lower(): renamed_count += 1 else: skipped += 1 @@ -698,9 +715,10 @@ async def _index_full_scan( indexed = renamed_count + batch_indexed logger.info( - f"Full scan complete: {indexed} indexed, {skipped} skipped, {failed} failed" + f"Full scan complete: {indexed} indexed, {skipped} skipped, " + f"{unsupported_count} unsupported, {failed} failed" ) - return indexed, skipped + return indexed, skipped, unsupported_count async def _index_with_delta_sync( @@ -718,8 +736,11 @@ async def _index_with_delta_sync( include_subfolders: bool = False, on_heartbeat_callback: HeartbeatCallbackType | None = None, enable_summary: bool = True, -) -> tuple[int, int]: - """Delta sync using change tracking.""" +) -> tuple[int, int, int]: + """Delta sync using change tracking. + + Returns (indexed, skipped, unsupported_count). 
+ """ await task_logger.log_task_progress( log_entry, f"Starting delta sync from token: {start_page_token[:20]}...", @@ -739,7 +760,7 @@ async def _index_with_delta_sync( if not changes: logger.info("No changes detected since last sync") - return 0, 0 + return 0, 0, 0 logger.info(f"Processing {len(changes)} changes") @@ -754,6 +775,7 @@ async def _index_with_delta_sync( renamed_count = 0 skipped = 0 + unsupported_count = 0 files_to_download: list[dict] = [] files_processed = 0 @@ -775,7 +797,9 @@ async def _index_with_delta_sync( skip, msg = await _should_skip_file(session, file, search_space_id) if skip: - if msg and "renamed" in msg.lower(): + if msg and msg.startswith("unsupported:"): + unsupported_count += 1 + elif msg and "renamed" in msg.lower(): renamed_count += 1 else: skipped += 1 @@ -832,9 +856,10 @@ async def _index_with_delta_sync( indexed = renamed_count + batch_indexed logger.info( - f"Delta sync complete: {indexed} indexed, {skipped} skipped, {failed} failed" + f"Delta sync complete: {indexed} indexed, {skipped} skipped, " + f"{unsupported_count} unsupported, {failed} failed" ) - return indexed, skipped + return indexed, skipped, unsupported_count # --------------------------------------------------------------------------- @@ -854,8 +879,11 @@ async def index_google_drive_files( max_files: int = 500, include_subfolders: bool = False, on_heartbeat_callback: HeartbeatCallbackType | None = None, -) -> tuple[int, int, str | None]: - """Index Google Drive files for a specific connector.""" +) -> tuple[int, int, str | None, int]: + """Index Google Drive files for a specific connector. + + Returns (indexed, skipped, error_or_none, unsupported_count). 
+ """ task_logger = TaskLoggingService(session, search_space_id) log_entry = await task_logger.log_task_start( task_name="google_drive_files_indexing", @@ -881,7 +909,7 @@ async def index_google_drive_files( await task_logger.log_task_failure( log_entry, error_msg, None, {"error_type": "ConnectorNotFound"} ) - return 0, 0, error_msg + return 0, 0, error_msg, 0 await task_logger.log_task_progress( log_entry, @@ -900,7 +928,7 @@ async def index_google_drive_files( "Missing Composio account", {"error_type": "MissingComposioAccount"}, ) - return 0, 0, error_msg + return 0, 0, error_msg, 0 pre_built_credentials = build_composio_credentials(connected_account_id) else: token_encrypted = connector.config.get("_token_encrypted", False) @@ -915,6 +943,7 @@ async def index_google_drive_files( 0, 0, "SECRET_KEY not configured but credentials are marked as encrypted", + 0, ) connector_enable_summary = getattr(connector, "enable_summary", True) @@ -927,7 +956,7 @@ async def index_google_drive_files( await task_logger.log_task_failure( log_entry, error_msg, {"error_type": "MissingParameter"} ) - return 0, 0, error_msg + return 0, 0, error_msg, 0 target_folder_id = folder_id target_folder_name = folder_name or "Selected Folder" @@ -938,9 +967,11 @@ async def index_google_drive_files( use_delta_sync and start_page_token and connector.last_indexed_at ) + documents_unsupported = 0 + if can_use_delta: logger.info(f"Using delta sync for connector {connector_id}") - documents_indexed, documents_skipped = await _index_with_delta_sync( + documents_indexed, documents_skipped, du = await _index_with_delta_sync( drive_client, session, connector, @@ -956,8 +987,9 @@ async def index_google_drive_files( on_heartbeat_callback, connector_enable_summary, ) + documents_unsupported += du logger.info("Running reconciliation scan after delta sync") - ri, rs = await _index_full_scan( + ri, rs, ru = await _index_full_scan( drive_client, session, connector, @@ -975,9 +1007,14 @@ async def 
index_google_drive_files( ) documents_indexed += ri documents_skipped += rs + documents_unsupported += ru else: logger.info(f"Using full scan for connector {connector_id}") - documents_indexed, documents_skipped = await _index_full_scan( + ( + documents_indexed, + documents_skipped, + documents_unsupported, + ) = await _index_full_scan( drive_client, session, connector, @@ -1012,14 +1049,17 @@ async def index_google_drive_files( { "files_processed": documents_indexed, "files_skipped": documents_skipped, + "files_unsupported": documents_unsupported, "sync_type": "delta" if can_use_delta else "full", "folder": target_folder_name, }, ) logger.info( - f"Google Drive indexing completed: {documents_indexed} indexed, {documents_skipped} skipped" + f"Google Drive indexing completed: {documents_indexed} indexed, " + f"{documents_skipped} skipped, {documents_unsupported} unsupported" ) - return documents_indexed, documents_skipped, None + + return documents_indexed, documents_skipped, None, documents_unsupported except SQLAlchemyError as db_error: await session.rollback() @@ -1030,7 +1070,7 @@ async def index_google_drive_files( {"error_type": "SQLAlchemyError"}, ) logger.error(f"Database error: {db_error!s}", exc_info=True) - return 0, 0, f"Database error: {db_error!s}" + return 0, 0, f"Database error: {db_error!s}", 0 except Exception as e: await session.rollback() await task_logger.log_task_failure( @@ -1040,7 +1080,7 @@ async def index_google_drive_files( {"error_type": type(e).__name__}, ) logger.error(f"Failed to index Google Drive files: {e!s}", exc_info=True) - return 0, 0, f"Failed to index Google Drive files: {e!s}" + return 0, 0, f"Failed to index Google Drive files: {e!s}", 0 async def index_google_drive_single_file( @@ -1242,7 +1282,7 @@ async def index_google_drive_selected_files( session, connector_id, credentials=pre_built_credentials ) - indexed, skipped, errors = await _index_selected_files( + indexed, skipped, unsupported, errors = await 
_index_selected_files( drive_client, session, files, @@ -1253,6 +1293,11 @@ async def index_google_drive_selected_files( on_heartbeat=on_heartbeat_callback, ) + if unsupported > 0: + file_text = "file was" if unsupported == 1 else "files were" + unsup_msg = f"{unsupported} {file_text} not supported" + errors.append(unsup_msg) + await session.commit() if errors: @@ -1260,7 +1305,12 @@ async def index_google_drive_selected_files( log_entry, f"Batch file indexing completed with {len(errors)} error(s)", "; ".join(errors), - {"indexed": indexed, "skipped": skipped, "error_count": len(errors)}, + { + "indexed": indexed, + "skipped": skipped, + "unsupported": unsupported, + "error_count": len(errors), + }, ) else: await task_logger.log_task_success( diff --git a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py index fa50e86d3..7f42f4638 100644 --- a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py @@ -23,7 +23,6 @@ from sqlalchemy import select from sqlalchemy.exc import IntegrityError, SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession -from app.config import config from app.db import ( Document, DocumentStatus, @@ -44,132 +43,6 @@ from .base import ( logger, ) -PLAINTEXT_EXTENSIONS = frozenset( - { - ".md", - ".markdown", - ".txt", - ".text", - ".json", - ".jsonl", - ".yaml", - ".yml", - ".toml", - ".ini", - ".cfg", - ".conf", - ".xml", - ".css", - ".scss", - ".less", - ".sass", - ".py", - ".pyw", - ".pyi", - ".pyx", - ".js", - ".jsx", - ".ts", - ".tsx", - ".mjs", - ".cjs", - ".java", - ".kt", - ".kts", - ".scala", - ".groovy", - ".c", - ".h", - ".cpp", - ".cxx", - ".cc", - ".hpp", - ".hxx", - ".cs", - ".fs", - ".fsx", - ".go", - ".rs", - ".rb", - ".php", - ".pl", - ".pm", - ".lua", - ".swift", - ".m", - ".mm", - ".r", - ".R", - ".jl", - ".sh", - ".bash", - ".zsh", - 
".fish", - ".bat", - ".cmd", - ".ps1", - ".sql", - ".graphql", - ".gql", - ".env", - ".gitignore", - ".dockerignore", - ".editorconfig", - ".makefile", - ".cmake", - ".log", - ".rst", - ".tex", - ".bib", - ".org", - ".adoc", - ".asciidoc", - ".vue", - ".svelte", - ".astro", - ".tf", - ".hcl", - ".proto", - } -) - -AUDIO_EXTENSIONS = frozenset( - { - ".mp3", - ".mp4", - ".mpeg", - ".mpga", - ".m4a", - ".wav", - ".webm", - } -) - - -DIRECT_CONVERT_EXTENSIONS = frozenset({".csv", ".tsv", ".html", ".htm"}) - - -def _is_plaintext_file(filename: str) -> bool: - return Path(filename).suffix.lower() in PLAINTEXT_EXTENSIONS - - -def _is_audio_file(filename: str) -> bool: - return Path(filename).suffix.lower() in AUDIO_EXTENSIONS - - -def _is_direct_convert_file(filename: str) -> bool: - return Path(filename).suffix.lower() in DIRECT_CONVERT_EXTENSIONS - - -def _needs_etl(filename: str) -> bool: - """File is not plaintext, not audio, and not direct-convert — requires ETL.""" - return ( - not _is_plaintext_file(filename) - and not _is_audio_file(filename) - and not _is_direct_convert_file(filename) - ) - - HeartbeatCallbackType = Callable[[int], Awaitable[None]] @@ -279,57 +152,19 @@ def scan_folder( return files -def _read_plaintext_file(file_path: str) -> str: - """Read a plaintext/text-based file as UTF-8.""" - with open(file_path, encoding="utf-8", errors="replace") as f: - content = f.read() - if "\x00" in content: - raise ValueError( - f"File contains null bytes — likely a binary file opened as text: {file_path}" - ) - return content - - async def _read_file_content(file_path: str, filename: str) -> str: - """Read file content, using ETL for binary formats. + """Read file content via the unified ETL pipeline. - Plaintext files are read directly. Audio and document files (PDF, DOCX, etc.) - are routed through the configured ETL service (same as Google Drive / OneDrive). - - Raises ValueError if the file cannot be parsed (e.g. 
no ETL service configured - for a binary file). + All file types (plaintext, audio, direct-convert, document) are handled + by ``EtlPipelineService``. """ - if _is_plaintext_file(filename): - return _read_plaintext_file(file_path) + from app.etl_pipeline.etl_document import EtlRequest + from app.etl_pipeline.etl_pipeline_service import EtlPipelineService - if _is_direct_convert_file(filename): - from app.tasks.document_processors._direct_converters import ( - convert_file_directly, - ) - - return convert_file_directly(file_path, filename) - - if _is_audio_file(filename): - etl_service = config.ETL_SERVICE if hasattr(config, "ETL_SERVICE") else None - stt_service_val = config.STT_SERVICE if hasattr(config, "STT_SERVICE") else None - if not stt_service_val and not etl_service: - raise ValueError( - f"No STT_SERVICE configured — cannot transcribe audio file: {filename}" - ) - - if _needs_etl(filename): - etl_service = getattr(config, "ETL_SERVICE", None) - if not etl_service: - raise ValueError( - f"No ETL_SERVICE configured — cannot parse binary file: {filename}. 
" - f"Set ETL_SERVICE to UNSTRUCTURED, LLAMACLOUD, or DOCLING in your .env" - ) - - from app.connectors.onedrive.content_extractor import ( - _parse_file_to_markdown, + result = await EtlPipelineService().extract( + EtlRequest(file_path=file_path, filename=filename) ) - - return await _parse_file_to_markdown(file_path, filename) + return result.markdown_content def _content_hash(content: str, search_space_id: int) -> str: diff --git a/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py index 2301b6260..06517f542 100644 --- a/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py @@ -56,7 +56,10 @@ async def _should_skip_file( file_id = file.get("id") file_name = file.get("name", "Unknown") - if skip_item(file): + skip, unsup_ext = skip_item(file) + if skip: + if unsup_ext: + return True, f"unsupported:{unsup_ext}" return True, "folder/onenote/remote" if not file_id: return True, "missing file_id" @@ -290,7 +293,7 @@ async def _index_selected_files( user_id: str, enable_summary: bool, on_heartbeat: HeartbeatCallbackType | None = None, -) -> tuple[int, int, list[str]]: +) -> tuple[int, int, int, list[str]]: """Index user-selected files using the parallel pipeline.""" page_limit_service = PageLimitService(session) pages_used, pages_limit = await page_limit_service.get_page_usage(user_id) @@ -301,6 +304,7 @@ async def _index_selected_files( errors: list[str] = [] renamed_count = 0 skipped = 0 + unsupported_count = 0 for file_id, file_name in file_ids: file, error = await get_file_by_id(onedrive_client, file_id) @@ -311,7 +315,9 @@ async def _index_selected_files( skip, msg = await _should_skip_file(session, file, search_space_id) if skip: - if msg and "renamed" in msg.lower(): + if msg and msg.startswith("unsupported:"): + unsupported_count += 1 + elif msg and "renamed" in msg.lower(): renamed_count += 
1 else: skipped += 1 @@ -347,7 +353,7 @@ async def _index_selected_files( user_id, pages_to_deduct, allow_exceed=True ) - return renamed_count + batch_indexed, skipped, errors + return renamed_count + batch_indexed, skipped, unsupported_count, errors # --------------------------------------------------------------------------- @@ -369,8 +375,11 @@ async def _index_full_scan( include_subfolders: bool = True, on_heartbeat_callback: HeartbeatCallbackType | None = None, enable_summary: bool = True, -) -> tuple[int, int]: - """Full scan indexing of a folder.""" +) -> tuple[int, int, int]: + """Full scan indexing of a folder. + + Returns (indexed, skipped, unsupported_count). + """ await task_logger.log_task_progress( log_entry, f"Starting full scan of folder: {folder_name}", @@ -389,6 +398,7 @@ async def _index_full_scan( renamed_count = 0 skipped = 0 + unsupported_count = 0 files_to_download: list[dict] = [] all_files, error = await get_files_in_folder( @@ -407,7 +417,9 @@ async def _index_full_scan( for file in all_files[:max_files]: skip, msg = await _should_skip_file(session, file, search_space_id) if skip: - if msg and "renamed" in msg.lower(): + if msg and msg.startswith("unsupported:"): + unsupported_count += 1 + elif msg and "renamed" in msg.lower(): renamed_count += 1 else: skipped += 1 @@ -450,9 +462,10 @@ async def _index_full_scan( indexed = renamed_count + batch_indexed logger.info( - f"Full scan complete: {indexed} indexed, {skipped} skipped, {failed} failed" + f"Full scan complete: {indexed} indexed, {skipped} skipped, " + f"{unsupported_count} unsupported, {failed} failed" ) - return indexed, skipped + return indexed, skipped, unsupported_count async def _index_with_delta_sync( @@ -468,8 +481,11 @@ async def _index_with_delta_sync( max_files: int, on_heartbeat_callback: HeartbeatCallbackType | None = None, enable_summary: bool = True, -) -> tuple[int, int, str | None]: - """Delta sync using OneDrive change tracking. 
Returns (indexed, skipped, new_delta_link).""" +) -> tuple[int, int, int, str | None]: + """Delta sync using OneDrive change tracking. + + Returns (indexed, skipped, unsupported_count, new_delta_link). + """ await task_logger.log_task_progress( log_entry, "Starting delta sync", @@ -489,7 +505,7 @@ async def _index_with_delta_sync( if not changes: logger.info("No changes detected since last sync") - return 0, 0, new_delta_link + return 0, 0, 0, new_delta_link logger.info(f"Processing {len(changes)} delta changes") @@ -501,6 +517,7 @@ async def _index_with_delta_sync( renamed_count = 0 skipped = 0 + unsupported_count = 0 files_to_download: list[dict] = [] files_processed = 0 @@ -523,7 +540,9 @@ async def _index_with_delta_sync( skip, msg = await _should_skip_file(session, change, search_space_id) if skip: - if msg and "renamed" in msg.lower(): + if msg and msg.startswith("unsupported:"): + unsupported_count += 1 + elif msg and "renamed" in msg.lower(): renamed_count += 1 else: skipped += 1 @@ -566,9 +585,10 @@ async def _index_with_delta_sync( indexed = renamed_count + batch_indexed logger.info( - f"Delta sync complete: {indexed} indexed, {skipped} skipped, {failed} failed" + f"Delta sync complete: {indexed} indexed, {skipped} skipped, " + f"{unsupported_count} unsupported, {failed} failed" ) - return indexed, skipped, new_delta_link + return indexed, skipped, unsupported_count, new_delta_link # --------------------------------------------------------------------------- @@ -582,7 +602,7 @@ async def index_onedrive_files( search_space_id: int, user_id: str, items_dict: dict, -) -> tuple[int, int, str | None]: +) -> tuple[int, int, str | None, int]: """Index OneDrive files for a specific connector. 
items_dict format: @@ -609,7 +629,7 @@ async def index_onedrive_files( await task_logger.log_task_failure( log_entry, error_msg, None, {"error_type": "ConnectorNotFound"} ) - return 0, 0, error_msg + return 0, 0, error_msg, 0 token_encrypted = connector.config.get("_token_encrypted", False) if token_encrypted and not config.SECRET_KEY: @@ -620,7 +640,7 @@ async def index_onedrive_files( "Missing SECRET_KEY", {"error_type": "MissingSecretKey"}, ) - return 0, 0, error_msg + return 0, 0, error_msg, 0 connector_enable_summary = getattr(connector, "enable_summary", True) onedrive_client = OneDriveClient(session, connector_id) @@ -632,12 +652,13 @@ async def index_onedrive_files( total_indexed = 0 total_skipped = 0 + total_unsupported = 0 # Index selected individual files selected_files = items_dict.get("files", []) if selected_files: file_tuples = [(f["id"], f.get("name")) for f in selected_files] - indexed, skipped, _errors = await _index_selected_files( + indexed, skipped, unsupported, _errors = await _index_selected_files( onedrive_client, session, file_tuples, @@ -648,6 +669,7 @@ async def index_onedrive_files( ) total_indexed += indexed total_skipped += skipped + total_unsupported += unsupported # Index selected folders folders = items_dict.get("folders", []) @@ -661,7 +683,7 @@ async def index_onedrive_files( if can_use_delta: logger.info(f"Using delta sync for folder {folder_name}") - indexed, skipped, new_delta_link = await _index_with_delta_sync( + indexed, skipped, unsup, new_delta_link = await _index_with_delta_sync( onedrive_client, session, connector_id, @@ -676,6 +698,7 @@ async def index_onedrive_files( ) total_indexed += indexed total_skipped += skipped + total_unsupported += unsup if new_delta_link: await session.refresh(connector) @@ -685,7 +708,7 @@ async def index_onedrive_files( flag_modified(connector, "config") # Reconciliation full scan - ri, rs = await _index_full_scan( + ri, rs, ru = await _index_full_scan( onedrive_client, session, 
connector_id, @@ -701,9 +724,10 @@ async def index_onedrive_files( ) total_indexed += ri total_skipped += rs + total_unsupported += ru else: logger.info(f"Using full scan for folder {folder_name}") - indexed, skipped = await _index_full_scan( + indexed, skipped, unsup = await _index_full_scan( onedrive_client, session, connector_id, @@ -719,6 +743,7 @@ async def index_onedrive_files( ) total_indexed += indexed total_skipped += skipped + total_unsupported += unsup # Store new delta link for this folder _, new_delta_link, _ = await onedrive_client.get_delta(folder_id=folder_id) @@ -737,12 +762,18 @@ async def index_onedrive_files( await task_logger.log_task_success( log_entry, f"Successfully completed OneDrive indexing for connector {connector_id}", - {"files_processed": total_indexed, "files_skipped": total_skipped}, + { + "files_processed": total_indexed, + "files_skipped": total_skipped, + "files_unsupported": total_unsupported, + }, ) logger.info( - f"OneDrive indexing completed: {total_indexed} indexed, {total_skipped} skipped" + f"OneDrive indexing completed: {total_indexed} indexed, " + f"{total_skipped} skipped, {total_unsupported} unsupported" ) - return total_indexed, total_skipped, None + + return total_indexed, total_skipped, None, total_unsupported except SQLAlchemyError as db_error: await session.rollback() @@ -753,7 +784,7 @@ async def index_onedrive_files( {"error_type": "SQLAlchemyError"}, ) logger.error(f"Database error: {db_error!s}", exc_info=True) - return 0, 0, f"Database error: {db_error!s}" + return 0, 0, f"Database error: {db_error!s}", 0 except Exception as e: await session.rollback() await task_logger.log_task_failure( @@ -763,4 +794,4 @@ async def index_onedrive_files( {"error_type": type(e).__name__}, ) logger.error(f"Failed to index OneDrive files: {e!s}", exc_info=True) - return 0, 0, f"Failed to index OneDrive files: {e!s}" + return 0, 0, f"Failed to index OneDrive files: {e!s}", 0 diff --git 
a/surfsense_backend/app/tasks/document_processors/__init__.py b/surfsense_backend/app/tasks/document_processors/__init__.py index 2b5690d02..f82c10883 100644 --- a/surfsense_backend/app/tasks/document_processors/__init__.py +++ b/surfsense_backend/app/tasks/document_processors/__init__.py @@ -1,41 +1,17 @@ """ Document processors module for background tasks. -This module provides a collection of document processors for different content types -and sources. Each processor is responsible for handling a specific type of document -processing task in the background. - -Available processors: -- Extension processor: Handle documents from browser extension -- Markdown processor: Process markdown files -- File processors: Handle files using different ETL services (Unstructured, LlamaCloud, Docling) -- YouTube processor: Process YouTube videos and extract transcripts +Content extraction is handled by ``app.etl_pipeline.EtlPipelineService``. +This package keeps orchestration (save, notify, page-limit) and +non-ETL processors (extension, markdown, youtube). 
""" -# Extension processor -# File processors (backward-compatible re-exports from _save) -from ._save import ( - add_received_file_document_using_docling, - add_received_file_document_using_llamacloud, - add_received_file_document_using_unstructured, -) from .extension_processor import add_extension_received_document - -# Markdown processor from .markdown_processor import add_received_markdown_file_document - -# YouTube processor from .youtube_processor import add_youtube_video_document __all__ = [ - # Extension processing "add_extension_received_document", - # File processing with different ETL services - "add_received_file_document_using_docling", - "add_received_file_document_using_llamacloud", - "add_received_file_document_using_unstructured", - # Markdown file processing "add_received_markdown_file_document", - # YouTube video processing "add_youtube_video_document", ] diff --git a/surfsense_backend/app/tasks/document_processors/_constants.py b/surfsense_backend/app/tasks/document_processors/_constants.py deleted file mode 100644 index f74d7acce..000000000 --- a/surfsense_backend/app/tasks/document_processors/_constants.py +++ /dev/null @@ -1,74 +0,0 @@ -""" -Constants for file document processing. - -Centralizes file type classification, LlamaCloud retry configuration, -and timeout calculation parameters. 
-""" - -import ssl -from enum import Enum - -import httpx - -# --------------------------------------------------------------------------- -# File type classification -# --------------------------------------------------------------------------- - -MARKDOWN_EXTENSIONS = (".md", ".markdown", ".txt") -AUDIO_EXTENSIONS = (".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm") -DIRECT_CONVERT_EXTENSIONS = (".csv", ".tsv", ".html", ".htm") - - -class FileCategory(Enum): - MARKDOWN = "markdown" - AUDIO = "audio" - DIRECT_CONVERT = "direct_convert" - DOCUMENT = "document" - - -def classify_file(filename: str) -> FileCategory: - """Classify a file by its extension into a processing category.""" - lower = filename.lower() - if lower.endswith(MARKDOWN_EXTENSIONS): - return FileCategory.MARKDOWN - if lower.endswith(AUDIO_EXTENSIONS): - return FileCategory.AUDIO - if lower.endswith(DIRECT_CONVERT_EXTENSIONS): - return FileCategory.DIRECT_CONVERT - return FileCategory.DOCUMENT - - -# --------------------------------------------------------------------------- -# LlamaCloud retry configuration -# --------------------------------------------------------------------------- - -LLAMACLOUD_MAX_RETRIES = 5 -LLAMACLOUD_BASE_DELAY = 10 # seconds (exponential backoff base) -LLAMACLOUD_MAX_DELAY = 120 # max delay between retries (2 minutes) -LLAMACLOUD_RETRYABLE_EXCEPTIONS = ( - ssl.SSLError, - httpx.ConnectError, - httpx.ConnectTimeout, - httpx.ReadError, - httpx.ReadTimeout, - httpx.WriteError, - httpx.WriteTimeout, - httpx.RemoteProtocolError, - httpx.LocalProtocolError, - ConnectionError, - ConnectionResetError, - TimeoutError, - OSError, -) - -# --------------------------------------------------------------------------- -# Timeout calculation constants -# --------------------------------------------------------------------------- - -UPLOAD_BYTES_PER_SECOND_SLOW = ( - 100 * 1024 -) # 100 KB/s (conservative for slow connections) -MIN_UPLOAD_TIMEOUT = 120 # Minimum 2 minutes for any 
file -MAX_UPLOAD_TIMEOUT = 1800 # Maximum 30 minutes for very large files -BASE_JOB_TIMEOUT = 600 # 10 minutes base for job processing -PER_PAGE_JOB_TIMEOUT = 60 # 1 minute per page for processing diff --git a/surfsense_backend/app/tasks/document_processors/_direct_converters.py b/surfsense_backend/app/tasks/document_processors/_direct_converters.py index b1a69ef4f..bbff4838e 100644 --- a/surfsense_backend/app/tasks/document_processors/_direct_converters.py +++ b/surfsense_backend/app/tasks/document_processors/_direct_converters.py @@ -4,8 +4,8 @@ Lossless file-to-markdown converters for text-based formats. These converters handle file types that can be faithfully represented as markdown without any external ETL/OCR service: -- CSV / TSV → markdown table (stdlib ``csv``) -- HTML / HTM → markdown (``markdownify``) +- CSV / TSV → markdown table (stdlib ``csv``) +- HTML / HTM / XHTML → markdown (``markdownify``) """ from __future__ import annotations @@ -73,6 +73,7 @@ _CONVERTER_MAP: dict[str, Callable[..., str]] = { ".tsv": tsv_to_markdown, ".html": html_to_markdown, ".htm": html_to_markdown, + ".xhtml": html_to_markdown, } diff --git a/surfsense_backend/app/tasks/document_processors/_etl.py b/surfsense_backend/app/tasks/document_processors/_etl.py deleted file mode 100644 index cc3a8b1ac..000000000 --- a/surfsense_backend/app/tasks/document_processors/_etl.py +++ /dev/null @@ -1,209 +0,0 @@ -""" -ETL parsing strategies for different document processing services. - -Provides parse functions for Unstructured, LlamaCloud, and Docling, along with -LlamaCloud retry logic and dynamic timeout calculations. 
-""" - -import asyncio -import logging -import os -import random -import warnings -from logging import ERROR, getLogger - -import httpx - -from app.config import config as app_config -from app.db import Log -from app.services.task_logging_service import TaskLoggingService - -from ._constants import ( - LLAMACLOUD_BASE_DELAY, - LLAMACLOUD_MAX_DELAY, - LLAMACLOUD_MAX_RETRIES, - LLAMACLOUD_RETRYABLE_EXCEPTIONS, - PER_PAGE_JOB_TIMEOUT, -) -from ._helpers import calculate_job_timeout, calculate_upload_timeout - -# --------------------------------------------------------------------------- -# LlamaCloud parsing with retry -# --------------------------------------------------------------------------- - - -async def parse_with_llamacloud_retry( - file_path: str, - estimated_pages: int, - task_logger: TaskLoggingService | None = None, - log_entry: Log | None = None, -): - """ - Parse a file with LlamaCloud with retry logic for transient SSL/connection errors. - - Uses dynamic timeout calculations based on file size and page count to handle - very large files reliably. 
- - Returns: - LlamaParse result object - - Raises: - Exception: If all retries fail - """ - from llama_cloud_services import LlamaParse - from llama_cloud_services.parse.utils import ResultType - - file_size_bytes = os.path.getsize(file_path) - file_size_mb = file_size_bytes / (1024 * 1024) - - upload_timeout = calculate_upload_timeout(file_size_bytes) - job_timeout = calculate_job_timeout(estimated_pages, file_size_bytes) - - custom_timeout = httpx.Timeout( - connect=120.0, - read=upload_timeout, - write=upload_timeout, - pool=120.0, - ) - - logging.info( - f"LlamaCloud upload configured: file_size={file_size_mb:.1f}MB, " - f"pages={estimated_pages}, upload_timeout={upload_timeout:.0f}s, " - f"job_timeout={job_timeout:.0f}s" - ) - - last_exception = None - attempt_errors: list[str] = [] - - for attempt in range(1, LLAMACLOUD_MAX_RETRIES + 1): - try: - async with httpx.AsyncClient(timeout=custom_timeout) as custom_client: - parser = LlamaParse( - api_key=app_config.LLAMA_CLOUD_API_KEY, - num_workers=1, - verbose=True, - language="en", - result_type=ResultType.MD, - max_timeout=int(max(2000, job_timeout + upload_timeout)), - job_timeout_in_seconds=job_timeout, - job_timeout_extra_time_per_page_in_seconds=PER_PAGE_JOB_TIMEOUT, - custom_client=custom_client, - ) - result = await parser.aparse(file_path) - - if attempt > 1: - logging.info( - f"LlamaCloud upload succeeded on attempt {attempt} after " - f"{len(attempt_errors)} failures" - ) - return result - - except LLAMACLOUD_RETRYABLE_EXCEPTIONS as e: - last_exception = e - error_type = type(e).__name__ - error_msg = str(e)[:200] - attempt_errors.append(f"Attempt {attempt}: {error_type} - {error_msg}") - - if attempt < LLAMACLOUD_MAX_RETRIES: - base_delay = min( - LLAMACLOUD_BASE_DELAY * (2 ** (attempt - 1)), - LLAMACLOUD_MAX_DELAY, - ) - jitter = base_delay * 0.25 * (2 * random.random() - 1) - delay = base_delay + jitter - - if task_logger and log_entry: - await task_logger.log_task_progress( - log_entry, - 
f"LlamaCloud upload failed " - f"(attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}), " - f"retrying in {delay:.0f}s", - { - "error_type": error_type, - "error_message": error_msg, - "attempt": attempt, - "retry_delay": delay, - "file_size_mb": round(file_size_mb, 1), - "upload_timeout": upload_timeout, - }, - ) - else: - logging.warning( - f"LlamaCloud upload failed " - f"(attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}): " - f"{error_type}. File: {file_size_mb:.1f}MB. " - f"Retrying in {delay:.0f}s..." - ) - - await asyncio.sleep(delay) - else: - logging.error( - f"LlamaCloud upload failed after {LLAMACLOUD_MAX_RETRIES} " - f"attempts. File size: {file_size_mb:.1f}MB, " - f"Pages: {estimated_pages}. " - f"Errors: {'; '.join(attempt_errors)}" - ) - - except Exception: - raise - - raise last_exception or RuntimeError( - f"LlamaCloud parsing failed after {LLAMACLOUD_MAX_RETRIES} retries. " - f"File size: {file_size_mb:.1f}MB" - ) - - -# --------------------------------------------------------------------------- -# Per-service parse functions -# --------------------------------------------------------------------------- - - -async def parse_with_unstructured(file_path: str): - """ - Parse a file using the Unstructured ETL service. - - Returns: - List of LangChain Document elements. - """ - from langchain_unstructured import UnstructuredLoader - - loader = UnstructuredLoader( - file_path, - mode="elements", - post_processors=[], - languages=["eng"], - include_orig_elements=False, - include_metadata=False, - strategy="auto", - ) - return await loader.aload() - - -async def parse_with_docling(file_path: str, filename: str) -> str: - """ - Parse a file using the Docling ETL service (via the Docling service wrapper). - - Returns: - Markdown content string. 
- """ - from app.services.docling_service import create_docling_service - - docling_service = create_docling_service() - - pdfminer_logger = getLogger("pdfminer") - original_level = pdfminer_logger.level - - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=UserWarning, module="pdfminer") - warnings.filterwarnings( - "ignore", message=".*Cannot set gray non-stroke color.*" - ) - warnings.filterwarnings("ignore", message=".*invalid float value.*") - pdfminer_logger.setLevel(ERROR) - - try: - result = await docling_service.process_document(file_path, filename) - finally: - pdfminer_logger.setLevel(original_level) - - return result["content"] diff --git a/surfsense_backend/app/tasks/document_processors/_helpers.py b/surfsense_backend/app/tasks/document_processors/_helpers.py index 7ac05932c..9cd7b87c9 100644 --- a/surfsense_backend/app/tasks/document_processors/_helpers.py +++ b/surfsense_backend/app/tasks/document_processors/_helpers.py @@ -11,13 +11,6 @@ from sqlalchemy.ext.asyncio import AsyncSession from app.db import Document, DocumentStatus, DocumentType from app.utils.document_converters import generate_unique_identifier_hash -from ._constants import ( - BASE_JOB_TIMEOUT, - MAX_UPLOAD_TIMEOUT, - MIN_UPLOAD_TIMEOUT, - PER_PAGE_JOB_TIMEOUT, - UPLOAD_BYTES_PER_SECOND_SLOW, -) from .base import ( check_document_by_unique_identifier, check_duplicate_document, @@ -198,21 +191,3 @@ async def update_document_from_connector( if "connector_id" in connector: document.connector_id = connector["connector_id"] await session.commit() - - -# --------------------------------------------------------------------------- -# Timeout calculations -# --------------------------------------------------------------------------- - - -def calculate_upload_timeout(file_size_bytes: int) -> float: - """Calculate upload timeout based on file size (conservative for slow connections).""" - estimated_time = (file_size_bytes / UPLOAD_BYTES_PER_SECOND_SLOW) * 1.5 - return 
max(MIN_UPLOAD_TIMEOUT, min(estimated_time, MAX_UPLOAD_TIMEOUT)) - - -def calculate_job_timeout(estimated_pages: int, file_size_bytes: int) -> float: - """Calculate job processing timeout based on page count and file size.""" - page_based_timeout = BASE_JOB_TIMEOUT + (estimated_pages * PER_PAGE_JOB_TIMEOUT) - size_based_timeout = BASE_JOB_TIMEOUT + (file_size_bytes / (10 * 1024 * 1024)) * 60 - return max(page_based_timeout, size_based_timeout) diff --git a/surfsense_backend/app/tasks/document_processors/_save.py b/surfsense_backend/app/tasks/document_processors/_save.py index 5088ad004..ae45f7a69 100644 --- a/surfsense_backend/app/tasks/document_processors/_save.py +++ b/surfsense_backend/app/tasks/document_processors/_save.py @@ -1,14 +1,9 @@ """ Unified document save/update logic for file processors. - -Replaces the three nearly-identical ``add_received_file_document_using_*`` -functions with a single ``save_file_document`` function plus thin wrappers -for backward compatibility. """ import logging -from langchain_core.documents import Document as LangChainDocument from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession @@ -207,79 +202,3 @@ async def save_file_document( raise RuntimeError( f"Failed to process file document using {etl_service}: {e!s}" ) from e - - -# --------------------------------------------------------------------------- -# Backward-compatible wrapper functions -# --------------------------------------------------------------------------- - - -async def add_received_file_document_using_unstructured( - session: AsyncSession, - file_name: str, - unstructured_processed_elements: list[LangChainDocument], - search_space_id: int, - user_id: str, - connector: dict | None = None, - enable_summary: bool = True, -) -> Document | None: - """Process and store a file document using the Unstructured service.""" - from app.utils.document_converters import convert_document_to_markdown - - markdown_content = await 
convert_document_to_markdown( - unstructured_processed_elements - ) - return await save_file_document( - session, - file_name, - markdown_content, - search_space_id, - user_id, - "UNSTRUCTURED", - connector, - enable_summary, - ) - - -async def add_received_file_document_using_llamacloud( - session: AsyncSession, - file_name: str, - llamacloud_markdown_document: str, - search_space_id: int, - user_id: str, - connector: dict | None = None, - enable_summary: bool = True, -) -> Document | None: - """Process and store document content parsed by LlamaCloud.""" - return await save_file_document( - session, - file_name, - llamacloud_markdown_document, - search_space_id, - user_id, - "LLAMACLOUD", - connector, - enable_summary, - ) - - -async def add_received_file_document_using_docling( - session: AsyncSession, - file_name: str, - docling_markdown_document: str, - search_space_id: int, - user_id: str, - connector: dict | None = None, - enable_summary: bool = True, -) -> Document | None: - """Process and store document content parsed by Docling.""" - return await save_file_document( - session, - file_name, - docling_markdown_document, - search_space_id, - user_id, - "DOCLING", - connector, - enable_summary, - ) diff --git a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py index 0c1cad52d..c765dbd87 100644 --- a/surfsense_backend/app/tasks/document_processors/file_processors.py +++ b/surfsense_backend/app/tasks/document_processors/file_processors.py @@ -1,14 +1,8 @@ """ File document processors orchestrating content extraction and indexing. -This module is the public entry point for file processing. 
It delegates to -specialised sub-modules that each own a single concern: - -- ``_constants`` — file type classification and configuration constants -- ``_helpers`` — document deduplication, migration, connector helpers -- ``_direct_converters`` — lossless file-to-markdown for csv/tsv/html -- ``_etl`` — ETL parsing strategies (Unstructured, LlamaCloud, Docling) -- ``_save`` — unified document creation / update logic +Delegates content extraction to ``app.etl_pipeline.EtlPipelineService`` and +keeps only orchestration concerns (notifications, logging, page limits, saving). """ from __future__ import annotations @@ -17,38 +11,19 @@ import contextlib import logging import os from dataclasses import dataclass, field -from logging import ERROR, getLogger from fastapi import HTTPException from sqlalchemy.ext.asyncio import AsyncSession -from app.config import config as app_config from app.db import Document, Log, Notification from app.services.notification_service import NotificationService from app.services.task_logging_service import TaskLoggingService -from ._constants import FileCategory, classify_file -from ._direct_converters import convert_file_directly -from ._etl import ( - parse_with_docling, - parse_with_llamacloud_retry, - parse_with_unstructured, -) from ._helpers import update_document_from_connector -from ._save import ( - add_received_file_document_using_docling, - add_received_file_document_using_llamacloud, - add_received_file_document_using_unstructured, - save_file_document, -) +from ._save import save_file_document from .markdown_processor import add_received_markdown_file_document -# Re-export public API so existing ``from file_processors import …`` keeps working. 
__all__ = [ - "add_received_file_document_using_docling", - "add_received_file_document_using_llamacloud", - "add_received_file_document_using_unstructured", - "parse_with_llamacloud_retry", "process_file_in_background", "process_file_in_background_with_document", "save_file_document", @@ -142,35 +117,31 @@ async def _log_page_divergence( # =================================================================== -async def _process_markdown_upload(ctx: _ProcessingContext) -> Document | None: - """Read a markdown / text file and create or update a document.""" - await _notify(ctx, "parsing", "Reading file") +async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | None: + """Extract content from a non-document file (plaintext/direct_convert/audio) via the unified ETL pipeline.""" + from app.etl_pipeline.etl_document import EtlRequest + from app.etl_pipeline.etl_pipeline_service import EtlPipelineService + + await _notify(ctx, "parsing", "Processing file") await ctx.task_logger.log_task_progress( ctx.log_entry, - f"Processing markdown/text file: {ctx.filename}", - {"file_type": "markdown", "processing_stage": "reading_file"}, + f"Processing file: {ctx.filename}", + {"processing_stage": "extracting"}, ) - with open(ctx.file_path, encoding="utf-8") as f: - markdown_content = f.read() + etl_result = await EtlPipelineService().extract( + EtlRequest(file_path=ctx.file_path, filename=ctx.filename) + ) with contextlib.suppress(Exception): os.unlink(ctx.file_path) await _notify(ctx, "chunking") - await ctx.task_logger.log_task_progress( - ctx.log_entry, - f"Creating document from markdown content: {ctx.filename}", - { - "processing_stage": "creating_document", - "content_length": len(markdown_content), - }, - ) result = await add_received_markdown_file_document( ctx.session, ctx.filename, - markdown_content, + etl_result.markdown_content, ctx.search_space_id, ctx.user_id, ctx.connector, @@ -181,179 +152,19 @@ async def _process_markdown_upload(ctx: 
_ProcessingContext) -> Document | None: if result: await ctx.task_logger.log_task_success( ctx.log_entry, - f"Successfully processed markdown file: {ctx.filename}", + f"Successfully processed file: {ctx.filename}", { "document_id": result.id, "content_hash": result.content_hash, - "file_type": "markdown", + "file_type": etl_result.content_type, + "etl_service": etl_result.etl_service, }, ) else: await ctx.task_logger.log_task_success( ctx.log_entry, - f"Markdown file already exists (duplicate): {ctx.filename}", - {"duplicate_detected": True, "file_type": "markdown"}, - ) - return result - - -async def _process_direct_convert_upload(ctx: _ProcessingContext) -> Document | None: - """Convert a text-based file (csv/tsv/html) to markdown without ETL.""" - await _notify(ctx, "parsing", "Converting file") - await ctx.task_logger.log_task_progress( - ctx.log_entry, - f"Direct-converting file to markdown: {ctx.filename}", - {"file_type": "direct_convert", "processing_stage": "converting"}, - ) - - markdown_content = convert_file_directly(ctx.file_path, ctx.filename) - - with contextlib.suppress(Exception): - os.unlink(ctx.file_path) - - await _notify(ctx, "chunking") - await ctx.task_logger.log_task_progress( - ctx.log_entry, - f"Creating document from converted content: {ctx.filename}", - { - "processing_stage": "creating_document", - "content_length": len(markdown_content), - }, - ) - - result = await add_received_markdown_file_document( - ctx.session, - ctx.filename, - markdown_content, - ctx.search_space_id, - ctx.user_id, - ctx.connector, - ) - if ctx.connector: - await update_document_from_connector(result, ctx.connector, ctx.session) - - if result: - await ctx.task_logger.log_task_success( - ctx.log_entry, - f"Successfully direct-converted file: {ctx.filename}", - { - "document_id": result.id, - "content_hash": result.content_hash, - "file_type": "direct_convert", - }, - ) - else: - await ctx.task_logger.log_task_success( - ctx.log_entry, - f"Direct-converted file 
already exists (duplicate): {ctx.filename}", - {"duplicate_detected": True, "file_type": "direct_convert"}, - ) - return result - - -async def _process_audio_upload(ctx: _ProcessingContext) -> Document | None: - """Transcribe an audio file and create or update a document.""" - await _notify(ctx, "parsing", "Transcribing audio") - await ctx.task_logger.log_task_progress( - ctx.log_entry, - f"Processing audio file for transcription: {ctx.filename}", - {"file_type": "audio", "processing_stage": "starting_transcription"}, - ) - - stt_service_type = ( - "local" - if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/") - else "external" - ) - - if stt_service_type == "local": - from app.services.stt_service import stt_service - - try: - stt_result = stt_service.transcribe_file(ctx.file_path) - transcribed_text = stt_result.get("text", "") - if not transcribed_text: - raise ValueError("Transcription returned empty text") - transcribed_text = ( - f"# Transcription of {ctx.filename}\n\n{transcribed_text}" - ) - except Exception as e: - raise HTTPException( - status_code=422, - detail=f"Failed to transcribe audio file {ctx.filename}: {e!s}", - ) from e - - await ctx.task_logger.log_task_progress( - ctx.log_entry, - f"Local STT transcription completed: {ctx.filename}", - { - "processing_stage": "local_transcription_complete", - "language": stt_result.get("language"), - "confidence": stt_result.get("language_probability"), - "duration": stt_result.get("duration"), - }, - ) - else: - from litellm import atranscription - - with open(ctx.file_path, "rb") as audio_file: - transcription_kwargs: dict = { - "model": app_config.STT_SERVICE, - "file": audio_file, - "api_key": app_config.STT_SERVICE_API_KEY, - } - if app_config.STT_SERVICE_API_BASE: - transcription_kwargs["api_base"] = app_config.STT_SERVICE_API_BASE - - transcription_response = await atranscription(**transcription_kwargs) - transcribed_text = transcription_response.get("text", "") - if not 
transcribed_text: - raise ValueError("Transcription returned empty text") - - transcribed_text = f"# Transcription of {ctx.filename}\n\n{transcribed_text}" - - await ctx.task_logger.log_task_progress( - ctx.log_entry, - f"Transcription completed, creating document: {ctx.filename}", - { - "processing_stage": "transcription_complete", - "transcript_length": len(transcribed_text), - }, - ) - - await _notify(ctx, "chunking") - - with contextlib.suppress(Exception): - os.unlink(ctx.file_path) - - result = await add_received_markdown_file_document( - ctx.session, - ctx.filename, - transcribed_text, - ctx.search_space_id, - ctx.user_id, - ctx.connector, - ) - if ctx.connector: - await update_document_from_connector(result, ctx.connector, ctx.session) - - if result: - await ctx.task_logger.log_task_success( - ctx.log_entry, - f"Successfully transcribed and processed audio file: {ctx.filename}", - { - "document_id": result.id, - "content_hash": result.content_hash, - "file_type": "audio", - "transcript_length": len(transcribed_text), - "stt_service": stt_service_type, - }, - ) - else: - await ctx.task_logger.log_task_success( - ctx.log_entry, - f"Audio file transcript already exists (duplicate): {ctx.filename}", - {"duplicate_detected": True, "file_type": "audio"}, + f"File already exists (duplicate): {ctx.filename}", + {"duplicate_detected": True, "file_type": etl_result.content_type}, ) return result @@ -363,279 +174,10 @@ async def _process_audio_upload(ctx: _ProcessingContext) -> Document | None: # --------------------------------------------------------------------------- -async def _etl_unstructured( - ctx: _ProcessingContext, - page_limit_service, - estimated_pages: int, -) -> Document | None: - """Parse and save via the Unstructured ETL service.""" - await _notify(ctx, "parsing", "Extracting content") - await ctx.task_logger.log_task_progress( - ctx.log_entry, - f"Processing file with Unstructured ETL: {ctx.filename}", - { - "file_type": "document", - "etl_service": 
"UNSTRUCTURED", - "processing_stage": "loading", - }, - ) - - docs = await parse_with_unstructured(ctx.file_path) - - await _notify(ctx, "chunking", chunks_count=len(docs)) - await ctx.task_logger.log_task_progress( - ctx.log_entry, - f"Unstructured ETL completed, creating document: {ctx.filename}", - {"processing_stage": "etl_complete", "elements_count": len(docs)}, - ) - - actual_pages = page_limit_service.estimate_pages_from_elements(docs) - final_pages = max(estimated_pages, actual_pages) - await _log_page_divergence( - ctx.task_logger, - ctx.log_entry, - ctx.filename, - estimated_pages, - actual_pages, - final_pages, - ) - - with contextlib.suppress(Exception): - os.unlink(ctx.file_path) - - result = await add_received_file_document_using_unstructured( - ctx.session, - ctx.filename, - docs, - ctx.search_space_id, - ctx.user_id, - ctx.connector, - enable_summary=ctx.enable_summary, - ) - if ctx.connector: - await update_document_from_connector(result, ctx.connector, ctx.session) - - if result: - await page_limit_service.update_page_usage( - ctx.user_id, final_pages, allow_exceed=True - ) - await ctx.task_logger.log_task_success( - ctx.log_entry, - f"Successfully processed file with Unstructured: {ctx.filename}", - { - "document_id": result.id, - "content_hash": result.content_hash, - "file_type": "document", - "etl_service": "UNSTRUCTURED", - "pages_processed": final_pages, - }, - ) - else: - await ctx.task_logger.log_task_success( - ctx.log_entry, - f"Document already exists (duplicate): {ctx.filename}", - { - "duplicate_detected": True, - "file_type": "document", - "etl_service": "UNSTRUCTURED", - }, - ) - return result - - -async def _etl_llamacloud( - ctx: _ProcessingContext, - page_limit_service, - estimated_pages: int, -) -> Document | None: - """Parse and save via the LlamaCloud ETL service.""" - await _notify(ctx, "parsing", "Extracting content") - await ctx.task_logger.log_task_progress( - ctx.log_entry, - f"Processing file with LlamaCloud ETL: 
{ctx.filename}", - { - "file_type": "document", - "etl_service": "LLAMACLOUD", - "processing_stage": "parsing", - "estimated_pages": estimated_pages, - }, - ) - - raw_result = await parse_with_llamacloud_retry( - file_path=ctx.file_path, - estimated_pages=estimated_pages, - task_logger=ctx.task_logger, - log_entry=ctx.log_entry, - ) - - with contextlib.suppress(Exception): - os.unlink(ctx.file_path) - - markdown_documents = await raw_result.aget_markdown_documents(split_by_page=False) - - await _notify(ctx, "chunking", chunks_count=len(markdown_documents)) - await ctx.task_logger.log_task_progress( - ctx.log_entry, - f"LlamaCloud parsing completed, creating documents: {ctx.filename}", - { - "processing_stage": "parsing_complete", - "documents_count": len(markdown_documents), - }, - ) - - if not markdown_documents: - await ctx.task_logger.log_task_failure( - ctx.log_entry, - f"LlamaCloud parsing returned no documents: {ctx.filename}", - "ETL service returned empty document list", - {"error_type": "EmptyDocumentList", "etl_service": "LLAMACLOUD"}, - ) - raise ValueError(f"LlamaCloud parsing returned no documents for {ctx.filename}") - - actual_pages = page_limit_service.estimate_pages_from_markdown(markdown_documents) - final_pages = max(estimated_pages, actual_pages) - await _log_page_divergence( - ctx.task_logger, - ctx.log_entry, - ctx.filename, - estimated_pages, - actual_pages, - final_pages, - ) - - any_created = False - last_doc: Document | None = None - - for doc in markdown_documents: - doc_result = await add_received_file_document_using_llamacloud( - ctx.session, - ctx.filename, - llamacloud_markdown_document=doc.text, - search_space_id=ctx.search_space_id, - user_id=ctx.user_id, - connector=ctx.connector, - enable_summary=ctx.enable_summary, - ) - if doc_result: - any_created = True - last_doc = doc_result - - if any_created: - await page_limit_service.update_page_usage( - ctx.user_id, final_pages, allow_exceed=True - ) - if ctx.connector: - await 
update_document_from_connector(last_doc, ctx.connector, ctx.session) - await ctx.task_logger.log_task_success( - ctx.log_entry, - f"Successfully processed file with LlamaCloud: {ctx.filename}", - { - "document_id": last_doc.id, - "content_hash": last_doc.content_hash, - "file_type": "document", - "etl_service": "LLAMACLOUD", - "pages_processed": final_pages, - "documents_count": len(markdown_documents), - }, - ) - return last_doc - - await ctx.task_logger.log_task_success( - ctx.log_entry, - f"Document already exists (duplicate): {ctx.filename}", - { - "duplicate_detected": True, - "file_type": "document", - "etl_service": "LLAMACLOUD", - "documents_count": len(markdown_documents), - }, - ) - return None - - -async def _etl_docling( - ctx: _ProcessingContext, - page_limit_service, - estimated_pages: int, -) -> Document | None: - """Parse and save via the Docling ETL service.""" - await _notify(ctx, "parsing", "Extracting content") - await ctx.task_logger.log_task_progress( - ctx.log_entry, - f"Processing file with Docling ETL: {ctx.filename}", - { - "file_type": "document", - "etl_service": "DOCLING", - "processing_stage": "parsing", - }, - ) - - content = await parse_with_docling(ctx.file_path, ctx.filename) - - with contextlib.suppress(Exception): - os.unlink(ctx.file_path) - - await ctx.task_logger.log_task_progress( - ctx.log_entry, - f"Docling parsing completed, creating document: {ctx.filename}", - {"processing_stage": "parsing_complete", "content_length": len(content)}, - ) - - actual_pages = page_limit_service.estimate_pages_from_content_length(len(content)) - final_pages = max(estimated_pages, actual_pages) - await _log_page_divergence( - ctx.task_logger, - ctx.log_entry, - ctx.filename, - estimated_pages, - actual_pages, - final_pages, - ) - - await _notify(ctx, "chunking") - - result = await add_received_file_document_using_docling( - ctx.session, - ctx.filename, - docling_markdown_document=content, - search_space_id=ctx.search_space_id, - 
user_id=ctx.user_id, - connector=ctx.connector, - enable_summary=ctx.enable_summary, - ) - - if result: - await page_limit_service.update_page_usage( - ctx.user_id, final_pages, allow_exceed=True - ) - if ctx.connector: - await update_document_from_connector(result, ctx.connector, ctx.session) - await ctx.task_logger.log_task_success( - ctx.log_entry, - f"Successfully processed file with Docling: {ctx.filename}", - { - "document_id": result.id, - "content_hash": result.content_hash, - "file_type": "document", - "etl_service": "DOCLING", - "pages_processed": final_pages, - }, - ) - else: - await ctx.task_logger.log_task_success( - ctx.log_entry, - f"Document already exists (duplicate): {ctx.filename}", - { - "duplicate_detected": True, - "file_type": "document", - "etl_service": "DOCLING", - }, - ) - return result - - async def _process_document_upload(ctx: _ProcessingContext) -> Document | None: - """Route a document file to the configured ETL service.""" + """Route a document file to the configured ETL service via the unified pipeline.""" + from app.etl_pipeline.etl_document import EtlRequest + from app.etl_pipeline.etl_pipeline_service import EtlPipelineService from app.services.page_limit_service import PageLimitExceededError, PageLimitService page_limit_service = PageLimitService(ctx.session) @@ -665,16 +207,60 @@ async def _process_document_upload(ctx: _ProcessingContext) -> Document | None: os.unlink(ctx.file_path) raise HTTPException(status_code=403, detail=str(e)) from e - etl_dispatch = { - "UNSTRUCTURED": _etl_unstructured, - "LLAMACLOUD": _etl_llamacloud, - "DOCLING": _etl_docling, - } - handler = etl_dispatch.get(app_config.ETL_SERVICE) - if handler is None: - raise RuntimeError(f"Unknown ETL_SERVICE: {app_config.ETL_SERVICE}") + await _notify(ctx, "parsing", "Extracting content") - return await handler(ctx, page_limit_service, estimated_pages) + etl_result = await EtlPipelineService().extract( + EtlRequest( + file_path=ctx.file_path, + 
filename=ctx.filename, + estimated_pages=estimated_pages, + ) + ) + + with contextlib.suppress(Exception): + os.unlink(ctx.file_path) + + await _notify(ctx, "chunking") + + result = await save_file_document( + ctx.session, + ctx.filename, + etl_result.markdown_content, + ctx.search_space_id, + ctx.user_id, + etl_result.etl_service, + ctx.connector, + enable_summary=ctx.enable_summary, + ) + + if result: + await page_limit_service.update_page_usage( + ctx.user_id, estimated_pages, allow_exceed=True + ) + if ctx.connector: + await update_document_from_connector(result, ctx.connector, ctx.session) + await ctx.task_logger.log_task_success( + ctx.log_entry, + f"Successfully processed file: {ctx.filename}", + { + "document_id": result.id, + "content_hash": result.content_hash, + "file_type": "document", + "etl_service": etl_result.etl_service, + "pages_processed": estimated_pages, + }, + ) + else: + await ctx.task_logger.log_task_success( + ctx.log_entry, + f"Document already exists (duplicate): {ctx.filename}", + { + "duplicate_detected": True, + "file_type": "document", + "etl_service": etl_result.etl_service, + }, + ) + return result # =================================================================== @@ -706,15 +292,16 @@ async def process_file_in_background( ) try: - category = classify_file(filename) + from app.etl_pipeline.file_classifier import ( + FileCategory as EtlFileCategory, + classify_file as etl_classify, + ) - if category == FileCategory.MARKDOWN: - return await _process_markdown_upload(ctx) - if category == FileCategory.DIRECT_CONVERT: - return await _process_direct_convert_upload(ctx) - if category == FileCategory.AUDIO: - return await _process_audio_upload(ctx) - return await _process_document_upload(ctx) + category = etl_classify(filename) + + if category == EtlFileCategory.DOCUMENT: + return await _process_document_upload(ctx) + return await _process_non_document_upload(ctx) except Exception as e: await session.rollback() @@ -758,201 +345,64 @@ 
async def _extract_file_content( Returns: Tuple of (markdown_content, etl_service_name). """ - category = classify_file(filename) - - if category == FileCategory.MARKDOWN: - if notification: - await NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="parsing", - stage_message="Reading file", - ) - await task_logger.log_task_progress( - log_entry, - f"Processing markdown/text file: {filename}", - {"file_type": "markdown", "processing_stage": "reading_file"}, - ) - with open(file_path, encoding="utf-8") as f: - content = f.read() - with contextlib.suppress(Exception): - os.unlink(file_path) - return content, "MARKDOWN" - - if category == FileCategory.DIRECT_CONVERT: - if notification: - await NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="parsing", - stage_message="Converting file", - ) - await task_logger.log_task_progress( - log_entry, - f"Direct-converting file to markdown: {filename}", - {"file_type": "direct_convert", "processing_stage": "converting"}, - ) - content = convert_file_directly(file_path, filename) - with contextlib.suppress(Exception): - os.unlink(file_path) - return content, "DIRECT_CONVERT" - - if category == FileCategory.AUDIO: - if notification: - await NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="parsing", - stage_message="Transcribing audio", - ) - await task_logger.log_task_progress( - log_entry, - f"Processing audio file for transcription: {filename}", - {"file_type": "audio", "processing_stage": "starting_transcription"}, - ) - transcribed_text = await _transcribe_audio(file_path, filename) - with contextlib.suppress(Exception): - os.unlink(file_path) - return transcribed_text, "AUDIO_TRANSCRIPTION" - - # Document file — use ETL service - return await _extract_document_content( - file_path, - filename, - session, - user_id, - task_logger, - log_entry, - notification, + from 
app.etl_pipeline.etl_document import EtlRequest + from app.etl_pipeline.etl_pipeline_service import EtlPipelineService + from app.etl_pipeline.file_classifier import ( + FileCategory, + classify_file as etl_classify, ) - -async def _transcribe_audio(file_path: str, filename: str) -> str: - """Transcribe an audio file and return formatted markdown text.""" - stt_service_type = ( - "local" - if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/") - else "external" - ) - - if stt_service_type == "local": - from app.services.stt_service import stt_service - - result = stt_service.transcribe_file(file_path) - text = result.get("text", "") - if not text: - raise ValueError("Transcription returned empty text") - else: - from litellm import atranscription - - with open(file_path, "rb") as audio_file: - kwargs: dict = { - "model": app_config.STT_SERVICE, - "file": audio_file, - "api_key": app_config.STT_SERVICE_API_KEY, - } - if app_config.STT_SERVICE_API_BASE: - kwargs["api_base"] = app_config.STT_SERVICE_API_BASE - response = await atranscription(**kwargs) - text = response.get("text", "") - if not text: - raise ValueError("Transcription returned empty text") - - return f"# Transcription of {filename}\n\n{text}" - - -async def _extract_document_content( - file_path: str, - filename: str, - session: AsyncSession, - user_id: str, - task_logger: TaskLoggingService, - log_entry: Log, - notification: Notification | None, -) -> tuple[str, str]: - """ - Parse a document file via the configured ETL service. - - Returns: - Tuple of (markdown_content, etl_service_name). 
- """ - from app.services.page_limit_service import PageLimitService - - page_limit_service = PageLimitService(session) - - try: - estimated_pages = page_limit_service.estimate_pages_before_processing(file_path) - except Exception: - file_size = os.path.getsize(file_path) - estimated_pages = max(1, file_size // (80 * 1024)) - - await page_limit_service.check_page_limit(user_id, estimated_pages) - - etl_service = app_config.ETL_SERVICE - markdown_content: str | None = None + category = etl_classify(filename) + estimated_pages = 0 if notification: + stage_messages = { + FileCategory.PLAINTEXT: "Reading file", + FileCategory.DIRECT_CONVERT: "Converting file", + FileCategory.AUDIO: "Transcribing audio", + FileCategory.UNSUPPORTED: "Unsupported file type", + FileCategory.DOCUMENT: "Extracting content", + } await NotificationService.document_processing.notify_processing_progress( session, notification, stage="parsing", - stage_message="Extracting content", + stage_message=stage_messages.get(category, "Processing"), ) - if etl_service == "UNSTRUCTURED": - from app.utils.document_converters import convert_document_to_markdown + await task_logger.log_task_progress( + log_entry, + f"Processing {category.value} file: {filename}", + {"file_type": category.value, "processing_stage": "extracting"}, + ) - docs = await parse_with_unstructured(file_path) - markdown_content = await convert_document_to_markdown(docs) - actual_pages = page_limit_service.estimate_pages_from_elements(docs) - final_pages = max(estimated_pages, actual_pages) - await page_limit_service.update_page_usage( - user_id, final_pages, allow_exceed=True - ) + if category == FileCategory.DOCUMENT: + from app.services.page_limit_service import PageLimitService - elif etl_service == "LLAMACLOUD": - raw_result = await parse_with_llamacloud_retry( + page_limit_service = PageLimitService(session) + estimated_pages = _estimate_pages_safe(page_limit_service, file_path) + await page_limit_service.check_page_limit(user_id, 
estimated_pages) + + result = await EtlPipelineService().extract( + EtlRequest( file_path=file_path, + filename=filename, estimated_pages=estimated_pages, - task_logger=task_logger, - log_entry=log_entry, ) - markdown_documents = await raw_result.aget_markdown_documents( - split_by_page=False - ) - if not markdown_documents: - raise RuntimeError(f"LlamaCloud parsing returned no documents: {filename}") - markdown_content = markdown_documents[0].text + ) + + if category == FileCategory.DOCUMENT: await page_limit_service.update_page_usage( user_id, estimated_pages, allow_exceed=True ) - elif etl_service == "DOCLING": - getLogger("docling.pipeline.base_pipeline").setLevel(ERROR) - getLogger("docling.document_converter").setLevel(ERROR) - getLogger("docling_core.transforms.chunker.hierarchical_chunker").setLevel( - ERROR - ) - - from docling.document_converter import DocumentConverter - - converter = DocumentConverter() - result = converter.convert(file_path) - markdown_content = result.document.export_to_markdown() - await page_limit_service.update_page_usage( - user_id, estimated_pages, allow_exceed=True - ) - - else: - raise RuntimeError(f"Unknown ETL_SERVICE: {etl_service}") - with contextlib.suppress(Exception): os.unlink(file_path) - if not markdown_content: + if not result.markdown_content: raise RuntimeError(f"Failed to extract content from file: {filename}") - return markdown_content, etl_service + return result.markdown_content, result.etl_service async def process_file_in_background_with_document( diff --git a/surfsense_backend/app/utils/file_extensions.py b/surfsense_backend/app/utils/file_extensions.py new file mode 100644 index 000000000..8d432ce56 --- /dev/null +++ b/surfsense_backend/app/utils/file_extensions.py @@ -0,0 +1,124 @@ +"""Per-parser document extension sets for the ETL pipeline. + +Every consumer (file_classifier, connector-level skip checks, ETL pipeline +validation) imports from here so there is a single source of truth. 
+ +Extensions already covered by PLAINTEXT_EXTENSIONS, AUDIO_EXTENSIONS, or +DIRECT_CONVERT_EXTENSIONS in file_classifier are NOT repeated here -- these +sets are exclusively for the "document" ETL path (Docling / LlamaParse / +Unstructured). +""" + +from pathlib import PurePosixPath + +# --------------------------------------------------------------------------- +# Per-parser document extension sets (from official documentation) +# --------------------------------------------------------------------------- + +DOCLING_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset( + { + ".pdf", + ".docx", + ".xlsx", + ".pptx", + ".png", + ".jpg", + ".jpeg", + ".tiff", + ".tif", + ".bmp", + ".webp", + } +) + +LLAMAPARSE_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset( + { + ".pdf", + ".docx", + ".doc", + ".xlsx", + ".xls", + ".pptx", + ".ppt", + ".docm", + ".dot", + ".dotm", + ".pptm", + ".pot", + ".potx", + ".xlsm", + ".xlsb", + ".xlw", + ".rtf", + ".epub", + ".png", + ".jpg", + ".jpeg", + ".gif", + ".bmp", + ".tiff", + ".tif", + ".webp", + ".svg", + ".odt", + ".ods", + ".odp", + ".hwp", + ".hwpx", + } +) + +UNSTRUCTURED_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset( + { + ".pdf", + ".docx", + ".doc", + ".xlsx", + ".xls", + ".pptx", + ".ppt", + ".png", + ".jpg", + ".jpeg", + ".bmp", + ".tiff", + ".tif", + ".heic", + ".rtf", + ".epub", + ".odt", + ".eml", + ".msg", + ".p7s", + } +) + +# --------------------------------------------------------------------------- +# Union (used by classify_file for routing) + service lookup +# --------------------------------------------------------------------------- + +DOCUMENT_EXTENSIONS: frozenset[str] = ( + DOCLING_DOCUMENT_EXTENSIONS + | LLAMAPARSE_DOCUMENT_EXTENSIONS + | UNSTRUCTURED_DOCUMENT_EXTENSIONS +) + +_SERVICE_MAP: dict[str, frozenset[str]] = { + "DOCLING": DOCLING_DOCUMENT_EXTENSIONS, + "LLAMACLOUD": LLAMAPARSE_DOCUMENT_EXTENSIONS, + "UNSTRUCTURED": UNSTRUCTURED_DOCUMENT_EXTENSIONS, +} + + +def 
get_document_extensions_for_service(etl_service: str | None) -> frozenset[str]: + """Return the document extensions supported by *etl_service*. + + Falls back to the full union when the service is ``None`` or unknown. + """ + return _SERVICE_MAP.get(etl_service or "", DOCUMENT_EXTENSIONS) + + +def is_supported_document_extension(filename: str) -> bool: + """Return True if the file's extension is in the supported document set.""" + suffix = PurePosixPath(filename).suffix.lower() + return suffix in DOCUMENT_EXTENSIONS diff --git a/surfsense_backend/tests/integration/document_upload/conftest.py b/surfsense_backend/tests/integration/document_upload/conftest.py index 41c379e58..62f4f6b47 100644 --- a/surfsense_backend/tests/integration/document_upload/conftest.py +++ b/surfsense_backend/tests/integration/document_upload/conftest.py @@ -319,31 +319,23 @@ def _mock_etl_parsing(monkeypatch): # -- LlamaParse mock (external API) -------------------------------- - class _FakeMarkdownDoc: - def __init__(self, text: str): - self.text = text - - class _FakeLlamaParseResult: - async def aget_markdown_documents(self, *, split_by_page=False): - return [_FakeMarkdownDoc(_MOCK_ETL_MARKDOWN)] - - async def _fake_llamacloud_parse(**kwargs): - _reject_empty(kwargs["file_path"]) - return _FakeLlamaParseResult() + async def _fake_llamacloud_parse(file_path: str, estimated_pages: int) -> str: + _reject_empty(file_path) + return _MOCK_ETL_MARKDOWN monkeypatch.setattr( - "app.tasks.document_processors.file_processors.parse_with_llamacloud_retry", + "app.etl_pipeline.parsers.llamacloud.parse_with_llamacloud", _fake_llamacloud_parse, ) # -- Docling mock (heavy library boundary) ------------------------- - async def _fake_docling_parse(file_path: str, filename: str): + async def _fake_docling_parse(file_path: str, filename: str) -> str: _reject_empty(file_path) return _MOCK_ETL_MARKDOWN monkeypatch.setattr( - "app.tasks.document_processors.file_processors.parse_with_docling", + 
"app.etl_pipeline.parsers.docling.parse_with_docling", _fake_docling_parse, ) diff --git a/surfsense_backend/tests/integration/google_unification/test_drive_indexer_credentials.py b/surfsense_backend/tests/integration/google_unification/test_drive_indexer_credentials.py index 5bb0b6137..e669fa143 100644 --- a/surfsense_backend/tests/integration/google_unification/test_drive_indexer_credentials.py +++ b/surfsense_backend/tests/integration/google_unification/test_drive_indexer_credentials.py @@ -124,7 +124,7 @@ async def test_composio_connector_without_account_id_returns_error( maker = make_session_factory(async_engine) async with maker() as session: - count, _skipped, error = await index_google_drive_files( + count, _skipped, error, _unsupported = await index_google_drive_files( session=session, connector_id=data["connector_id"], search_space_id=data["search_space_id"], diff --git a/surfsense_backend/tests/unit/connector_indexers/test_content_extraction.py b/surfsense_backend/tests/unit/connector_indexers/test_content_extraction.py new file mode 100644 index 000000000..cd112e09f --- /dev/null +++ b/surfsense_backend/tests/unit/connector_indexers/test_content_extraction.py @@ -0,0 +1,244 @@ +"""Tests that each cloud connector's download_and_extract_content correctly +produces markdown from a real file via the unified ETL pipeline. + +Only the cloud client is mocked (system boundary). The ETL pipeline runs for +real so we know the full path from "cloud gives us bytes" to "we get markdown +back" actually works. +""" + +from unittest.mock import AsyncMock, MagicMock + +import pytest + +pytestmark = pytest.mark.unit + +_TXT_CONTENT = "Hello from the cloud connector test." 
+_CSV_CONTENT = "name,age\nAlice,30\nBob,25\n" + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +async def _write_file(dest_path: str, content: str) -> None: + """Simulate a cloud client writing downloaded bytes to disk.""" + with open(dest_path, "w", encoding="utf-8") as f: + f.write(content) + + +def _make_download_side_effect(content: str): + """Return an async side-effect that writes *content* to the dest path + and returns ``None`` (success).""" + + async def _side_effect(*args): + dest_path = args[-1] + await _write_file(dest_path, content) + return None + + return _side_effect + + +# =================================================================== +# Google Drive +# =================================================================== + + +class TestGoogleDriveContentExtraction: + async def test_txt_file_returns_markdown(self): + from app.connectors.google_drive.content_extractor import ( + download_and_extract_content, + ) + + client = MagicMock() + client.download_file_to_disk = AsyncMock( + side_effect=_make_download_side_effect(_TXT_CONTENT), + ) + + file = {"id": "f1", "name": "notes.txt", "mimeType": "text/plain"} + + markdown, metadata, error = await download_and_extract_content(client, file) + + assert error is None + assert _TXT_CONTENT in markdown + assert metadata["google_drive_file_id"] == "f1" + assert metadata["google_drive_file_name"] == "notes.txt" + + async def test_csv_file_returns_markdown_table(self): + from app.connectors.google_drive.content_extractor import ( + download_and_extract_content, + ) + + client = MagicMock() + client.download_file_to_disk = AsyncMock( + side_effect=_make_download_side_effect(_CSV_CONTENT), + ) + + file = {"id": "f2", "name": "data.csv", "mimeType": "text/csv"} + + markdown, _metadata, error = await download_and_extract_content(client, file) + + assert error is None + assert "Alice" 
class TestOneDriveContentExtraction:
    """Exercise the OneDrive ``download_and_extract_content`` entry point.

    Only ``client.download_file_to_disk`` is stubbed in these tests; whatever
    the extractor does with the written file afterwards runs unmocked.
    """

    async def test_txt_file_returns_markdown(self):
        """Plain-text content is returned and metadata carries the file id/name."""
        from app.connectors.onedrive.content_extractor import (
            download_and_extract_content,
        )

        client = MagicMock()
        client.download_file_to_disk = AsyncMock(
            side_effect=_make_download_side_effect(_TXT_CONTENT),
        )

        file = {
            "id": "od-1",
            "name": "report.txt",
            "file": {"mimeType": "text/plain"},
        }

        markdown, metadata, error = await download_and_extract_content(client, file)

        assert error is None
        assert _TXT_CONTENT in markdown
        assert metadata["onedrive_file_id"] == "od-1"
        assert metadata["onedrive_file_name"] == "report.txt"

    async def test_csv_file_returns_markdown_table(self):
        """CSV content comes back as a pipe-delimited table with every data row."""
        from app.connectors.onedrive.content_extractor import (
            download_and_extract_content,
        )

        client = MagicMock()
        client.download_file_to_disk = AsyncMock(
            side_effect=_make_download_side_effect(_CSV_CONTENT),
        )

        file = {
            "id": "od-2",
            "name": "data.csv",
            "file": {"mimeType": "text/csv"},
        }

        markdown, _metadata, error = await download_and_extract_content(client, file)

        assert error is None
        assert "Alice" in markdown
        # Check the second data row as well, so a table truncated after the
        # first row fails here just as it does in the Google Drive CSV test.
        assert "Bob" in markdown
        assert "|" in markdown

    async def test_download_error_returns_error_message(self):
        """An error string from the client is passed through and no markdown is produced."""
        from app.connectors.onedrive.content_extractor import (
            download_and_extract_content,
        )

        client = MagicMock()
        # A non-None return value from download_file_to_disk signals failure.
        client.download_file_to_disk = AsyncMock(return_value="403 Forbidden")

        file = {
            "id": "od-3",
            "name": "secret.txt",
            "file": {"mimeType": "text/plain"},
        }

        markdown, _metadata, error = await download_and_extract_content(client, file)

        assert markdown is None
        assert error == "403 Forbidden"
file = { + "id": "dbx-3", + "name": "big.txt", + ".tag": "file", + "path_lower": "/big.txt", + } + + markdown, _metadata, error = await download_and_extract_content(client, file) + + assert markdown is None + assert error == "Rate limited" diff --git a/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py b/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py index 76f8806dc..f72135d05 100644 --- a/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py +++ b/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py @@ -8,6 +8,10 @@ import pytest from app.db import DocumentType from app.tasks.connector_indexers.dropbox_indexer import ( _download_files_parallel, + _index_full_scan, + _index_selected_files, + _index_with_delta_sync, + index_dropbox_files, ) pytestmark = pytest.mark.unit @@ -234,3 +238,610 @@ async def test_heartbeat_fires_during_parallel_downloads( assert len(docs) == 3 assert failed == 0 assert len(heartbeat_calls) >= 1, "Heartbeat should have fired at least once" + + +# --------------------------------------------------------------------------- +# D1-D2: _index_full_scan tests +# --------------------------------------------------------------------------- + + +def _folder_dict(name: str) -> dict: + return {".tag": "folder", "name": name} + + +@pytest.fixture +def full_scan_mocks(mock_dropbox_client, monkeypatch): + """Wire up mocks for _index_full_scan in isolation.""" + import app.tasks.connector_indexers.dropbox_indexer as _mod + + mock_session = AsyncMock() + mock_task_logger = MagicMock() + mock_task_logger.log_task_progress = AsyncMock() + mock_log_entry = MagicMock() + + skip_results: dict[str, tuple[bool, str | None]] = {} + + monkeypatch.setattr("app.config.config.ETL_SERVICE", "LLAMACLOUD") + + async def _fake_skip(session, file, search_space_id): + from app.connectors.dropbox.file_types import should_skip_file as _skip + + item_skip, unsup_ext = _skip(file) + if 
async def test_full_scan_three_phase_counts(full_scan_mocks, monkeypatch):
    """Check the indexed/skipped tallies for a mixed folder listing.

    The listing holds one folder, one unchanged file, one renamed file and
    two new files. Expected outcome: the folder and the unchanged file are
    skipped, the rename is counted as indexed, and only the two new files
    are handed to the ``_download_and_index`` stub.
    """
    file_names_by_id = {
        "skip1": "unchanged.txt",
        "rename1": "renamed.txt",
        "new1": "new1.txt",
        "new2": "new2.txt",
    }
    page_files = [_folder_dict("SubFolder")]
    page_files.extend(
        _make_file_dict(file_id, file_name)
        for file_id, file_name in file_names_by_id.items()
    )

    full_scan_mocks["skip_results"].update(
        {
            "skip1": (True, "unchanged"),
            "rename1": (True, "File renamed: 'old' -> 'renamed.txt'"),
        }
    )

    download_stub = full_scan_mocks["download_and_index_mock"]
    download_stub.return_value = (2, 0)

    indexed, skipped, _unsupported = await _run_full_scan(
        full_scan_mocks, monkeypatch, page_files
    )

    assert indexed == 3  # 1 renamed + 2 from batch
    assert skipped == 2  # 1 folder + 1 unchanged

    # Third positional argument of the stubbed call is the batch of files.
    batch = download_stub.call_args.args[2]
    assert len(batch) == 2
    assert {entry["id"] for entry in batch} == {"new1", "new2"}
async def _run_selected(mocks, file_tuples):
    """Invoke ``_index_selected_files`` with the fixture's client and session.

    *mocks* must provide the ``"dropbox_client"`` and ``"session"`` keys;
    *file_tuples* is forwarded unchanged as the third positional argument.
    The connector, search-space and user ids come from the module-level test
    constants, and ``enable_summary`` is always ``True``.
    """
    identifiers = {
        "connector_id": _CONNECTOR_ID,
        "search_space_id": _SEARCH_SPACE_ID,
        "user_id": _USER_ID,
    }
    return await _index_selected_files(
        mocks["dropbox_client"],
        mocks["session"],
        file_tuples,
        enable_summary=True,
        **identifiers,
    )
async def test_selected_files_skip_rename_counting(selected_files_mocks):
    """Check tallies when a selection mixes unchanged, renamed and new files.

    The unchanged file counts as skipped, the renamed file counts as indexed,
    and only the two new files are passed to the ``_download_and_index`` stub.
    """
    catalogue = (
        ("s1", "unchanged.txt"),
        ("r1", "renamed.txt"),
        ("n1", "new1.txt"),
        ("n2", "new2.txt"),
    )

    # Every selected path resolves to a file entry with no lookup error.
    selected_files_mocks["get_file_results"].update(
        {
            f"/{file_name}": (_make_file_dict(file_id, file_name), None)
            for file_id, file_name in catalogue
        }
    )
    selected_files_mocks["skip_results"].update(
        {
            "s1": (True, "unchanged"),
            "r1": (True, "File renamed: 'old' -> 'renamed.txt'"),
        }
    )

    download_stub = selected_files_mocks["download_and_index_mock"]
    download_stub.return_value = (2, 0)

    selection = [(f"/{file_name}", file_name) for _file_id, file_name in catalogue]
    indexed, skipped, _unsupported, errors = await _run_selected(
        selected_files_mocks, selection
    )

    assert indexed == 3  # 1 renamed + 2 batch
    assert skipped == 1
    assert errors == []

    batch = download_stub.call_args.args[2]
    assert len(batch) == 2
    assert {entry["id"] for entry in batch} == {"n1", "n2"}
async def test_delta_sync_upserts_filtered_and_downloaded(monkeypatch):
    """E2: changed file entries pass the skip filter and are downloaded and indexed.

    ``get_changes`` reports two file entries and the skip filter is stubbed to
    accept everything, so both entries must reach ``_download_and_index`` and
    the cursor returned by ``get_changes`` must be handed back.
    """
    import app.tasks.connector_indexers.dropbox_indexer as indexer

    changed = [
        _make_file_dict(file_id, file_name)
        for file_id, file_name in (
            ("mod1", "modified1.txt"),
            ("mod2", "modified2.txt"),
        )
    ]

    dropbox = MagicMock(
        get_changes=AsyncMock(return_value=(changed, "cursor-v2", None)),
    )

    never_skip = AsyncMock(return_value=(False, None))
    monkeypatch.setattr(indexer, "_should_skip_file", never_skip)

    download_stub = AsyncMock(return_value=(2, 0))
    monkeypatch.setattr(indexer, "_download_and_index", download_stub)

    task_logger = MagicMock(log_task_progress=AsyncMock())

    indexed, skipped, _unsupported, cursor = await _index_with_delta_sync(
        dropbox,
        AsyncMock(),
        _CONNECTOR_ID,
        _SEARCH_SPACE_ID,
        _USER_ID,
        "cursor-v1",
        task_logger,
        MagicMock(),
        max_files=500,
        enable_summary=True,
    )

    assert indexed == 2
    assert skipped == 0
    assert cursor == "cursor-v2"

    # Third positional argument of the stubbed call is the batch of files.
    batch = download_stub.call_args.args[2]
    assert len(batch) == 2
    assert {entry["id"] for entry in batch} == {"mod1", "mod2"}
@pytest.fixture
def orchestrator_mocks(monkeypatch):
    """Replace the collaborators of ``index_dropbox_files`` with stubs.

    Patched on the Dropbox indexer module: connector lookup, the task logging
    service, the last-indexed updater, both sync strategies and the Dropbox
    client class. The returned dict exposes the connector, the two strategy
    stubs and the client so tests can tweak and inspect them.
    """
    import app.tasks.connector_indexers.dropbox_indexer as indexer

    connector = MagicMock(
        config={"_token_encrypted": False},
        last_indexed_at=None,
        enable_summary=True,
    )

    task_logger = MagicMock(
        log_task_start=AsyncMock(return_value=MagicMock()),
        log_task_progress=AsyncMock(),
        log_task_success=AsyncMock(),
        log_task_failure=AsyncMock(),
    )

    full_scan_stub = AsyncMock(return_value=(5, 2, 0))
    delta_sync_stub = AsyncMock(return_value=(3, 1, 0, "delta-cursor-new"))

    dropbox = MagicMock(
        get_latest_cursor=AsyncMock(return_value=("latest-cursor-abc", None)),
    )

    replacements = {
        "get_connector_by_id": AsyncMock(return_value=connector),
        # Calling the patched class yields the prepared logger instance.
        "TaskLoggingService": MagicMock(return_value=task_logger),
        "update_connector_last_indexed": AsyncMock(),
        "_index_full_scan": full_scan_stub,
        "_index_with_delta_sync": delta_sync_stub,
        # Calling the patched class yields the prepared client instance.
        "DropboxClient": MagicMock(return_value=dropbox),
    }
    for attribute, stub in replacements.items():
        monkeypatch.setattr(indexer, attribute, stub)

    return {
        "connector": connector,
        "full_scan_mock": full_scan_stub,
        "delta_sync_mock": delta_sync_stub,
        "mock_client": dropbox,
    }
async def test_orchestrator_persists_cursor_after_sync(orchestrator_mocks):
    """F3: a sync run stores the latest folder cursor in the connector config.

    The connector starts with no saved cursors; after indexing the ``/docs``
    folder its config must hold the cursor reported by the stubbed client's
    ``get_latest_cursor``.
    """
    connector = orchestrator_mocks["connector"]
    connector.configure_mock(
        config={"_token_encrypted": False},
        last_indexed_at=None,
    )

    session = AsyncMock(commit=AsyncMock())
    items = {
        "folders": [{"path": "/docs", "name": "Docs"}],
        "files": [],
    }

    await index_dropbox_files(
        session,
        _CONNECTOR_ID,
        _SEARCH_SPACE_ID,
        _USER_ID,
        items,
    )

    assert "folder_cursors" in connector.config
    saved_cursors = connector.config["folder_cursors"]
    assert saved_cursors["/docs"] == "latest-cursor-abc"
def test_selected_files_single_file_indexed(selected_files_mocks): ) selected_files_mocks["download_and_index_mock"].return_value = (1, 0) - indexed, skipped, errors = await _run_selected( + indexed, skipped, _unsup, errors = await _run_selected( selected_files_mocks, [("f1", "report.pdf")], ) @@ -613,7 +613,7 @@ async def test_selected_files_fetch_failure_isolation(selected_files_mocks): ) selected_files_mocks["download_and_index_mock"].return_value = (2, 0) - indexed, skipped, errors = await _run_selected( + indexed, skipped, _unsup, errors = await _run_selected( selected_files_mocks, [("f1", "first.txt"), ("f2", "mid.txt"), ("f3", "third.txt")], ) @@ -647,7 +647,7 @@ async def test_selected_files_skip_rename_counting(selected_files_mocks): selected_files_mocks["download_and_index_mock"].return_value = (2, 0) - indexed, skipped, errors = await _run_selected( + indexed, skipped, _unsup, errors = await _run_selected( selected_files_mocks, [ ("s1", "unchanged.txt"), diff --git a/surfsense_backend/tests/unit/connector_indexers/test_page_limits.py b/surfsense_backend/tests/unit/connector_indexers/test_page_limits.py index b31a9557f..573ee43d8 100644 --- a/surfsense_backend/tests/unit/connector_indexers/test_page_limits.py +++ b/surfsense_backend/tests/unit/connector_indexers/test_page_limits.py @@ -198,7 +198,7 @@ async def test_gdrive_files_within_quota_are_downloaded(gdrive_selected_mocks): ) m["download_and_index_mock"].return_value = (3, 0) - indexed, _skipped, errors = await _run_gdrive_selected( + indexed, _skipped, _unsup, errors = await _run_gdrive_selected( m, [("f1", "f1.xyz"), ("f2", "f2.xyz"), ("f3", "f3.xyz")] ) @@ -219,7 +219,9 @@ async def test_gdrive_files_exceeding_quota_rejected(gdrive_selected_mocks): None, ) - indexed, _skipped, errors = await _run_gdrive_selected(m, [("big", "huge.pdf")]) + indexed, _skipped, _unsup, errors = await _run_gdrive_selected( + m, [("big", "huge.pdf")] + ) assert indexed == 0 assert len(errors) == 1 @@ -239,7 +241,7 @@ 
async def test_gdrive_quota_mix_partial_indexing(gdrive_selected_mocks): ) m["download_and_index_mock"].return_value = (2, 0) - indexed, _skipped, errors = await _run_gdrive_selected( + indexed, _skipped, _unsup, errors = await _run_gdrive_selected( m, [("f1", "f1.xyz"), ("f2", "f2.xyz"), ("f3", "f3.xyz")] ) @@ -299,7 +301,7 @@ async def test_gdrive_zero_quota_rejects_all(gdrive_selected_mocks): None, ) - indexed, _skipped, errors = await _run_gdrive_selected( + indexed, _skipped, _unsup, errors = await _run_gdrive_selected( m, [("f1", "f1.xyz"), ("f2", "f2.xyz")] ) @@ -384,7 +386,7 @@ async def test_gdrive_full_scan_skips_over_quota(gdrive_full_scan_mocks, monkeyp m["download_mock"].return_value = ([], 0) m["batch_mock"].return_value = ([], 2, 0) - _indexed, skipped = await _run_gdrive_full_scan(m) + _indexed, skipped, _unsup = await _run_gdrive_full_scan(m) call_files = m["download_mock"].call_args[0][1] assert len(call_files) == 2 @@ -459,7 +461,7 @@ async def test_gdrive_delta_sync_skips_over_quota(monkeypatch): mock_task_logger = MagicMock() mock_task_logger.log_task_progress = AsyncMock() - _indexed, skipped = await _mod._index_with_delta_sync( + _indexed, skipped, _unsupported = await _mod._index_with_delta_sync( MagicMock(), session, MagicMock(), @@ -552,7 +554,9 @@ async def test_onedrive_over_quota_rejected(onedrive_selected_mocks): None, ) - indexed, _skipped, errors = await _run_onedrive_selected(m, [("big", "huge.pdf")]) + indexed, _skipped, _unsup, errors = await _run_onedrive_selected( + m, [("big", "huge.pdf")] + ) assert indexed == 0 assert len(errors) == 1 @@ -652,7 +656,7 @@ async def test_dropbox_over_quota_rejected(dropbox_selected_mocks): None, ) - indexed, _skipped, errors = await _run_dropbox_selected( + indexed, _skipped, _unsup, errors = await _run_dropbox_selected( m, [("/huge.pdf", "huge.pdf")] ) diff --git a/surfsense_backend/tests/unit/connectors/__init__.py b/surfsense_backend/tests/unit/connectors/__init__.py new file mode 100644 
index 000000000..e69de29bb diff --git a/surfsense_backend/tests/unit/connectors/test_dropbox_client.py b/surfsense_backend/tests/unit/connectors/test_dropbox_client.py new file mode 100644 index 000000000..31cafe550 --- /dev/null +++ b/surfsense_backend/tests/unit/connectors/test_dropbox_client.py @@ -0,0 +1,123 @@ +"""Tests for DropboxClient delta-sync methods (get_latest_cursor, get_changes).""" + +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from app.connectors.dropbox.client import DropboxClient + +pytestmark = pytest.mark.unit + + +def _make_client() -> DropboxClient: + """Create a DropboxClient with a mocked DB session so no real DB needed.""" + client = DropboxClient.__new__(DropboxClient) + client._session = MagicMock() + client._connector_id = 1 + return client + + +# ---------- C1: get_latest_cursor ---------- + + +async def test_get_latest_cursor_returns_cursor_string(monkeypatch): + client = _make_client() + + fake_resp = MagicMock() + fake_resp.status_code = 200 + fake_resp.json.return_value = {"cursor": "AAHbKxRZ9enq…"} + + monkeypatch.setattr(client, "_request", AsyncMock(return_value=fake_resp)) + + cursor, error = await client.get_latest_cursor("/my-folder") + + assert cursor == "AAHbKxRZ9enq…" + assert error is None + client._request.assert_called_once_with( + "/2/files/list_folder/get_latest_cursor", + { + "path": "/my-folder", + "recursive": False, + "include_non_downloadable_files": True, + }, + ) + + +# ---------- C2: get_changes returns entries and new cursor ---------- + + +async def test_get_changes_returns_entries_and_cursor(monkeypatch): + client = _make_client() + + fake_resp = MagicMock() + fake_resp.status_code = 200 + fake_resp.json.return_value = { + "entries": [ + {".tag": "file", "name": "new.txt", "id": "id:abc"}, + {".tag": "deleted", "name": "old.txt"}, + ], + "cursor": "cursor-v2", + "has_more": False, + } + monkeypatch.setattr(client, "_request", AsyncMock(return_value=fake_resp)) + + entries, new_cursor, 
error = await client.get_changes("cursor-v1") + + assert error is None + assert new_cursor == "cursor-v2" + assert len(entries) == 2 + assert entries[0]["name"] == "new.txt" + assert entries[1][".tag"] == "deleted" + + +# ---------- C3: get_changes handles pagination ---------- + + +async def test_get_changes_handles_pagination(monkeypatch): + client = _make_client() + + page1 = MagicMock() + page1.status_code = 200 + page1.json.return_value = { + "entries": [{".tag": "file", "name": "a.txt", "id": "id:a"}], + "cursor": "cursor-page2", + "has_more": True, + } + page2 = MagicMock() + page2.status_code = 200 + page2.json.return_value = { + "entries": [{".tag": "file", "name": "b.txt", "id": "id:b"}], + "cursor": "cursor-final", + "has_more": False, + } + + request_mock = AsyncMock(side_effect=[page1, page2]) + monkeypatch.setattr(client, "_request", request_mock) + + entries, new_cursor, error = await client.get_changes("cursor-v1") + + assert error is None + assert new_cursor == "cursor-final" + assert len(entries) == 2 + assert {e["name"] for e in entries} == {"a.txt", "b.txt"} + assert request_mock.call_count == 2 + + +# ---------- C4: get_changes raises on 401 ---------- + + +async def test_get_changes_returns_error_on_401(monkeypatch): + client = _make_client() + + fake_resp = MagicMock() + fake_resp.status_code = 401 + fake_resp.text = "Unauthorized" + + monkeypatch.setattr(client, "_request", AsyncMock(return_value=fake_resp)) + + entries, new_cursor, error = await client.get_changes("old-cursor") + + assert error is not None + assert "401" in error + assert entries == [] + assert new_cursor is None diff --git a/surfsense_backend/tests/unit/connectors/test_dropbox_file_types.py b/surfsense_backend/tests/unit/connectors/test_dropbox_file_types.py new file mode 100644 index 000000000..b4715e083 --- /dev/null +++ b/surfsense_backend/tests/unit/connectors/test_dropbox_file_types.py @@ -0,0 +1,173 @@ +"""Tests for Dropbox file type filtering (should_skip_file).""" 
+ +import pytest + +from app.connectors.dropbox.file_types import should_skip_file + +pytestmark = pytest.mark.unit + + +# --------------------------------------------------------------------------- +# Structural skips (independent of ETL service) +# --------------------------------------------------------------------------- + + +def test_folder_item_is_skipped(): + item = {".tag": "folder", "name": "My Folder"} + skip, ext = should_skip_file(item) + assert skip is True + assert ext is None + + +def test_paper_file_is_not_skipped(): + item = {".tag": "file", "name": "notes.paper", "is_downloadable": False} + skip, ext = should_skip_file(item) + assert skip is False + assert ext is None + + +def test_non_downloadable_item_is_skipped(): + item = {".tag": "file", "name": "locked.gdoc", "is_downloadable": False} + skip, ext = should_skip_file(item) + assert skip is True + assert ext is None + + +# --------------------------------------------------------------------------- +# Extension-based skips (require ETL service context) +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "filename", + [ + "archive.zip", + "backup.tar", + "data.gz", + "stuff.rar", + "pack.7z", + "program.exe", + "lib.dll", + "module.so", + "image.dmg", + "disk.iso", + "movie.mov", + "clip.avi", + "video.mkv", + "film.wmv", + "stream.flv", + "favicon.ico", + "raw.cr2", + "photo.nef", + "image.arw", + "pic.dng", + "design.psd", + "vector.ai", + "mockup.sketch", + "proto.fig", + "font.ttf", + "font.otf", + "font.woff", + "font.woff2", + "model.stl", + "scene.fbx", + "mesh.blend", + "local.db", + "data.sqlite", + "access.mdb", + ], +) +def test_non_parseable_extensions_are_skipped(filename, mocker): + mocker.patch("app.config.config.ETL_SERVICE", "DOCLING") + item = {".tag": "file", "name": filename} + skip, ext = should_skip_file(item) + assert skip is True, f"{filename} should be skipped" + assert ext is not None + + 
+@pytest.mark.parametrize( + "filename", + [ + "report.pdf", + "document.docx", + "sheet.xlsx", + "slides.pptx", + "readme.txt", + "data.csv", + "page.html", + "notes.md", + "config.json", + "feed.xml", + ], +) +def test_parseable_documents_are_not_skipped(filename, mocker): + """Files in plaintext/direct_convert/universal document sets are never skipped.""" + for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"): + mocker.patch("app.config.config.ETL_SERVICE", service) + item = {".tag": "file", "name": filename} + skip, ext = should_skip_file(item) + assert skip is False, f"{filename} should NOT be skipped with {service}" + assert ext is None + + +@pytest.mark.parametrize( + "filename", + ["photo.jpg", "image.jpeg", "screenshot.png", "scan.bmp", "page.tiff", "doc.tif"], +) +def test_universal_images_are_not_skipped(filename, mocker): + """Images supported by all parsers are never skipped.""" + for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"): + mocker.patch("app.config.config.ETL_SERVICE", service) + item = {".tag": "file", "name": filename} + skip, ext = should_skip_file(item) + assert skip is False, f"{filename} should NOT be skipped with {service}" + assert ext is None + + +@pytest.mark.parametrize( + "filename,service,expected_skip", + [ + ("old.doc", "DOCLING", True), + ("old.doc", "LLAMACLOUD", False), + ("old.doc", "UNSTRUCTURED", False), + ("legacy.xls", "DOCLING", True), + ("legacy.xls", "LLAMACLOUD", False), + ("legacy.xls", "UNSTRUCTURED", False), + ("deck.ppt", "DOCLING", True), + ("deck.ppt", "LLAMACLOUD", False), + ("deck.ppt", "UNSTRUCTURED", False), + ("icon.svg", "DOCLING", True), + ("icon.svg", "LLAMACLOUD", False), + ("anim.gif", "DOCLING", True), + ("anim.gif", "LLAMACLOUD", False), + ("photo.webp", "DOCLING", False), + ("photo.webp", "LLAMACLOUD", False), + ("photo.webp", "UNSTRUCTURED", True), + ("live.heic", "DOCLING", True), + ("live.heic", "UNSTRUCTURED", False), + ("macro.docm", "DOCLING", True), + ("macro.docm", "LLAMACLOUD", 
False), + ("mail.eml", "DOCLING", True), + ("mail.eml", "UNSTRUCTURED", False), + ], +) +def test_parser_specific_extensions(filename, service, expected_skip, mocker): + mocker.patch("app.config.config.ETL_SERVICE", service) + item = {".tag": "file", "name": filename} + skip, ext = should_skip_file(item) + assert skip is expected_skip, ( + f"{filename} with {service}: expected skip={expected_skip}" + ) + if expected_skip: + assert ext is not None + else: + assert ext is None + + +def test_returns_unsupported_extension(mocker): + """When a file is skipped due to unsupported extension, the ext string is returned.""" + mocker.patch("app.config.config.ETL_SERVICE", "DOCLING") + item = {".tag": "file", "name": "old.doc"} + skip, ext = should_skip_file(item) + assert skip is True + assert ext == ".doc" diff --git a/surfsense_backend/tests/unit/connectors/test_dropbox_reauth.py b/surfsense_backend/tests/unit/connectors/test_dropbox_reauth.py new file mode 100644 index 000000000..85281354c --- /dev/null +++ b/surfsense_backend/tests/unit/connectors/test_dropbox_reauth.py @@ -0,0 +1,43 @@ +"""Test that Dropbox re-auth preserves folder_cursors in connector config.""" + +import pytest + +pytestmark = pytest.mark.unit + + +def test_reauth_preserves_folder_cursors(): + """G1: re-authentication preserves folder_cursors alongside cursor.""" + old_config = { + "access_token": "old-token-enc", + "refresh_token": "old-refresh-enc", + "cursor": "old-cursor-abc", + "folder_cursors": {"/docs": "cursor-docs-123", "/photos": "cursor-photos-456"}, + "_token_encrypted": True, + "auth_expired": True, + } + + new_connector_config = { + "access_token": "new-token-enc", + "refresh_token": "new-refresh-enc", + "token_type": "bearer", + "expires_in": 14400, + "expires_at": "2026-04-06T16:00:00+00:00", + "_token_encrypted": True, + } + + existing_cursor = old_config.get("cursor") + existing_folder_cursors = old_config.get("folder_cursors") + merged_config = { + **new_connector_config, + "cursor": 
existing_cursor, + "folder_cursors": existing_folder_cursors, + "auth_expired": False, + } + + assert merged_config["access_token"] == "new-token-enc" + assert merged_config["cursor"] == "old-cursor-abc" + assert merged_config["folder_cursors"] == { + "/docs": "cursor-docs-123", + "/photos": "cursor-photos-456", + } + assert merged_config["auth_expired"] is False diff --git a/surfsense_backend/tests/unit/connectors/test_google_drive_file_types.py b/surfsense_backend/tests/unit/connectors/test_google_drive_file_types.py new file mode 100644 index 000000000..ab602468d --- /dev/null +++ b/surfsense_backend/tests/unit/connectors/test_google_drive_file_types.py @@ -0,0 +1,80 @@ +"""Tests for Google Drive file type filtering.""" + +import pytest + +from app.connectors.google_drive.file_types import should_skip_by_extension + +pytestmark = pytest.mark.unit + + +@pytest.mark.parametrize( + "filename", + [ + "malware.exe", + "archive.zip", + "video.mov", + "font.woff2", + "model.blend", + ], +) +def test_unsupported_extensions_are_skipped_regardless_of_service(filename, mocker): + """Truly unsupported files are skipped no matter which ETL service is configured.""" + for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"): + mocker.patch("app.config.config.ETL_SERVICE", service) + skip, _ext = should_skip_by_extension(filename) + assert skip is True + + +@pytest.mark.parametrize( + "filename", + [ + "report.pdf", + "doc.docx", + "sheet.xlsx", + "slides.pptx", + "readme.txt", + "data.csv", + "photo.png", + "notes.md", + ], +) +def test_universal_extensions_are_not_skipped(filename, mocker): + """Files supported by all parsers (or handled by plaintext/direct_convert) are never skipped.""" + for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"): + mocker.patch("app.config.config.ETL_SERVICE", service) + skip, ext = should_skip_by_extension(filename) + assert skip is False, f"{filename} should NOT be skipped with {service}" + assert ext is None + + +@pytest.mark.parametrize( 
+ "filename,service,expected_skip", + [ + ("macro.docm", "DOCLING", True), + ("macro.docm", "LLAMACLOUD", False), + ("mail.eml", "DOCLING", True), + ("mail.eml", "UNSTRUCTURED", False), + ("photo.gif", "DOCLING", True), + ("photo.gif", "LLAMACLOUD", False), + ("photo.heic", "UNSTRUCTURED", False), + ("photo.heic", "DOCLING", True), + ], +) +def test_parser_specific_extensions(filename, service, expected_skip, mocker): + mocker.patch("app.config.config.ETL_SERVICE", service) + skip, ext = should_skip_by_extension(filename) + assert skip is expected_skip, ( + f"{filename} with {service}: expected skip={expected_skip}" + ) + if expected_skip: + assert ext is not None, "unsupported extension should be returned" + else: + assert ext is None + + +def test_returns_unsupported_extension(mocker): + """When a file is skipped, the unsupported extension string is returned.""" + mocker.patch("app.config.config.ETL_SERVICE", "DOCLING") + skip, ext = should_skip_by_extension("macro.docm") + assert skip is True + assert ext == ".docm" diff --git a/surfsense_backend/tests/unit/connectors/test_onedrive_file_types.py b/surfsense_backend/tests/unit/connectors/test_onedrive_file_types.py new file mode 100644 index 000000000..1d9124c47 --- /dev/null +++ b/surfsense_backend/tests/unit/connectors/test_onedrive_file_types.py @@ -0,0 +1,118 @@ +"""Tests for OneDrive file type filtering.""" + +import pytest + +from app.connectors.onedrive.file_types import should_skip_file + +pytestmark = pytest.mark.unit + + +# --------------------------------------------------------------------------- +# Structural skips (independent of ETL service) +# --------------------------------------------------------------------------- + + +def test_folder_is_skipped(): + item = {"folder": {}, "name": "My Folder"} + skip, ext = should_skip_file(item) + assert skip is True + assert ext is None + + +def test_remote_item_is_skipped(): + item = {"remoteItem": {}, "name": "shared.docx"} + skip, ext = 
should_skip_file(item) + assert skip is True + assert ext is None + + +def test_package_is_skipped(): + item = {"package": {}, "name": "notebook"} + skip, ext = should_skip_file(item) + assert skip is True + assert ext is None + + +def test_onenote_is_skipped(): + item = {"name": "notes", "file": {"mimeType": "application/msonenote"}} + skip, ext = should_skip_file(item) + assert skip is True + assert ext is None + + +# --------------------------------------------------------------------------- +# Extension-based skips (require ETL service context) +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "filename", + [ + "malware.exe", + "archive.zip", + "video.mov", + "font.woff2", + "model.blend", + ], +) +def test_unsupported_extensions_are_skipped(filename, mocker): + mocker.patch("app.config.config.ETL_SERVICE", "DOCLING") + item = {"name": filename, "file": {"mimeType": "application/octet-stream"}} + skip, ext = should_skip_file(item) + assert skip is True, f"{filename} should be skipped" + assert ext is not None + + +@pytest.mark.parametrize( + "filename", + [ + "report.pdf", + "doc.docx", + "sheet.xlsx", + "slides.pptx", + "readme.txt", + "data.csv", + "photo.png", + "notes.md", + ], +) +def test_universal_files_are_not_skipped(filename, mocker): + for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"): + mocker.patch("app.config.config.ETL_SERVICE", service) + item = {"name": filename, "file": {"mimeType": "application/octet-stream"}} + skip, ext = should_skip_file(item) + assert skip is False, f"{filename} should NOT be skipped with {service}" + assert ext is None + + +@pytest.mark.parametrize( + "filename,service,expected_skip", + [ + ("macro.docm", "DOCLING", True), + ("macro.docm", "LLAMACLOUD", False), + ("mail.eml", "DOCLING", True), + ("mail.eml", "UNSTRUCTURED", False), + ("photo.heic", "UNSTRUCTURED", False), + ("photo.heic", "DOCLING", True), + ], +) +def 
test_parser_specific_extensions(filename, service, expected_skip, mocker): + mocker.patch("app.config.config.ETL_SERVICE", service) + item = {"name": filename, "file": {"mimeType": "application/octet-stream"}} + skip, ext = should_skip_file(item) + assert skip is expected_skip, ( + f"{filename} with {service}: expected skip={expected_skip}" + ) + if expected_skip: + assert ext is not None + else: + assert ext is None + + +def test_returns_unsupported_extension(mocker): + """When a file is skipped due to unsupported extension, the ext string is returned.""" + mocker.patch("app.config.config.ETL_SERVICE", "DOCLING") + item = {"name": "mail.eml", "file": {"mimeType": "application/octet-stream"}} + skip, ext = should_skip_file(item) + assert skip is True + assert ext == ".eml" diff --git a/surfsense_backend/tests/unit/etl_pipeline/conftest.py b/surfsense_backend/tests/unit/etl_pipeline/conftest.py new file mode 100644 index 000000000..082ab9771 --- /dev/null +++ b/surfsense_backend/tests/unit/etl_pipeline/conftest.py @@ -0,0 +1,27 @@ +"""Pre-register the etl_pipeline package to avoid circular imports during unit tests.""" + +import sys +import types +from pathlib import Path + +_BACKEND = Path(__file__).resolve().parents[3] + + +def _stub_package(dotted: str, fs_dir: Path) -> None: + if dotted not in sys.modules: + mod = types.ModuleType(dotted) + mod.__path__ = [str(fs_dir)] + mod.__package__ = dotted + sys.modules[dotted] = mod + + parts = dotted.split(".") + if len(parts) > 1: + parent_dotted = ".".join(parts[:-1]) + parent = sys.modules.get(parent_dotted) + if parent is not None: + setattr(parent, parts[-1], sys.modules[dotted]) + + +_stub_package("app", _BACKEND / "app") +_stub_package("app.etl_pipeline", _BACKEND / "app" / "etl_pipeline") +_stub_package("app.etl_pipeline.parsers", _BACKEND / "app" / "etl_pipeline" / "parsers") diff --git a/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py 
b/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py new file mode 100644 index 000000000..769b1dc53 --- /dev/null +++ b/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py @@ -0,0 +1,461 @@ +"""Tests for EtlPipelineService -- the unified ETL pipeline public interface.""" + +import pytest + +from app.etl_pipeline.etl_document import EtlRequest +from app.etl_pipeline.etl_pipeline_service import EtlPipelineService + +pytestmark = pytest.mark.unit + + +async def test_extract_txt_file_returns_markdown(tmp_path): + """Tracer bullet: a .txt file is read and returned as-is in an EtlResult.""" + txt_file = tmp_path / "hello.txt" + txt_file.write_text("Hello, world!", encoding="utf-8") + + service = EtlPipelineService() + result = await service.extract( + EtlRequest(file_path=str(txt_file), filename="hello.txt") + ) + + assert result.markdown_content == "Hello, world!" + assert result.etl_service == "PLAINTEXT" + assert result.content_type == "plaintext" + + +async def test_extract_md_file(tmp_path): + """A .md file is classified as PLAINTEXT and extracted.""" + md_file = tmp_path / "readme.md" + md_file.write_text("# Title\n\nBody text.", encoding="utf-8") + + result = await EtlPipelineService().extract( + EtlRequest(file_path=str(md_file), filename="readme.md") + ) + + assert result.markdown_content == "# Title\n\nBody text." + assert result.etl_service == "PLAINTEXT" + assert result.content_type == "plaintext" + + +async def test_extract_markdown_file(tmp_path): + """A .markdown file is classified as PLAINTEXT and extracted.""" + md_file = tmp_path / "notes.markdown" + md_file.write_text("Some notes.", encoding="utf-8") + + result = await EtlPipelineService().extract( + EtlRequest(file_path=str(md_file), filename="notes.markdown") + ) + + assert result.markdown_content == "Some notes." 
+ assert result.etl_service == "PLAINTEXT" + + +async def test_extract_python_file(tmp_path): + """A .py source code file is classified as PLAINTEXT.""" + py_file = tmp_path / "script.py" + py_file.write_text("print('hello')", encoding="utf-8") + + result = await EtlPipelineService().extract( + EtlRequest(file_path=str(py_file), filename="script.py") + ) + + assert result.markdown_content == "print('hello')" + assert result.etl_service == "PLAINTEXT" + assert result.content_type == "plaintext" + + +async def test_extract_js_file(tmp_path): + """A .js source code file is classified as PLAINTEXT.""" + js_file = tmp_path / "app.js" + js_file.write_text("console.log('hi');", encoding="utf-8") + + result = await EtlPipelineService().extract( + EtlRequest(file_path=str(js_file), filename="app.js") + ) + + assert result.markdown_content == "console.log('hi');" + assert result.etl_service == "PLAINTEXT" + + +async def test_extract_csv_returns_markdown_table(tmp_path): + """A .csv file is converted to a markdown table.""" + csv_file = tmp_path / "data.csv" + csv_file.write_text("name,age\nAlice,30\nBob,25\n", encoding="utf-8") + + result = await EtlPipelineService().extract( + EtlRequest(file_path=str(csv_file), filename="data.csv") + ) + + assert "| name | age |" in result.markdown_content + assert "| Alice | 30 |" in result.markdown_content + assert result.etl_service == "DIRECT_CONVERT" + assert result.content_type == "direct_convert" + + +async def test_extract_tsv_returns_markdown_table(tmp_path): + """A .tsv file is converted to a markdown table.""" + tsv_file = tmp_path / "data.tsv" + tsv_file.write_text("x\ty\n1\t2\n", encoding="utf-8") + + result = await EtlPipelineService().extract( + EtlRequest(file_path=str(tsv_file), filename="data.tsv") + ) + + assert "| x | y |" in result.markdown_content + assert result.etl_service == "DIRECT_CONVERT" + + +async def test_extract_html_returns_markdown(tmp_path): + """An .html file is converted to markdown.""" + html_file = 
tmp_path / "page.html" + html_file.write_text("

Title

Body

", encoding="utf-8") + + result = await EtlPipelineService().extract( + EtlRequest(file_path=str(html_file), filename="page.html") + ) + + assert "Title" in result.markdown_content + assert "Body" in result.markdown_content + assert result.etl_service == "DIRECT_CONVERT" + + +async def test_extract_mp3_returns_transcription(tmp_path, mocker): + """An .mp3 audio file is transcribed via litellm.atranscription.""" + audio_file = tmp_path / "recording.mp3" + audio_file.write_bytes(b"\x00" * 100) + + mocker.patch("app.config.config.STT_SERVICE", "openai/whisper-1") + mocker.patch("app.config.config.STT_SERVICE_API_KEY", "fake-key") + mocker.patch("app.config.config.STT_SERVICE_API_BASE", None) + + mock_transcription = mocker.patch( + "app.etl_pipeline.parsers.audio.atranscription", + return_value={"text": "Hello from audio"}, + ) + + result = await EtlPipelineService().extract( + EtlRequest(file_path=str(audio_file), filename="recording.mp3") + ) + + assert "Hello from audio" in result.markdown_content + assert result.etl_service == "AUDIO" + assert result.content_type == "audio" + mock_transcription.assert_called_once() + + +# --------------------------------------------------------------------------- +# Slice 7 - DOCLING document parsing +# --------------------------------------------------------------------------- + + +async def test_extract_pdf_with_docling(tmp_path, mocker): + """A .pdf file with ETL_SERVICE=DOCLING returns parsed markdown.""" + pdf_file = tmp_path / "report.pdf" + pdf_file.write_bytes(b"%PDF-1.4 fake") + + mocker.patch("app.config.config.ETL_SERVICE", "DOCLING") + + fake_docling = mocker.AsyncMock() + fake_docling.process_document.return_value = {"content": "# Parsed PDF"} + mocker.patch( + "app.services.docling_service.create_docling_service", + return_value=fake_docling, + ) + + result = await EtlPipelineService().extract( + EtlRequest(file_path=str(pdf_file), filename="report.pdf") + ) + + assert result.markdown_content == "# Parsed PDF" + 
assert result.etl_service == "DOCLING" + assert result.content_type == "document" + + +# --------------------------------------------------------------------------- +# Slice 8 - UNSTRUCTURED document parsing +# --------------------------------------------------------------------------- + + +async def test_extract_pdf_with_unstructured(tmp_path, mocker): + """A .pdf file with ETL_SERVICE=UNSTRUCTURED returns parsed markdown.""" + pdf_file = tmp_path / "report.pdf" + pdf_file.write_bytes(b"%PDF-1.4 fake") + + mocker.patch("app.config.config.ETL_SERVICE", "UNSTRUCTURED") + + class FakeDoc: + def __init__(self, text): + self.page_content = text + + fake_loader_instance = mocker.AsyncMock() + fake_loader_instance.aload.return_value = [ + FakeDoc("Page 1 content"), + FakeDoc("Page 2 content"), + ] + mocker.patch( + "langchain_unstructured.UnstructuredLoader", + return_value=fake_loader_instance, + ) + + result = await EtlPipelineService().extract( + EtlRequest(file_path=str(pdf_file), filename="report.pdf") + ) + + assert "Page 1 content" in result.markdown_content + assert "Page 2 content" in result.markdown_content + assert result.etl_service == "UNSTRUCTURED" + assert result.content_type == "document" + + +# --------------------------------------------------------------------------- +# Slice 9 - LLAMACLOUD document parsing +# --------------------------------------------------------------------------- + + +async def test_extract_pdf_with_llamacloud(tmp_path, mocker): + """A .pdf file with ETL_SERVICE=LLAMACLOUD returns parsed markdown.""" + pdf_file = tmp_path / "report.pdf" + pdf_file.write_bytes(b"%PDF-1.4 fake content " * 10) + + mocker.patch("app.config.config.ETL_SERVICE", "LLAMACLOUD") + mocker.patch("app.config.config.LLAMA_CLOUD_API_KEY", "fake-key", create=True) + + class FakeDoc: + text = "# LlamaCloud parsed" + + class FakeJobResult: + pages = [] + + def get_markdown_documents(self, split_by_page=True): + return [FakeDoc()] + + fake_parser = 
mocker.AsyncMock() + fake_parser.aparse.return_value = FakeJobResult() + mocker.patch( + "llama_cloud_services.LlamaParse", + return_value=fake_parser, + ) + mocker.patch( + "llama_cloud_services.parse.utils.ResultType", + mocker.MagicMock(MD="md"), + ) + + result = await EtlPipelineService().extract( + EtlRequest(file_path=str(pdf_file), filename="report.pdf", estimated_pages=5) + ) + + assert result.markdown_content == "# LlamaCloud parsed" + assert result.etl_service == "LLAMACLOUD" + assert result.content_type == "document" + + +# --------------------------------------------------------------------------- +# Slice 10 - unknown extension falls through to document ETL +# --------------------------------------------------------------------------- + + +async def test_unknown_extension_uses_document_etl(tmp_path, mocker): + """An allowlisted document extension (.docx) routes to the document ETL path.""" + docx_file = tmp_path / "doc.docx" + docx_file.write_bytes(b"PK fake docx") + + mocker.patch("app.config.config.ETL_SERVICE", "DOCLING") + + fake_docling = mocker.AsyncMock() + fake_docling.process_document.return_value = {"content": "Docx content"} + mocker.patch( + "app.services.docling_service.create_docling_service", + return_value=fake_docling, + ) + + result = await EtlPipelineService().extract( + EtlRequest(file_path=str(docx_file), filename="doc.docx") + ) + + assert result.markdown_content == "Docx content" + assert result.content_type == "document" + + +# --------------------------------------------------------------------------- +# Slice 11 - EtlRequest validation +# --------------------------------------------------------------------------- + + +def test_etl_request_requires_filename(): + """EtlRequest rejects missing filename.""" + with pytest.raises(ValueError, match="filename must not be empty"): + EtlRequest(file_path="/tmp/some.txt", filename="") + + +# --------------------------------------------------------------------------- +# Slice 12 - unknown 
ETL_SERVICE raises EtlServiceUnavailableError +# --------------------------------------------------------------------------- + + +async def test_unknown_etl_service_raises(tmp_path, mocker): + """An unknown ETL_SERVICE raises EtlServiceUnavailableError.""" + from app.etl_pipeline.exceptions import EtlServiceUnavailableError + + pdf_file = tmp_path / "report.pdf" + pdf_file.write_bytes(b"%PDF fake") + + mocker.patch("app.config.config.ETL_SERVICE", "NONEXISTENT") + + with pytest.raises(EtlServiceUnavailableError, match="Unknown ETL_SERVICE"): + await EtlPipelineService().extract( + EtlRequest(file_path=str(pdf_file), filename="report.pdf") + ) + + +# --------------------------------------------------------------------------- +# Slice 13 - unsupported file types are rejected before reaching any parser +# --------------------------------------------------------------------------- + + +def test_unknown_extension_classified_as_unsupported(): + """An unknown extension defaults to UNSUPPORTED (allowlist behaviour).""" + from app.etl_pipeline.file_classifier import FileCategory, classify_file + + assert classify_file("random.xyz") == FileCategory.UNSUPPORTED + + +@pytest.mark.parametrize( + "filename", + [ + "malware.exe", + "archive.zip", + "video.mov", + "font.woff2", + "model.blend", + "data.parquet", + "package.deb", + "firmware.bin", + ], +) +def test_unsupported_extensions_classified_correctly(filename): + """Extensions not in any allowlist are classified as UNSUPPORTED.""" + from app.etl_pipeline.file_classifier import FileCategory, classify_file + + assert classify_file(filename) == FileCategory.UNSUPPORTED + + +@pytest.mark.parametrize( + "filename,expected", + [ + ("report.pdf", "document"), + ("doc.docx", "document"), + ("slides.pptx", "document"), + ("sheet.xlsx", "document"), + ("photo.png", "document"), + ("photo.jpg", "document"), + ("book.epub", "document"), + ("letter.odt", "document"), + ("readme.md", "plaintext"), + ("data.csv", "direct_convert"), + ], 
+) +def test_parseable_extensions_classified_correctly(filename, expected): + """Parseable files are classified into their correct category.""" + from app.etl_pipeline.file_classifier import FileCategory, classify_file + + result = classify_file(filename) + assert result != FileCategory.UNSUPPORTED + assert result.value == expected + + +async def test_extract_unsupported_file_raises_error(tmp_path): + """EtlPipelineService.extract() raises EtlUnsupportedFileError for .exe files.""" + from app.etl_pipeline.exceptions import EtlUnsupportedFileError + + exe_file = tmp_path / "program.exe" + exe_file.write_bytes(b"\x00" * 10) + + with pytest.raises(EtlUnsupportedFileError, match="not supported"): + await EtlPipelineService().extract( + EtlRequest(file_path=str(exe_file), filename="program.exe") + ) + + +async def test_extract_zip_raises_unsupported_error(tmp_path): + """EtlPipelineService.extract() raises EtlUnsupportedFileError for .zip archives.""" + from app.etl_pipeline.exceptions import EtlUnsupportedFileError + + zip_file = tmp_path / "archive.zip" + zip_file.write_bytes(b"PK\x03\x04") + + with pytest.raises(EtlUnsupportedFileError, match="not supported"): + await EtlPipelineService().extract( + EtlRequest(file_path=str(zip_file), filename="archive.zip") + ) + + +# --------------------------------------------------------------------------- +# Slice 14 - should_skip_for_service (per-parser document filtering) +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "filename,etl_service,expected_skip", + [ + ("file.eml", "DOCLING", True), + ("file.eml", "UNSTRUCTURED", False), + ("file.docm", "LLAMACLOUD", False), + ("file.docm", "DOCLING", True), + ("file.txt", "DOCLING", False), + ("file.csv", "LLAMACLOUD", False), + ("file.mp3", "UNSTRUCTURED", False), + ("file.exe", "LLAMACLOUD", True), + ("file.pdf", "DOCLING", False), + ("file.webp", "DOCLING", False), + ("file.webp", "UNSTRUCTURED", True), + 
("file.gif", "LLAMACLOUD", False), + ("file.gif", "DOCLING", True), + ("file.heic", "UNSTRUCTURED", False), + ("file.heic", "DOCLING", True), + ("file.svg", "LLAMACLOUD", False), + ("file.svg", "DOCLING", True), + ("file.p7s", "UNSTRUCTURED", False), + ("file.p7s", "LLAMACLOUD", True), + ], +) +def test_should_skip_for_service(filename, etl_service, expected_skip): + from app.etl_pipeline.file_classifier import should_skip_for_service + + assert should_skip_for_service(filename, etl_service) is expected_skip, ( + f"{filename} with {etl_service}: expected skip={expected_skip}" + ) + + +# --------------------------------------------------------------------------- +# Slice 14b - ETL pipeline rejects per-parser incompatible documents +# --------------------------------------------------------------------------- + + +async def test_extract_docm_with_docling_raises_unsupported(tmp_path, mocker): + """Docling cannot parse .docm -- pipeline should reject before dispatching.""" + from app.etl_pipeline.exceptions import EtlUnsupportedFileError + + mocker.patch("app.config.config.ETL_SERVICE", "DOCLING") + + docm_file = tmp_path / "macro.docm" + docm_file.write_bytes(b"\x00" * 10) + + with pytest.raises(EtlUnsupportedFileError, match="not supported by DOCLING"): + await EtlPipelineService().extract( + EtlRequest(file_path=str(docm_file), filename="macro.docm") + ) + + +async def test_extract_eml_with_docling_raises_unsupported(tmp_path, mocker): + """Docling cannot parse .eml -- pipeline should reject before dispatching.""" + from app.etl_pipeline.exceptions import EtlUnsupportedFileError + + mocker.patch("app.config.config.ETL_SERVICE", "DOCLING") + + eml_file = tmp_path / "mail.eml" + eml_file.write_bytes(b"From: test@example.com") + + with pytest.raises(EtlUnsupportedFileError, match="not supported by DOCLING"): + await EtlPipelineService().extract( + EtlRequest(file_path=str(eml_file), filename="mail.eml") + ) diff --git a/surfsense_backend/tests/unit/services/__init__.py 
b/surfsense_backend/tests/unit/services/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/surfsense_backend/tests/unit/services/test_docling_image_support.py b/surfsense_backend/tests/unit/services/test_docling_image_support.py new file mode 100644 index 000000000..11ffc0ed1 --- /dev/null +++ b/surfsense_backend/tests/unit/services/test_docling_image_support.py @@ -0,0 +1,70 @@ +"""Test that DoclingService does NOT restrict allowed_formats, letting Docling +accept all its supported formats (PDF, DOCX, PPTX, XLSX, IMAGE, etc.).""" + +from enum import Enum +from unittest.mock import MagicMock, patch + +import pytest + +pytestmark = pytest.mark.unit + + +class _FakeInputFormat(Enum): + PDF = "pdf" + IMAGE = "image" + DOCX = "docx" + PPTX = "pptx" + XLSX = "xlsx" + + +def test_docling_service_does_not_restrict_allowed_formats(): + """DoclingService should NOT pass allowed_formats to DocumentConverter, + so Docling defaults to accepting every InputFormat it supports.""" + + mock_converter_cls = MagicMock() + mock_backend = MagicMock() + + fake_pipeline_options_cls = MagicMock() + fake_pipeline_options = MagicMock() + fake_pipeline_options_cls.return_value = fake_pipeline_options + + fake_pdf_format_option_cls = MagicMock() + + with patch.dict( + "sys.modules", + { + "docling": MagicMock(), + "docling.backend": MagicMock(), + "docling.backend.pypdfium2_backend": MagicMock( + PyPdfiumDocumentBackend=mock_backend + ), + "docling.datamodel": MagicMock(), + "docling.datamodel.base_models": MagicMock(InputFormat=_FakeInputFormat), + "docling.datamodel.pipeline_options": MagicMock( + PdfPipelineOptions=fake_pipeline_options_cls + ), + "docling.document_converter": MagicMock( + DocumentConverter=mock_converter_cls, + PdfFormatOption=fake_pdf_format_option_cls, + ), + }, + ): + from importlib import reload + + import app.services.docling_service as mod + + reload(mod) + + mod.DoclingService() + + call_kwargs = mock_converter_cls.call_args + assert 
call_kwargs is not None, "DocumentConverter was never called" + + _, kwargs = call_kwargs + assert "allowed_formats" not in kwargs, ( + f"allowed_formats should not be passed — let Docling accept all formats. " + f"Got: {kwargs.get('allowed_formats')}" + ) + assert _FakeInputFormat.PDF in kwargs.get("format_options", {}), ( + "format_options should still configure PDF pipeline options" + ) diff --git a/surfsense_backend/tests/unit/utils/__init__.py b/surfsense_backend/tests/unit/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/surfsense_backend/tests/unit/utils/test_file_extensions.py b/surfsense_backend/tests/unit/utils/test_file_extensions.py new file mode 100644 index 000000000..c33b39f05 --- /dev/null +++ b/surfsense_backend/tests/unit/utils/test_file_extensions.py @@ -0,0 +1,154 @@ +"""Tests for the DOCUMENT_EXTENSIONS allowlist module.""" + +import pytest + +pytestmark = pytest.mark.unit + + +def test_pdf_is_supported_document(): + from app.utils.file_extensions import is_supported_document_extension + + assert is_supported_document_extension("report.pdf") is True + + +def test_exe_is_not_supported_document(): + from app.utils.file_extensions import is_supported_document_extension + + assert is_supported_document_extension("malware.exe") is False + + +@pytest.mark.parametrize( + "filename", + [ + "report.pdf", + "doc.docx", + "old.doc", + "sheet.xlsx", + "legacy.xls", + "slides.pptx", + "deck.ppt", + "macro.docm", + "macro.xlsm", + "macro.pptm", + "photo.png", + "photo.jpg", + "photo.jpeg", + "scan.bmp", + "scan.tiff", + "scan.tif", + "photo.webp", + "anim.gif", + "iphone.heic", + "manual.rtf", + "book.epub", + "letter.odt", + "data.ods", + "presentation.odp", + "inbox.eml", + "outlook.msg", + "korean.hwpx", + "korean.hwp", + "template.dot", + "template.dotm", + "template.pot", + "template.potx", + "binary.xlsb", + "workspace.xlw", + "vector.svg", + "signature.p7s", + ], +) +def test_document_extensions_are_supported(filename): + 
from app.utils.file_extensions import is_supported_document_extension + + assert is_supported_document_extension(filename) is True, ( + f"{filename} should be supported" + ) + + +@pytest.mark.parametrize( + "filename", + [ + "malware.exe", + "archive.zip", + "video.mov", + "font.woff2", + "model.blend", + "random.xyz", + "data.parquet", + "package.deb", + ], +) +def test_non_document_extensions_are_not_supported(filename): + from app.utils.file_extensions import is_supported_document_extension + + assert is_supported_document_extension(filename) is False, ( + f"{filename} should NOT be supported" + ) + + +# --------------------------------------------------------------------------- +# Per-parser extension sets +# --------------------------------------------------------------------------- + + +def test_union_equals_all_three_sets(): + from app.utils.file_extensions import ( + DOCLING_DOCUMENT_EXTENSIONS, + DOCUMENT_EXTENSIONS, + LLAMAPARSE_DOCUMENT_EXTENSIONS, + UNSTRUCTURED_DOCUMENT_EXTENSIONS, + ) + + expected = ( + DOCLING_DOCUMENT_EXTENSIONS + | LLAMAPARSE_DOCUMENT_EXTENSIONS + | UNSTRUCTURED_DOCUMENT_EXTENSIONS + ) + assert expected == DOCUMENT_EXTENSIONS + + +def test_get_extensions_for_docling(): + from app.utils.file_extensions import get_document_extensions_for_service + + exts = get_document_extensions_for_service("DOCLING") + assert ".pdf" in exts + assert ".webp" in exts + assert ".docx" in exts + assert ".eml" not in exts + assert ".docm" not in exts + assert ".gif" not in exts + assert ".heic" not in exts + + +def test_get_extensions_for_llamacloud(): + from app.utils.file_extensions import get_document_extensions_for_service + + exts = get_document_extensions_for_service("LLAMACLOUD") + assert ".docm" in exts + assert ".gif" in exts + assert ".svg" in exts + assert ".hwp" in exts + assert ".eml" not in exts + assert ".heic" not in exts + + +def test_get_extensions_for_unstructured(): + from app.utils.file_extensions import 
get_document_extensions_for_service + + exts = get_document_extensions_for_service("UNSTRUCTURED") + assert ".eml" in exts + assert ".heic" in exts + assert ".p7s" in exts + assert ".docm" not in exts + assert ".gif" not in exts + assert ".svg" not in exts + + +def test_get_extensions_for_none_returns_union(): + from app.utils.file_extensions import ( + DOCUMENT_EXTENSIONS, + get_document_extensions_for_service, + ) + + assert get_document_extensions_for_service(None) == DOCUMENT_EXTENSIONS diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx index b85af13b7..a795b61c7 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx @@ -8,6 +8,7 @@ import { Button } from "@/components/ui/button"; import { Checkbox } from "@/components/ui/checkbox"; import { Input } from "@/components/ui/input"; import { Popover, PopoverContent, PopoverTrigger } from "@/components/ui/popover"; +import { ToggleGroup, ToggleGroupItem } from "@/components/ui/toggle-group"; import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip"; import type { DocumentTypeEnum } from "@/contracts/types/document.types"; import { getDocumentTypeIcon, getDocumentTypeLabel } from "./DocumentTypeIcon"; @@ -63,109 +64,113 @@ export function DocumentsFilters({ return (
- {/* Type Filter */} - - - - - -
- {/* Search input */} -
-
- - setTypeSearchQuery(e.target.value)} - className="h-6 pl-6 text-sm bg-transparent border-0 shadow-none" - /> -
-
+ {/* Filter + New Folder Toggle Group */} + + {onCreateFolder && ( + + + { + e.preventDefault(); + onCreateFolder(); + }} + > + + + + New folder + + )} -
- {filteredTypes.length === 0 ? ( -
- No types found -
- ) : ( - filteredTypes.map((value: DocumentTypeEnum, i) => ( -
onToggleType(value, !activeTypes.includes(value))} - onKeyDown={(e) => { - if (e.key === "Enter" || e.key === " ") { - e.preventDefault(); - onToggleType(value, !activeTypes.includes(value)); - } - }} - > - {/* Icon */} -
- {getDocumentTypeIcon(value, "h-4 w-4")} -
- {/* Text content */} -
- - {getDocumentTypeLabel(value)} - - - {typeCounts.get(value)} document - {(typeCounts.get(value) ?? 0) !== 1 ? "s" : ""} - -
- {/* Checkbox */} - onToggleType(value, !!checked)} - className="h-4 w-4 shrink-0 rounded border-muted-foreground/30 data-[state=checked]:bg-primary data-[state=checked]:border-primary" - /> -
- )) - )} -
- {activeTypes.length > 0 && ( -
- + + {activeTypes.length > 0 && ( + + {activeTypes.length} + + )} + + + + Filter by type + + +
+
+
+ + setTypeSearchQuery(e.target.value)} + className="h-6 pl-6 text-sm bg-transparent border-0 shadow-none" + /> +
- )} -
-
- + +
+ {filteredTypes.length === 0 ? ( +
+ No types found +
+ ) : ( + filteredTypes.map((value: DocumentTypeEnum, i) => ( +
onToggleType(value, !activeTypes.includes(value))} + onKeyDown={(e) => { + if (e.key === "Enter" || e.key === " ") { + e.preventDefault(); + onToggleType(value, !activeTypes.includes(value)); + } + }} + > +
+ {getDocumentTypeIcon(value, "h-4 w-4")} +
+
+ + {getDocumentTypeLabel(value)} + + + {typeCounts.get(value)} document + {(typeCounts.get(value) ?? 0) !== 1 ? "s" : ""} + +
+ onToggleType(value, !!checked)} + className="h-4 w-4 shrink-0 rounded border-muted-foreground/30 data-[state=checked]:bg-primary data-[state=checked]:border-primary" + /> +
+ )) + )} +
+
+ + +
{/* Search Input */}
@@ -197,23 +202,6 @@ export function DocumentsFilters({ )}
- {/* New Folder Button */} - {onCreateFolder && ( - - - - - New folder - - )} - {/* Upload Button */}
- +
diff --git a/surfsense_web/app/desktop/permissions/page.tsx b/surfsense_web/app/desktop/permissions/page.tsx index 178b6a533..b636fcd7c 100644 --- a/surfsense_web/app/desktop/permissions/page.tsx +++ b/surfsense_web/app/desktop/permissions/page.tsx @@ -1,7 +1,7 @@ "use client"; -import { useEffect, useState } from "react"; import { useRouter } from "next/navigation"; +import { useEffect, useState } from "react"; import { Logo } from "@/components/Logo"; import { Button } from "@/components/ui/button"; import { Spinner } from "@/components/ui/spinner"; @@ -18,7 +18,8 @@ const STEPS = [ { id: "screen-recording", title: "Screen Recording", - description: "Lets SurfSense capture your screen to understand context and provide smart writing suggestions.", + description: + "Lets SurfSense capture your screen to understand context and provide smart writing suggestions.", action: "requestScreenRecording", field: "screenRecording" as const, }, @@ -98,7 +99,8 @@ export default function DesktopPermissionsPage() { ); } - const allGranted = permissions.accessibility === "authorized" && permissions.screenRecording === "authorized"; + const allGranted = + permissions.accessibility === "authorized" && permissions.screenRecording === "authorized"; const handleRequest = async (action: string) => { if (action === "requestScreenRecording") { @@ -175,7 +177,8 @@ export default function DesktopPermissionsPage() {

)}

- If SurfSense doesn't appear in the list, click + and select it from Applications. + If SurfSense doesn't appear in the list, click + and + select it from Applications.

)} diff --git a/surfsense_web/app/desktop/suggestion/layout.tsx b/surfsense_web/app/desktop/suggestion/layout.tsx index 36b7e037b..fd8faf099 100644 --- a/surfsense_web/app/desktop/suggestion/layout.tsx +++ b/surfsense_web/app/desktop/suggestion/layout.tsx @@ -4,10 +4,6 @@ export const metadata = { title: "SurfSense Suggestion", }; -export default function SuggestionLayout({ - children, -}: { - children: React.ReactNode; -}) { +export default function SuggestionLayout({ children }: { children: React.ReactNode }) { return
{children}
; } diff --git a/surfsense_web/app/desktop/suggestion/page.tsx b/surfsense_web/app/desktop/suggestion/page.tsx index 42ce025a8..587bee9db 100644 --- a/surfsense_web/app/desktop/suggestion/page.tsx +++ b/surfsense_web/app/desktop/suggestion/page.tsx @@ -103,27 +103,23 @@ export default function SuggestionPage() { return; } - const backendUrl = - process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL || "http://localhost:8000"; + const backendUrl = process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL || "http://localhost:8000"; try { - const response = await fetch( - `${backendUrl}/api/v1/autocomplete/vision/stream`, - { - method: "POST", - headers: { - Authorization: `Bearer ${token}`, - "Content-Type": "application/json", - }, - body: JSON.stringify({ - screenshot, - search_space_id: parseInt(searchSpaceId, 10), - app_name: appName || "", - window_title: windowTitle || "", - }), - signal: controller.signal, + const response = await fetch(`${backendUrl}/api/v1/autocomplete/vision/stream`, { + method: "POST", + headers: { + Authorization: `Bearer ${token}`, + "Content-Type": "application/json", }, - ); + body: JSON.stringify({ + screenshot, + search_space_id: parseInt(searchSpaceId, 10), + app_name: appName || "", + window_title: windowTitle || "", + }), + signal: controller.signal, + }); if (!response.ok) { setError(friendlyError(response.status)); @@ -174,9 +170,7 @@ export default function SuggestionPage() { return [...prev, { id, title, status, items }]; }); } - } catch { - continue; - } + } catch {} } } } @@ -187,7 +181,7 @@ export default function SuggestionPage() { setIsLoading(false); } }, - [], + [] ); useEffect(() => { @@ -269,10 +263,18 @@ export default function SuggestionPage() {

{suggestion}

- -
diff --git a/surfsense_web/app/desktop/suggestion/suggestion.css b/surfsense_web/app/desktop/suggestion/suggestion.css index d2213fefd..57a7fd4f4 100644 --- a/surfsense_web/app/desktop/suggestion/suggestion.css +++ b/surfsense_web/app/desktop/suggestion/suggestion.css @@ -1,21 +1,21 @@ html:has(.suggestion-body), body:has(.suggestion-body) { - margin: 0 !important; - padding: 0 !important; - background: transparent !important; - overflow: hidden !important; - height: auto !important; - width: 100% !important; + margin: 0 !important; + padding: 0 !important; + background: transparent !important; + overflow: hidden !important; + height: auto !important; + width: 100% !important; } .suggestion-body { - margin: 0; - padding: 0; - background: transparent; - font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; - -webkit-font-smoothing: antialiased; - user-select: none; - -webkit-app-region: no-drag; + margin: 0; + padding: 0; + background: transparent; + font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; + -webkit-font-smoothing: antialiased; + user-select: none; + -webkit-app-region: no-drag; } .suggestion-tooltip { @@ -75,44 +75,46 @@ body:has(.suggestion-body) { } .suggestion-btn { - padding: 2px 8px; - border-radius: 3px; - border: 1px solid #3c3c3c; - font-family: inherit; - font-size: 10px; - font-weight: 500; - cursor: pointer; - line-height: 16px; - transition: background 0.15s, border-color 0.15s; + padding: 2px 8px; + border-radius: 3px; + border: 1px solid #3c3c3c; + font-family: inherit; + font-size: 10px; + font-weight: 500; + cursor: pointer; + line-height: 16px; + transition: + background 0.15s, + border-color 0.15s; } .suggestion-btn-accept { - background: #2563eb; - border-color: #3b82f6; - color: #fff; + background: #2563eb; + border-color: #3b82f6; + color: #fff; } .suggestion-btn-accept:hover { - background: #1d4ed8; + background: #1d4ed8; } .suggestion-btn-dismiss { - background: #2a2a2a; - 
color: #999; + background: #2a2a2a; + color: #999; } .suggestion-btn-dismiss:hover { - background: #333; - color: #ccc; + background: #333; + color: #ccc; } .suggestion-error { - border-color: #5c2626; + border-color: #5c2626; } .suggestion-error-text { - color: #f48771; - font-size: 12px; + color: #f48771; + font-size: 12px; } /* --- Agent activity indicator --- */ diff --git a/surfsense_web/components/assistant-ui/connector-popup.tsx b/surfsense_web/components/assistant-ui/connector-popup.tsx index fe7b29f28..c41e986d4 100644 --- a/surfsense_web/components/assistant-ui/connector-popup.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup.tsx @@ -216,7 +216,7 @@ export const ConnectorIndicator = forwardRef { if (pickerOpen) e.preventDefault(); }} - className="max-w-3xl w-[95vw] sm:w-full h-[75vh] sm:h-[85vh] flex flex-col p-0 gap-0 overflow-hidden border border-border ring-0 dark:ring-0 bg-muted dark:bg-muted text-foreground [&>button]:right-4 sm:[&>button]:right-12 [&>button]:top-6 sm:[&>button]:top-10 [&>button]:opacity-80 hover:[&>button]:opacity-100 [&>button_svg]:size-5 select-none" + className="max-w-3xl w-[95vw] sm:w-full h-[75vh] sm:h-[85vh] flex flex-col p-0 gap-0 overflow-hidden border border-border ring-0 dark:ring-0 bg-muted dark:bg-muted text-foreground [&>button]:right-4 sm:[&>button]:right-12 [&>button]:top-6 sm:[&>button]:top-10 [&>button]:opacity-80 [&>button]:hover:opacity-100 [&>button]:hover:bg-foreground/10 [&>button>svg]:size-5 select-none" > Manage Connectors {/* YouTube Crawler View - shown when adding YouTube videos */} diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-connect-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-connect-view.tsx index 596b98e93..8a0ef5ae1 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-connect-view.tsx +++ 
b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-connect-view.tsx @@ -144,18 +144,14 @@ export const ConnectorConnectView: FC = ({ type="button" onClick={handleFormSubmit} disabled={isSubmitting} - className="text-xs sm:text-sm min-w-[140px] disabled:opacity-50 disabled:cursor-not-allowed disabled:pointer-events-none" + className="relative text-xs sm:text-sm min-w-[140px] disabled:opacity-50 disabled:cursor-not-allowed disabled:pointer-events-none" > - {isSubmitting ? ( - <> - - Connecting - - ) : connectorType === "MCP_CONNECTOR" ? ( - "Connect" - ) : ( - `Connect ${getConnectorTypeDisplay(connectorType)}` - )} + + {connectorType === "MCP_CONNECTOR" + ? "Connect" + : `Connect ${getConnectorTypeDisplay(connectorType)}`} + + {isSubmitting && }
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx index 1e71b0a25..7308e1e26 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx @@ -369,16 +369,10 @@ export const ConnectorEditView: FC = ({ size="sm" onClick={handleDisconnectConfirm} disabled={isDisconnecting} - className="text-xs sm:text-sm flex-1 sm:flex-initial h-10 sm:h-auto py-2 sm:py-2" + className="relative text-xs sm:text-sm flex-1 sm:flex-initial h-10 sm:h-auto py-2 sm:py-2" > - {isDisconnecting ? ( - <> - - Disconnecting - - ) : ( - "Confirm Disconnect" - )} + Confirm Disconnect + {isDisconnecting && } )} diff --git a/surfsense_web/components/assistant-ui/connector-popup/tabs/active-connectors-tab.tsx b/surfsense_web/components/assistant-ui/connector-popup/tabs/active-connectors-tab.tsx index ccf1476dd..55fc99150 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/tabs/active-connectors-tab.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/tabs/active-connectors-tab.tsx @@ -1,6 +1,6 @@ "use client"; -import { Cable } from "lucide-react"; +import { Search, Unplug } from "lucide-react"; import type { FC } from "react"; import { getDocumentTypeLabel } from "@/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentTypeIcon"; import { Button } from "@/components/ui/button"; @@ -134,9 +134,17 @@ export const ActiveConnectorsTab: FC = ({ const hasActiveConnectors = filteredOAuthConnectorTypes.length > 0 || filteredNonOAuthConnectors.length > 0; + const hasFilteredResults = hasActiveConnectors || standaloneDocuments.length > 0; + return ( - {hasSources ? ( + {hasSources && !hasFilteredResults && searchQuery ? ( +
+ +

No connectors found

+

Try a different search term

+
+ ) : hasSources ? (
{/* Active Connectors Section */} {hasActiveConnectors && ( @@ -302,7 +310,7 @@ export const ActiveConnectorsTab: FC = ({ ) : (
- +

No active sources

diff --git a/surfsense_web/components/assistant-ui/connector-popup/tabs/all-connectors-tab.tsx b/surfsense_web/components/assistant-ui/connector-popup/tabs/all-connectors-tab.tsx index 4a97863fb..bbbf6dd57 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/tabs/all-connectors-tab.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/tabs/all-connectors-tab.tsx @@ -1,5 +1,6 @@ "use client"; +import { Search } from "lucide-react"; import type { FC } from "react"; import { EnumConnectorName } from "@/contracts/enums/connector"; import type { SearchSourceConnector } from "@/contracts/types/connector.types"; @@ -287,6 +288,18 @@ export const AllConnectorsTab: FC = ({ moreIntegrationsOther.length > 0 || moreIntegrationsCrawlers.length > 0; + const hasAnyResults = hasDocumentFileConnectors || hasMoreIntegrations; + + if (!hasAnyResults && searchQuery) { + return ( +

+ +

No connectors found

+

Try a different search term

+
+ ); + } + return (
{/* Document/Files Connectors */} diff --git a/surfsense_web/components/assistant-ui/connector-popup/views/connector-accounts-list-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/views/connector-accounts-list-view.tsx index 5dfc252c2..b4c049c5c 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/views/connector-accounts-list-view.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/views/connector-accounts-list-view.tsx @@ -173,9 +173,7 @@ export const ConnectorAccountsListView: FC = ({ )}
- - {isConnecting ? "Connecting" : buttonText} - + {buttonText}
diff --git a/surfsense_web/components/assistant-ui/connector-popup/views/youtube-crawler-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/views/youtube-crawler-view.tsx index 7ec85f4d3..8982b16a8 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/views/youtube-crawler-view.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/views/youtube-crawler-view.tsx @@ -335,16 +335,10 @@ export const YouTubeCrawlerView: FC = ({ searchSpaceId, diff --git a/surfsense_web/components/assistant-ui/document-upload-popup.tsx b/surfsense_web/components/assistant-ui/document-upload-popup.tsx index 59d73e651..0b38979a5 100644 --- a/surfsense_web/components/assistant-ui/document-upload-popup.tsx +++ b/surfsense_web/components/assistant-ui/document-upload-popup.tsx @@ -125,18 +125,16 @@ const DocumentUploadPopupContent: FC<{ onPointerDownOutside={(e) => e.preventDefault()} onInteractOutside={(e) => e.preventDefault()} onEscapeKeyDown={(e) => e.preventDefault()} - className="select-none max-w-2xl w-[95vw] sm:w-[640px] h-[min(440px,75dvh)] sm:h-[min(500px,80vh)] flex flex-col p-0 gap-0 overflow-hidden border border-border ring-0 bg-muted dark:bg-muted text-foreground [&>button]:right-3 sm:[&>button]:right-6 [&>button]:top-3 sm:[&>button]:top-5 [&>button]:opacity-80 hover:[&>button]:opacity-100 [&>button]:z-[100] [&>button_svg]:size-4 sm:[&>button_svg]:size-5" + className="select-none max-w-2xl w-[95vw] sm:w-[640px] h-[min(440px,75dvh)] sm:h-[min(520px,80vh)] flex flex-col p-0 gap-0 overflow-hidden border border-border ring-0 bg-muted dark:bg-muted text-foreground [&>button]:right-3 sm:[&>button]:right-6 [&>button]:top-5 sm:[&>button]:top-8 [&>button]:opacity-80 [&>button]:hover:opacity-100 [&>button]:hover:bg-foreground/10 [&>button]:z-[100] [&>button>svg]:size-4 sm:[&>button>svg]:size-5" > Upload Document
-
+
-

- Upload Documents -

+

Upload Documents

-

+

Upload and sync your documents to your search space

diff --git a/surfsense_web/components/assistant-ui/image.tsx b/surfsense_web/components/assistant-ui/image.tsx index c147eede4..59781abcf 100644 --- a/surfsense_web/components/assistant-ui/image.tsx +++ b/surfsense_web/components/assistant-ui/image.tsx @@ -3,10 +3,10 @@ import type { ImageMessagePartComponent } from "@assistant-ui/react"; import { cva, type VariantProps } from "class-variance-authority"; import { ImageIcon, ImageOffIcon } from "lucide-react"; +import NextImage from "next/image"; import { memo, type PropsWithChildren, useEffect, useRef, useState } from "react"; import { createPortal } from "react-dom"; import { cn } from "@/lib/utils"; -import NextImage from 'next/image'; const imageVariants = cva("aui-image-root relative overflow-hidden rounded-lg", { variants: { @@ -88,23 +88,23 @@ function ImagePreview({
) : isDataOrBlobUrl(src) ? ( - // biome-ignore lint/performance/noImgElement: data/blob URLs need plain img - {alt} { - if (typeof src === "string") setLoadedSrc(src); - onLoad?.(e); - }} - onError={(e) => { - if (typeof src === "string") setErrorSrc(src); - onError?.(e); - }} - {...props} - /> - ) : ( + // biome-ignore lint/performance/noImgElement: data/blob URLs need plain img + {alt} { + if (typeof src === "string") setLoadedSrc(src); + onLoad?.(e); + }} + onError={(e) => { + if (typeof src === "string") setErrorSrc(src); + onError?.(e); + }} + {...props} + /> + ) : ( // biome-ignore lint/performance/noImgElement: intentional for dynamic external URLs // { - if (typeof src === "string") setLoadedSrc(src); - onLoad?.(); - }} - onError={() => { - if (typeof src === "string") setErrorSrc(src); - onError?.(); - }} - unoptimized={false} - {...props} - /> + fill + src={src || ""} + alt={alt} + sizes="(max-width: 768px) 100vw, (max-width: 1200px) 80vw, 60vw" + className={cn("block object-contain", !loaded && "invisible", className)} + onLoad={() => { + if (typeof src === "string") setLoadedSrc(src); + onLoad?.(); + }} + onError={() => { + if (typeof src === "string") setErrorSrc(src); + onError?.(); + }} + unoptimized={false} + {...props} + /> )}
); @@ -162,8 +162,8 @@ type ImageZoomProps = PropsWithChildren<{ alt?: string; }>; function isDataOrBlobUrl(src: string | undefined): boolean { - if (!src || typeof src !== "string") return false; - return src.startsWith("data:") || src.startsWith("blob:"); + if (!src || typeof src !== "string") return false; + return src.startsWith("data:") || src.startsWith("blob:"); } function ImageZoom({ src, alt = "Image preview", children }: ImageZoomProps) { const [isMounted, setIsMounted] = useState(false); @@ -216,38 +216,38 @@ function ImageZoom({ src, alt = "Image preview", children }: ImageZoomProps) { > {/** biome-ignore lint/performance/noImgElement: */} {isDataOrBlobUrl(src) ? ( - // biome-ignore lint/performance/noImgElement: data/blob URLs need plain img - {alt} { - e.stopPropagation(); - handleClose(); - }} - onKeyDown={(e) => { - if (e.key === "Enter") { - e.stopPropagation(); - handleClose(); - } - }} - /> - ) : ( + // biome-ignore lint/performance/noImgElement: data/blob URLs need plain img + {alt} { + e.stopPropagation(); + handleClose(); + }} + onKeyDown={(e) => { + if (e.key === "Enter") { + e.stopPropagation(); + handleClose(); + } + }} + /> + ) : ( { - e.stopPropagation(); - handleClose(); - }} - unoptimized={false} - /> - )} + data-slot="image-zoom-content" + fill + src={src} + alt={alt} + sizes="90vw" + className="aui-image-zoom-content fade-in zoom-in-95 object-contain duration-200" + onClick={(e) => { + e.stopPropagation(); + handleClose(); + }} + unoptimized={false} + /> + )} , document.body )} diff --git a/surfsense_web/components/assistant-ui/thread-list.tsx b/surfsense_web/components/assistant-ui/thread-list.tsx index e8b8db6fe..bca36c037 100644 --- a/surfsense_web/components/assistant-ui/thread-list.tsx +++ b/surfsense_web/components/assistant-ui/thread-list.tsx @@ -241,9 +241,7 @@ const ThreadListItemComponent = memo(function ThreadListItemComponent({

{thread.title || "New Chat"}

-

- {relativeTime} -

+

{relativeTime}

diff --git a/surfsense_web/components/assistant-ui/tool-fallback.tsx b/surfsense_web/components/assistant-ui/tool-fallback.tsx index 40118d2e4..b658dba6d 100644 --- a/surfsense_web/components/assistant-ui/tool-fallback.tsx +++ b/surfsense_web/components/assistant-ui/tool-fallback.tsx @@ -26,7 +26,8 @@ export const ToolFallback: ToolCallMessagePartComponent = ({ ); const serializedResult = useMemo( - () => (result !== undefined && typeof result !== "string" ? JSON.stringify(result, null, 2) : null), + () => + result !== undefined && typeof result !== "string" ? JSON.stringify(result, null, 2) : null, [result] ); diff --git a/surfsense_web/components/chat-comments/comment-composer/comment-composer.tsx b/surfsense_web/components/chat-comments/comment-composer/comment-composer.tsx index 3d6ea384b..1c4383388 100644 --- a/surfsense_web/components/chat-comments/comment-composer/comment-composer.tsx +++ b/surfsense_web/components/chat-comments/comment-composer/comment-composer.tsx @@ -1,6 +1,6 @@ "use client"; -import { ArrowUp, Send, X } from "lucide-react"; +import { ArrowUp } from "lucide-react"; import { useCallback, useEffect, useRef, useState } from "react"; import { Button } from "@/components/ui/button"; import { Popover, PopoverAnchor, PopoverContent } from "@/components/ui/popover"; @@ -307,7 +307,6 @@ export function CommentComposer({ onClick={onCancel} disabled={isSubmitting} > - Cancel )} @@ -318,14 +317,7 @@ export function CommentComposer({ disabled={!canSubmit} className={cn(!canSubmit && "opacity-50", compact && "size-8 shrink-0 rounded-full")} > - {compact ? ( - - ) : ( - <> - - {submitLabel} - - )} + {compact ? 
: submitLabel} diff --git a/surfsense_web/components/chat-comments/comment-item/comment-actions.tsx b/surfsense_web/components/chat-comments/comment-item/comment-actions.tsx index 5c0e27779..9638ac01c 100644 --- a/surfsense_web/components/chat-comments/comment-item/comment-actions.tsx +++ b/surfsense_web/components/chat-comments/comment-item/comment-actions.tsx @@ -1,6 +1,6 @@ "use client"; -import { MoreHorizontal, Pencil, Trash2 } from "lucide-react"; +import { MoreHorizontal, PenLine, Trash2 } from "lucide-react"; import { Button } from "@/components/ui/button"; import { DropdownMenu, @@ -21,15 +21,15 @@ export function CommentActions({ canEdit, canDelete, onEdit, onDelete }: Comment {canEdit && ( - + Edit )} diff --git a/surfsense_web/components/chat-comments/comment-item/comment-item.tsx b/surfsense_web/components/chat-comments/comment-item/comment-item.tsx index 4996fe01b..eb374ba49 100644 --- a/surfsense_web/components/chat-comments/comment-item/comment-item.tsx +++ b/surfsense_web/components/chat-comments/comment-item/comment-item.tsx @@ -198,7 +198,7 @@ export function CommentItem({ (null); + const [titleTooltipOpen, setTitleTooltipOpen] = useState(false); const rowRef = useRef(null); + const titleRef = useRef(null); const handleExport = useCallback( (format: string) => { @@ -118,6 +120,14 @@ export const DocumentNode = React.memo(function DocumentNode({ [doc, onExport] ); + const handleTitleTooltipOpenChange = useCallback((open: boolean) => { + if (open && titleRef.current) { + setTitleTooltipOpen(titleRef.current.scrollWidth > titleRef.current.clientWidth); + } else { + setTitleTooltipOpen(false); + } + }, []); + const attachRef = useCallback( (node: HTMLDivElement | null) => { (rowRef as React.MutableRefObject).current = node; @@ -197,7 +207,20 @@ export const DocumentNode = React.memo(function DocumentNode({ ); })()} - {doc.title} + + + + {doc.title} + + + + {doc.title} + + {getDocumentTypeIcon( doc.document_type as DocumentTypeEnum, @@ -259,11 +282,7 
@@ export const DocumentNode = React.memo(function DocumentNode({ Versions )} - onDelete(doc)} - > + onDelete(doc)}> Delete @@ -305,11 +324,7 @@ export const DocumentNode = React.memo(function DocumentNode({ Versions )} - onDelete(doc)} - > + onDelete(doc)}> Delete diff --git a/surfsense_web/components/documents/FolderNode.tsx b/surfsense_web/components/documents/FolderNode.tsx index 88cc76c69..2ec430871 100644 --- a/surfsense_web/components/documents/FolderNode.tsx +++ b/surfsense_web/components/documents/FolderNode.tsx @@ -56,7 +56,6 @@ interface FolderNodeProps { depth: number; isExpanded: boolean; isRenaming: boolean; - childCount: number; selectionState: FolderSelectionState; processingState: "idle" | "processing" | "failed"; onToggleSelect: (folderId: number, selectAll: boolean) => void; @@ -101,7 +100,6 @@ export const FolderNode = React.memo(function FolderNode({ depth, isExpanded, isRenaming, - childCount, selectionState, processingState, onToggleSelect, @@ -336,12 +334,6 @@ export const FolderNode = React.memo(function FolderNode({ {folder.name} )} - {!isRenaming && childCount > 0 && ( - - {childCount} - - )} - {!isRenaming && ( diff --git a/surfsense_web/components/documents/FolderTreeView.tsx b/surfsense_web/components/documents/FolderTreeView.tsx index 1df007c0b..47cd17596 100644 --- a/surfsense_web/components/documents/FolderTreeView.tsx +++ b/surfsense_web/components/documents/FolderTreeView.tsx @@ -86,16 +86,6 @@ export function FolderTreeView({ const docsByFolder = useMemo(() => groupBy(documents, (d) => d.folderId ?? "root"), [documents]); - const folderChildCounts = useMemo(() => { - const counts: Record = {}; - for (const f of folders) { - const children = foldersByParent[f.id] ?? []; - const docs = docsByFolder[f.id] ?? 
[]; - counts[f.id] = children.length + docs.length; - } - return counts; - }, [folders, foldersByParent, docsByFolder]); - const [openContextMenuId, setOpenContextMenuId] = useState(null); // Single subscription for rename state — derived boolean passed to each FolderNode @@ -106,14 +96,26 @@ export function FolderTreeView({ ); const handleCancelRename = useCallback(() => setRenamingFolderId(null), [setRenamingFolderId]); + const effectiveActiveTypes = useMemo(() => { + if ( + activeTypes.includes("FILE" as DocumentTypeEnum) && + !activeTypes.includes("LOCAL_FOLDER_FILE" as DocumentTypeEnum) + ) { + return [...activeTypes, "LOCAL_FOLDER_FILE" as DocumentTypeEnum]; + } + return activeTypes; + }, [activeTypes]); + const hasDescendantMatch = useMemo(() => { - if (activeTypes.length === 0 && !searchQuery) return null; + if (effectiveActiveTypes.length === 0 && !searchQuery) return null; const match: Record = {}; function check(folderId: number): boolean { if (match[folderId] !== undefined) return match[folderId]; const childDocs = (docsByFolder[folderId] ?? []).some( - (d) => activeTypes.length === 0 || activeTypes.includes(d.document_type as DocumentTypeEnum) + (d) => + effectiveActiveTypes.length === 0 || + effectiveActiveTypes.includes(d.document_type as DocumentTypeEnum) ); if (childDocs) { match[folderId] = true; @@ -134,7 +136,7 @@ export function FolderTreeView({ check(f.id); } return match; - }, [folders, docsByFolder, foldersByParent, activeTypes, searchQuery]); + }, [folders, docsByFolder, foldersByParent, effectiveActiveTypes, searchQuery]); const folderSelectionStates = useMemo(() => { const states: Record = {}; @@ -204,7 +206,9 @@ export function FolderTreeView({ ? childFolders.filter((f) => hasDescendantMatch[f.id]) : childFolders; const childDocs = (docsByFolder[key] ?? 
[]).filter( - (d) => activeTypes.length === 0 || activeTypes.includes(d.document_type as DocumentTypeEnum) + (d) => + effectiveActiveTypes.length === 0 || + effectiveActiveTypes.includes(d.document_type as DocumentTypeEnum) ); const nodes: React.ReactNode[] = []; @@ -226,7 +230,6 @@ export function FolderTreeView({ depth={depth} isExpanded={isExpanded} isRenaming={renamingFolderId === f.id} - childCount={folderChildCounts[f.id] ?? 0} selectionState={folderSelectionStates[f.id] ?? "none"} processingState={folderProcessingStates[f.id] ?? "idle"} onToggleSelect={onToggleFolderSelect} @@ -289,7 +292,7 @@ export function FolderTreeView({ ); } - if (treeNodes.length === 0 && (activeTypes.length > 0 || searchQuery)) { + if (treeNodes.length === 0 && (effectiveActiveTypes.length > 0 || searchQuery)) { return (
diff --git a/surfsense_web/components/editor-panel/editor-panel.tsx b/surfsense_web/components/editor-panel/editor-panel.tsx index c307b3cea..7c94356d8 100644 --- a/surfsense_web/components/editor-panel/editor-panel.tsx +++ b/surfsense_web/components/editor-panel/editor-panel.tsx @@ -11,13 +11,12 @@ import { MarkdownViewer } from "@/components/markdown-viewer"; import { Alert, AlertDescription } from "@/components/ui/alert"; import { Button } from "@/components/ui/button"; import { Drawer, DrawerContent, DrawerHandle, DrawerTitle } from "@/components/ui/drawer"; -import { Skeleton } from "@/components/ui/skeleton"; import { useMediaQuery } from "@/hooks/use-media-query"; import { authenticatedFetch, getBearerToken, redirectToLogin } from "@/lib/auth-utils"; const PlateEditor = dynamic( () => import("@/components/editor/plate-editor").then((m) => ({ default: m.PlateEditor })), - { ssr: false, loading: () => } + { ssr: false, loading: () => } ); const LARGE_DOCUMENT_THRESHOLD = 2 * 1024 * 1024; // 2MB diff --git a/surfsense_web/components/editor/plate-editor.tsx b/surfsense_web/components/editor/plate-editor.tsx index ed239ffa3..66e9a0e5e 100644 --- a/surfsense_web/components/editor/plate-editor.tsx +++ b/surfsense_web/components/editor/plate-editor.tsx @@ -158,17 +158,18 @@ export function PlateEditor({ // When not forced read-only, the user can toggle between editing/viewing. const canToggleMode = !readOnly; - const contextProviderValue = useMemo(()=> ({ - onSave, - hasUnsavedChanges, - isSaving, - canToggleMode, - }), [onSave, hasUnsavedChanges, isSaving, canToggleMode]); + const contextProviderValue = useMemo( + () => ({ + onSave, + hasUnsavedChanges, + isSaving, + canToggleMode, + }), + [onSave, hasUnsavedChanges, isSaving, canToggleMode] + ); return ( - +
- {title} + {title}
diff --git a/surfsense_web/components/layout/providers/LayoutDataProvider.tsx b/surfsense_web/components/layout/providers/LayoutDataProvider.tsx index 380ffa656..7e9c33a1a 100644 --- a/surfsense_web/components/layout/providers/LayoutDataProvider.tsx +++ b/surfsense_web/components/layout/providers/LayoutDataProvider.tsx @@ -347,35 +347,38 @@ export function LayoutDataProvider({ searchSpaceId, children }: LayoutDataProvid // Navigation items const navItems: NavItem[] = useMemo( - () => [ - { - title: "Inbox", - url: "#inbox", - icon: Inbox, - isActive: isInboxSidebarOpen, - badge: totalUnreadCount > 0 ? formatInboxCount(totalUnreadCount) : undefined, - }, - { - title: "Documents", - url: "#documents", - icon: SquareLibrary, - isActive: isMobile - ? isDocumentsSidebarOpen - : isDocumentsSidebarOpen && !isRightPanelCollapsed, - }, - { - title: "Announcements", - url: "#announcements", - icon: Megaphone, - isActive: isAnnouncementsSidebarOpen, - badge: announcementUnreadCount > 0 ? formatInboxCount(announcementUnreadCount) : undefined, - }, - ], + () => + ( + [ + { + title: "Inbox", + url: "#inbox", + icon: Inbox, + isActive: isInboxSidebarOpen, + badge: totalUnreadCount > 0 ? formatInboxCount(totalUnreadCount) : undefined, + }, + isMobile + ? { + title: "Documents", + url: "#documents", + icon: SquareLibrary, + isActive: isDocumentsSidebarOpen, + } + : null, + { + title: "Announcements", + url: "#announcements", + icon: Megaphone, + isActive: isAnnouncementsSidebarOpen, + badge: + announcementUnreadCount > 0 ? 
formatInboxCount(announcementUnreadCount) : undefined, + }, + ] as (NavItem | null)[] + ).filter((item): item is NavItem => item !== null), [ isMobile, isInboxSidebarOpen, isDocumentsSidebarOpen, - isRightPanelCollapsed, totalUnreadCount, isAnnouncementsSidebarOpen, announcementUnreadCount, diff --git a/surfsense_web/components/layout/ui/dialogs/CreateSearchSpaceDialog.tsx b/surfsense_web/components/layout/ui/dialogs/CreateSearchSpaceDialog.tsx index e39bee679..d8ec767d7 100644 --- a/surfsense_web/components/layout/ui/dialogs/CreateSearchSpaceDialog.tsx +++ b/surfsense_web/components/layout/ui/dialogs/CreateSearchSpaceDialog.tsx @@ -82,7 +82,7 @@ export function CreateSearchSpaceDialog({ open, onOpenChange }: CreateSearchSpac return ( - +
@@ -107,7 +107,7 @@ export function CreateSearchSpaceDialog({ open, onOpenChange }: CreateSearchSpac placeholder={t("name_placeholder")} {...field} autoFocus - className="text-sm h-9 sm:h-10" + className="text-sm h-9 sm:h-10 select-text" /> @@ -130,7 +130,7 @@ export function CreateSearchSpaceDialog({ open, onOpenChange }: CreateSearchSpac diff --git a/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx b/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx index 717f5a459..febae35d3 100644 --- a/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx +++ b/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx @@ -10,7 +10,6 @@ import { documentsSidebarOpenAtom } from "@/atoms/documents/ui.atoms"; import { closeEditorPanelAtom, editorPanelAtom } from "@/atoms/editor/editor-panel.atom"; import { rightPanelCollapsedAtom, rightPanelTabAtom } from "@/atoms/layout/right-panel.atom"; import { Button } from "@/components/ui/button"; -import { Skeleton } from "@/components/ui/skeleton"; import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip"; import { DocumentsSidebar } from "../sidebar"; @@ -27,7 +26,7 @@ const HitlEditPanelContent = dynamic( import("@/components/hitl-edit-panel/hitl-edit-panel").then((m) => ({ default: m.HitlEditPanelContent, })), - { ssr: false, loading: () => } + { ssr: false, loading: () => null } ); const ReportPanelContent = dynamic( @@ -35,7 +34,7 @@ const ReportPanelContent = dynamic( import("@/components/report-panel/report-panel").then((m) => ({ default: m.ReportPanelContent, })), - { ssr: false, loading: () => } + { ssr: false, loading: () => null } ); interface RightPanelProps { @@ -78,14 +77,14 @@ export function RightPanelExpandButton() { if (!collapsed || !hasContent) return null; return ( -
+
) : ( - + ) : ( - +
-
+
)} - {isMobile ? ( - - ) : ( - - - - - - {t("mark_all_read") || "Mark all as read"} - - - )} + + + + + + {t("mark_all_read") || "Mark all as read"} + +
@@ -932,30 +919,8 @@ export function InboxSidebarContent({ )} style={{ contentVisibility: "auto", containIntrinsicSize: "0 80px" }} > - {isMobile ? ( - - ) : ( - + {activeTab === "status" ? ( + )}
diff --git a/surfsense_web/components/layout/ui/sidebar/PageUsageDisplay.tsx b/surfsense_web/components/layout/ui/sidebar/PageUsageDisplay.tsx index 73347e304..7dd46e484 100644 --- a/surfsense_web/components/layout/ui/sidebar/PageUsageDisplay.tsx +++ b/surfsense_web/components/layout/ui/sidebar/PageUsageDisplay.tsx @@ -35,7 +35,7 @@ export function PageUsageDisplay({ pagesUsed, pagesLimit }: PageUsageDisplayProp @@ -48,7 +48,7 @@ export function PageUsageDisplay({ pagesUsed, pagesLimit }: PageUsageDisplayProp {pageBuyingEnabled && ( diff --git a/surfsense_web/components/markdown-viewer.tsx b/surfsense_web/components/markdown-viewer.tsx index 1c39f03a0..2acf68e94 100644 --- a/surfsense_web/components/markdown-viewer.tsx +++ b/surfsense_web/components/markdown-viewer.tsx @@ -2,9 +2,9 @@ import { createCodePlugin } from "@streamdown/code"; import { createMathPlugin } from "@streamdown/math"; import { Streamdown, type StreamdownProps } from "streamdown"; import "katex/dist/katex.min.css"; -import { cn } from "@/lib/utils"; -import Image from 'next/image'; import { is } from "drizzle-orm"; +import Image from "next/image"; +import { cn } from "@/lib/utils"; const code = createCodePlugin({ themes: ["nord", "nord"], @@ -130,30 +130,31 @@ export function MarkdownViewer({ content, className, maxLength }: MarkdownViewer ), hr: ({ ...props }) =>
, img: ({ src, alt, width: _w, height: _h, ...props }) => { - const isDataOrUnknownUrl = typeof src === "string" && (src.startsWith("data:") || !src.startsWith("http")); + const isDataOrUnknownUrl = + typeof src === "string" && (src.startsWith("data:") || !src.startsWith("http")); - return isDataOrUnknownUrl ? ( - // eslint-disable-next-line @next/next/no-img-element - {alt - ) : ( - {alt - ); -}, + return isDataOrUnknownUrl ? ( + // eslint-disable-next-line @next/next/no-img-element + {alt + ) : ( + {alt + ); + }, table: ({ ...props }) => (
diff --git a/surfsense_web/components/new-chat/chat-share-button.tsx b/surfsense_web/components/new-chat/chat-share-button.tsx index 82e8c6a78..4fc35aba1 100644 --- a/surfsense_web/components/new-chat/chat-share-button.tsx +++ b/surfsense_web/components/new-chat/chat-share-button.tsx @@ -163,21 +163,16 @@ export function ChatShareButton({ thread, onVisibilityChange, className }: ChatS )} - - - - - - - Share settings - + + +
- +

No models found

Try a different search term

@@ -531,8 +531,9 @@ export function ModelSelector({ >
- +

No image models found

+

Try a different search term

diff --git a/surfsense_web/components/settings/user-settings-dialog.tsx b/surfsense_web/components/settings/user-settings-dialog.tsx index 919b08174..3295870fd 100644 --- a/surfsense_web/components/settings/user-settings-dialog.tsx +++ b/surfsense_web/components/settings/user-settings-dialog.tsx @@ -6,10 +6,10 @@ import { useTranslations } from "next-intl"; import { useMemo } from "react"; import { ApiKeyContent } from "@/app/dashboard/[search_space_id]/user-settings/components/ApiKeyContent"; import { CommunityPromptsContent } from "@/app/dashboard/[search_space_id]/user-settings/components/CommunityPromptsContent"; +import { DesktopContent } from "@/app/dashboard/[search_space_id]/user-settings/components/DesktopContent"; import { ProfileContent } from "@/app/dashboard/[search_space_id]/user-settings/components/ProfileContent"; import { PromptsContent } from "@/app/dashboard/[search_space_id]/user-settings/components/PromptsContent"; import { PurchaseHistoryContent } from "@/app/dashboard/[search_space_id]/user-settings/components/PurchaseHistoryContent"; -import { DesktopContent } from "@/app/dashboard/[search_space_id]/user-settings/components/DesktopContent"; import { userSettingsDialogAtom } from "@/atoms/settings/settings-dialog.atoms"; import { SettingsDialog } from "@/components/settings/settings-dialog"; import { usePlatform } from "@/hooks/use-platform"; diff --git a/surfsense_web/components/shared/image-config-dialog.tsx b/surfsense_web/components/shared/image-config-dialog.tsx index 1cfbf8842..2ae53ccca 100644 --- a/surfsense_web/components/shared/image-config-dialog.tsx +++ b/surfsense_web/components/shared/image-config-dialog.tsx @@ -433,7 +433,7 @@ export function ImageConfigDialog({ className="relative text-sm h-9 min-w-[120px]" > - {mode === "edit" ? "Save Changes" : "Create & Use"} + {mode === "edit" ? 
"Save Changes" : "Add Model"} {isSubmitting && } diff --git a/surfsense_web/components/shared/model-config-dialog.tsx b/surfsense_web/components/shared/model-config-dialog.tsx index 84ba821fc..4d2373b49 100644 --- a/surfsense_web/components/shared/model-config-dialog.tsx +++ b/surfsense_web/components/shared/model-config-dialog.tsx @@ -312,7 +312,7 @@ export function ModelConfigDialog({ className="relative text-sm h-9 min-w-[120px]" > - {mode === "edit" ? "Save Changes" : "Create & Use"} + {mode === "edit" ? "Save Changes" : "Add Model"} {isSubmitting && } diff --git a/surfsense_web/components/sources/DocumentUploadTab.tsx b/surfsense_web/components/sources/DocumentUploadTab.tsx index cf4019d93..28e160261 100644 --- a/surfsense_web/components/sources/DocumentUploadTab.tsx +++ b/surfsense_web/components/sources/DocumentUploadTab.tsx @@ -86,7 +86,6 @@ const FILE_TYPE_CONFIG: Record> = { "application/rtf": [".rtf"], "application/xml": [".xml"], "application/epub+zip": [".epub"], - "text/html": [".html", ".htm", ".web"], "image/gif": [".gif"], "image/svg+xml": [".svg"], ...audioFileTypes, @@ -470,8 +469,9 @@ export function DocumentUploadTab({ )) ) : ( -
{ if (!isElectron) fileInputRef.current?.click(); }} @@ -483,10 +483,16 @@ export function DocumentUploadTab({

{t("file_size_limit")}

-
e.stopPropagation()}> + {/* biome-ignore lint/a11y/useSemanticElements: wrapper to stop click propagation to parent button */} +
e.stopPropagation()} + onKeyDown={(e) => e.stopPropagation()} + role="group" + > {renderBrowseButton({ fullWidth: true })}
-
+ )} @@ -681,9 +687,13 @@ export function DocumentUploadTab({ -
+
{supportedExtensions.map((ext) => ( - + {ext} ))} diff --git a/surfsense_web/components/tool-ui/citation/citation-list.tsx b/surfsense_web/components/tool-ui/citation/citation-list.tsx index 75b02bf3d..bbe869a09 100644 --- a/surfsense_web/components/tool-ui/citation/citation-list.tsx +++ b/surfsense_web/components/tool-ui/citation/citation-list.tsx @@ -2,13 +2,12 @@ import type { LucideIcon } from "lucide-react"; import { Code2, Database, ExternalLink, File, FileText, Globe, Newspaper } from "lucide-react"; +import NextImage from "next/image"; import * as React from "react"; import { openSafeNavigationHref, resolveSafeNavigationHref } from "../shared/media"; import { cn, Popover, PopoverContent, PopoverTrigger } from "./_adapter"; import { Citation } from "./citation"; import type { CitationType, CitationVariant, SerializableCitation } from "./schema"; -import NextImage from 'next/image'; - const TYPE_ICONS: Record = { webpage: Globe, @@ -264,9 +263,9 @@ function OverflowItem({ citation, onClick }: OverflowItemProps) { className="size-4.5 rounded-full object-cover" unoptimized={true} /> - ) : ( + ) : (