diff --git a/surfsense_backend/app/connectors/dropbox/client.py b/surfsense_backend/app/connectors/dropbox/client.py
index b177c2f8d..e89800191 100644
--- a/surfsense_backend/app/connectors/dropbox/client.py
+++ b/surfsense_backend/app/connectors/dropbox/client.py
@@ -225,9 +225,7 @@ class DropboxClient:

         return all_items, None

-    async def get_latest_cursor(
-        self, path: str = ""
-    ) -> tuple[str | None, str | None]:
+    async def get_latest_cursor(self, path: str = "") -> tuple[str | None, str | None]:
         """Get a cursor representing the current state of a folder.

         Uses /2/files/list_folder/get_latest_cursor so we can later call
@@ -251,9 +249,7 @@ class DropboxClient:
         """
         all_entries: list[dict[str, Any]] = []

-        resp = await self._request(
-            "/2/files/list_folder/continue", {"cursor": cursor}
-        )
+        resp = await self._request("/2/files/list_folder/continue", {"cursor": cursor})
         if resp.status_code == 401:
             return [], None, "Dropbox authentication expired (401)"
         if resp.status_code != 200:
@@ -268,7 +264,11 @@ class DropboxClient:
                 "/2/files/list_folder/continue", {"cursor": cursor}
             )
             if resp.status_code != 200:
-                return all_entries, data.get("cursor"), f"Pagination failed: {resp.status_code}"
+                return (
+                    all_entries,
+                    data.get("cursor"),
+                    f"Pagination failed: {resp.status_code}",
+                )
             data = resp.json()
             all_entries.extend(data.get("entries", []))
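The two methods above are the client half of Dropbox delta sync: get_latest_cursor snapshots the current folder state, and the /2/files/list_folder/continue loop replays only entries changed since a saved cursor, with both reporting failures as an error string in the returned tuple rather than raising. A minimal sketch of how an indexer consumes that contract (sync_folder is hypothetical; get_latest_cursor's signature is taken from this diff, and get_changes' three-tuple shape matches how the unit tests below stub it):

    # Illustrative sketch only -- sync_folder is not part of this PR.
    async def sync_folder(client, saved_cursor):
        if saved_cursor is None:
            # First run: record "now" so the next run has a baseline to diff against.
            cursor, error = await client.get_latest_cursor(path="")
            return [], (cursor if error is None else None)
        # Later runs: fetch only entries added/modified/deleted since the cursor.
        entries, new_cursor, error = await client.get_changes(saved_cursor)
        if error is not None:
            # e.g. "Dropbox authentication expired (401)"; caller falls back to a full scan.
            return [], None
        return entries, new_cursor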
diff --git a/surfsense_backend/app/connectors/google_drive/content_extractor.py b/surfsense_backend/app/connectors/google_drive/content_extractor.py
index 025c3831a..83ff32e82 100644
--- a/surfsense_backend/app/connectors/google_drive/content_extractor.py
+++ b/surfsense_backend/app/connectors/google_drive/content_extractor.py
@@ -100,7 +100,9 @@ async def download_and_extract_content(
     if error:
         return None, drive_metadata, error

-    etl_filename = file_name + extension if is_google_workspace_file(mime_type) else file_name
+    etl_filename = (
+        file_name + extension if is_google_workspace_file(mime_type) else file_name
+    )

     markdown = await _parse_file_to_markdown(temp_file_path, etl_filename)
     return markdown, drive_metadata, None
@@ -233,7 +235,9 @@ async def download_and_process_file(
             "."
         )[-1]

-    etl_filename = file_name + extension if is_google_workspace_file(mime_type) else file_name
+    etl_filename = (
+        file_name + extension if is_google_workspace_file(mime_type) else file_name
+    )

     logger.info(f"Processing {file_name} with Surfsense's file processor")
     await process_file_in_background(
         file_path=temp_file_path,
diff --git a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py
index a0041c843..6e7ab3c4c 100644
--- a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py
+++ b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py
@@ -1,6 +1,9 @@
 from app.config import config as app_config
 from app.etl_pipeline.etl_document import EtlRequest, EtlResult
-from app.etl_pipeline.exceptions import EtlServiceUnavailableError, EtlUnsupportedFileError
+from app.etl_pipeline.exceptions import (
+    EtlServiceUnavailableError,
+    EtlUnsupportedFileError,
+)
 from app.etl_pipeline.file_classifier import FileCategory, classify_file
 from app.etl_pipeline.parsers.audio import transcribe_audio
 from app.etl_pipeline.parsers.direct_convert import convert_file_directly
@@ -78,9 +81,7 @@ class EtlPipelineService:
                 request.file_path, request.estimated_pages
             )
         else:
-            raise EtlServiceUnavailableError(
-                f"Unknown ETL_SERVICE: {etl_service}"
-            )
+            raise EtlServiceUnavailableError(f"Unknown ETL_SERVICE: {etl_service}")

         return EtlResult(
             markdown_content=content,
diff --git a/surfsense_backend/app/etl_pipeline/file_classifier.py b/surfsense_backend/app/etl_pipeline/file_classifier.py
index bc7b4537c..4e690bcdc 100644
--- a/surfsense_backend/app/etl_pipeline/file_classifier.py
+++ b/surfsense_backend/app/etl_pipeline/file_classifier.py
@@ -1,27 +1,96 @@
 from enum import Enum
 from pathlib import PurePosixPath

-from app.utils.file_extensions import DOCUMENT_EXTENSIONS, get_document_extensions_for_service
+from app.utils.file_extensions import (
+    DOCUMENT_EXTENSIONS,
+    get_document_extensions_for_service,
+)

 PLAINTEXT_EXTENSIONS = frozenset(
     {
-        ".md", ".markdown", ".txt", ".text",
-        ".json", ".jsonl", ".yaml", ".yml", ".toml", ".ini", ".cfg", ".conf", ".xml",
-        ".css", ".scss", ".less", ".sass",
-        ".py", ".pyw", ".pyi", ".pyx",
-        ".js", ".jsx", ".ts", ".tsx", ".mjs", ".cjs",
-        ".java", ".kt", ".kts", ".scala", ".groovy",
-        ".c", ".h", ".cpp", ".cxx", ".cc", ".hpp", ".hxx",
-        ".cs", ".fs", ".fsx",
-        ".go", ".rs", ".rb", ".php", ".pl", ".pm", ".lua", ".swift",
-        ".m", ".mm", ".r", ".jl",
-        ".sh", ".bash", ".zsh", ".fish", ".bat", ".cmd", ".ps1",
-        ".sql", ".graphql", ".gql",
-        ".env", ".gitignore", ".dockerignore", ".editorconfig",
-        ".makefile", ".cmake",
-        ".log", ".rst", ".tex", ".bib", ".org", ".adoc", ".asciidoc",
-        ".vue", ".svelte", ".astro",
-        ".tf", ".hcl", ".proto",
+        ".md",
+        ".markdown",
+        ".txt",
+        ".text",
+        ".json",
+        ".jsonl",
+        ".yaml",
+        ".yml",
+        ".toml",
+        ".ini",
+        ".cfg",
+        ".conf",
+        ".xml",
+        ".css",
+        ".scss",
+        ".less",
+        ".sass",
+        ".py",
+        ".pyw",
+        ".pyi",
+        ".pyx",
+        ".js",
+        ".jsx",
+        ".ts",
+        ".tsx",
+        ".mjs",
+        ".cjs",
+        ".java",
+        ".kt",
+        ".kts",
+        ".scala",
+        ".groovy",
+        ".c",
+        ".h",
+        ".cpp",
+        ".cxx",
+        ".cc",
+        ".hpp",
+        ".hxx",
+        ".cs",
+        ".fs",
+        ".fsx",
+        ".go",
+        ".rs",
+        ".rb",
+        ".php",
+        ".pl",
+        ".pm",
+        ".lua",
+        ".swift",
+        ".m",
+        ".mm",
+        ".r",
+        ".jl",
+        ".sh",
+        ".bash",
+        ".zsh",
+        ".fish",
+        ".bat",
+        ".cmd",
+        ".ps1",
+        ".sql",
+        ".graphql",
+        ".gql",
+        ".env",
+        ".gitignore",
+        ".dockerignore",
+        ".editorconfig",
+        ".makefile",
+        ".cmake",
+        ".log",
+        ".rst",
+        ".tex",
+        ".bib",
+        ".org",
+        ".adoc",
+        ".asciidoc",
+        ".vue",
+        ".svelte",
+        ".astro",
+        ".tf",
+        ".hcl",
+        ".proto",
     }
 )
diff --git a/surfsense_backend/app/etl_pipeline/parsers/llamacloud.py b/surfsense_backend/app/etl_pipeline/parsers/llamacloud.py
index 5115aebea..ae2a34234 100644
--- a/surfsense_backend/app/etl_pipeline/parsers/llamacloud.py
+++ b/surfsense_backend/app/etl_pipeline/parsers/llamacloud.py
@@ -66,16 +66,12 @@ async def parse_with_llamacloud(file_path: str, estimated_pages: int) -> str:
         )

         if hasattr(result, "get_markdown_documents"):
-            markdown_docs = result.get_markdown_documents(
-                split_by_page=False
-            )
+            markdown_docs = result.get_markdown_documents(split_by_page=False)
             if markdown_docs and hasattr(markdown_docs[0], "text"):
                 return markdown_docs[0].text
         if hasattr(result, "pages") and result.pages:
             return "\n\n".join(
-                p.md
-                for p in result.pages
-                if hasattr(p, "md") and p.md
+                p.md for p in result.pages if hasattr(p, "md") and p.md
             )
         return str(result)

@@ -83,9 +79,7 @@ async def parse_with_llamacloud(file_path: str, estimated_pages: int) -> str:
     if result and hasattr(result[0], "text"):
         return result[0].text
     return "\n\n".join(
-        doc.page_content
-        if hasattr(doc, "page_content")
-        else str(doc)
+        doc.page_content if hasattr(doc, "page_content") else str(doc)
         for doc in result
     )
diff --git a/surfsense_backend/app/routes/autocomplete_routes.py b/surfsense_backend/app/routes/autocomplete_routes.py
index bb56709cb..a11b7dbc1 100644
--- a/surfsense_backend/app/routes/autocomplete_routes.py
+++ b/surfsense_backend/app/routes/autocomplete_routes.py
@@ -1,4 +1,4 @@
-from fastapi import APIRouter, Depends, HTTPException
+from fastapi import APIRouter, Depends
 from fastapi.responses import StreamingResponse
 from pydantic import BaseModel, Field
 from sqlalchemy.ext.asyncio import AsyncSession
@@ -31,8 +31,11 @@ async def vision_autocomplete_stream(

     return StreamingResponse(
         stream_vision_autocomplete(
-            body.screenshot, body.search_space_id, session,
-            app_name=body.app_name, window_title=body.window_title,
+            body.screenshot,
+            body.search_space_id,
+            session,
+            app_name=body.app_name,
+            window_title=body.window_title,
         ),
         media_type="text/event-stream",
         headers={
diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py
index a30eb7297..bb20da65d 100644
--- a/surfsense_backend/app/routes/search_source_connectors_routes.py
+++ b/surfsense_backend/app/routes/search_source_connectors_routes.py
@@ -2647,7 +2647,12 @@ async def run_onedrive_indexing(
             stage="fetching",
         )

-        total_indexed, total_skipped, error_message, total_unsupported = await index_onedrive_files(
+        (
+            total_indexed,
+            total_skipped,
+            error_message,
+            total_unsupported,
+        ) = await index_onedrive_files(
             session,
             connector_id,
             search_space_id,
@@ -2756,7 +2761,12 @@ async def run_dropbox_indexing(
             stage="fetching",
         )

-        total_indexed, total_skipped, error_message, total_unsupported = await index_dropbox_files(
+        (
+            total_indexed,
+            total_skipped,
+            error_message,
+            total_unsupported,
+        ) = await index_dropbox_files(
             session,
             connector_id,
             search_space_id,
diff --git a/surfsense_backend/app/services/vision_autocomplete_service.py b/surfsense_backend/app/services/vision_autocomplete_service.py
index f24a5c848..7e9408be7 100644
--- a/surfsense_backend/app/services/vision_autocomplete_service.py
+++ b/surfsense_backend/app/services/vision_autocomplete_service.py
@@ -1,5 +1,5 @@
 import logging
-from typing import AsyncGenerator
+from collections.abc import AsyncGenerator

 from langchain_core.messages import HumanMessage, SystemMessage
 from sqlalchemy.ext.asyncio import AsyncSession
@@ -68,8 +68,10 @@ def _is_vision_unsupported_error(e: Exception) -> bool:


 async def _extract_query_from_screenshot(
-    llm, screenshot_data_url: str,
-    app_name: str = "", window_title: str = "",
+    llm,
+    screenshot_data_url: str,
+    app_name: str = "",
+    window_title: str = "",
 ) -> str | None:
     """Ask the Vision LLM to describe what the user is working on.

@@ -78,18 +80,26 @@ async def _extract_query_from_screenshot(
     """
     if app_name:
         prompt_text = EXTRACT_QUERY_PROMPT_WITH_APP.format(
-            app_name=app_name, window_title=window_title,
+            app_name=app_name,
+            window_title=window_title,
         )
     else:
         prompt_text = EXTRACT_QUERY_PROMPT

     try:
-        response = await llm.ainvoke([
-            HumanMessage(content=[
-                {"type": "text", "text": prompt_text},
-                {"type": "image_url", "image_url": {"url": screenshot_data_url}},
-            ]),
-        ])
+        response = await llm.ainvoke(
+            [
+                HumanMessage(
+                    content=[
+                        {"type": "text", "text": prompt_text},
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": screenshot_data_url},
+                        },
+                    ]
+                ),
+            ]
+        )
         query = response.content.strip() if hasattr(response, "content") else ""
         return query if query else None
     except Exception as e:
@@ -167,10 +177,15 @@ async def stream_vision_autocomplete(
     kb_context = ""
     try:
         query = await _extract_query_from_screenshot(
-            llm, screenshot_data_url, app_name=app_name, window_title=window_title,
+            llm,
+            screenshot_data_url,
+            app_name=app_name,
+            window_title=window_title,
         )
     except Exception as e:
-        logger.warning(f"Vision autocomplete: selected model does not support vision: {e}")
+        logger.warning(
+            f"Vision autocomplete: selected model does not support vision: {e}"
+        )
         yield streaming.format_message_start()
         yield streaming.format_error(vision_error_msg)
         yield streaming.format_done()
@@ -183,16 +198,18 @@ async def stream_vision_autocomplete(

     messages = [
         SystemMessage(content=system_prompt),
-        HumanMessage(content=[
-            {
-                "type": "text",
-                "text": "Analyze this screenshot. Understand the full context of what the user is working on, then generate the text they most likely want to write in the active text area.",
-            },
-            {
-                "type": "image_url",
-                "image_url": {"url": screenshot_data_url},
-            },
-        ]),
+        HumanMessage(
+            content=[
+                {
+                    "type": "text",
+                    "text": "Analyze this screenshot. Understand the full context of what the user is working on, then generate the text they most likely want to write in the active text area.",
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {"url": screenshot_data_url},
+                },
+            ]
+        ),
     ]

     text_started = False
@@ -217,7 +234,9 @@ async def stream_vision_autocomplete(
             yield streaming.format_text_end(text_id)

         if _is_vision_unsupported_error(e):
-            logger.warning(f"Vision autocomplete: selected model does not support vision: {e}")
+            logger.warning(
+                f"Vision autocomplete: selected model does not support vision: {e}"
+            )
             yield streaming.format_error(vision_error_msg)
         else:
             logger.error(f"Vision autocomplete streaming error: {e}", exc_info=True)
diff --git a/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py b/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py
index 8d2a45e03..4a49944c2 100644
--- a/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py
@@ -254,9 +254,7 @@ async def _download_and_index(
     return batch_indexed, download_failed + batch_failed


-async def _remove_document(
-    session: AsyncSession, file_id: str, search_space_id: int
-):
+async def _remove_document(session: AsyncSession, file_id: str, search_space_id: int):
     """Remove a document that was deleted in Dropbox."""
     primary_hash = compute_identifier_hash(
         DocumentType.DROPBOX_FILE.value, file_id, search_space_id
@@ -268,8 +266,7 @@ async def _remove_document(
         select(Document).where(
             Document.search_space_id == search_space_id,
             Document.document_type == DocumentType.DROPBOX_FILE,
-            cast(Document.document_metadata["dropbox_file_id"], String)
-            == file_id,
+            cast(Document.document_metadata["dropbox_file_id"], String) == file_id,
         )
     )
     existing = result.scalar_one_or_none()
@@ -671,9 +668,7 @@ async def index_dropbox_files(
             saved_cursor = folder_cursors.get(folder_path)

             can_use_delta = (
-                use_delta_sync
-                and saved_cursor
-                and connector.last_indexed_at
+                use_delta_sync and saved_cursor and connector.last_indexed_at
             )

             if can_use_delta:
@@ -739,7 +734,11 @@ async def index_dropbox_files(
         await task_logger.log_task_success(
             log_entry,
             f"Successfully completed Dropbox indexing for connector {connector_id}",
-            {"files_processed": total_indexed, "files_skipped": total_skipped, "files_unsupported": total_unsupported},
+            {
+                "files_processed": total_indexed,
+                "files_skipped": total_skipped,
+                "files_unsupported": total_unsupported,
+            },
         )
         logger.info(
             f"Dropbox indexing completed: {total_indexed} indexed, "
diff --git a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
index 9916e70a0..b11087fe6 100644
--- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
@@ -1010,7 +1010,11 @@ async def index_google_drive_files(
                 documents_unsupported += ru
         else:
             logger.info(f"Using full scan for connector {connector_id}")
-            documents_indexed, documents_skipped, documents_unsupported = await _index_full_scan(
+            (
+                documents_indexed,
+                documents_skipped,
+                documents_unsupported,
+            ) = await _index_full_scan(
                 drive_client,
                 session,
                 connector,
@@ -1301,7 +1305,12 @@ async def index_google_drive_selected_files(
             log_entry,
             f"Batch file indexing completed with {len(errors)} error(s)",
             "; ".join(errors),
-            {"indexed": indexed, "skipped": skipped, "unsupported": unsupported, "error_count": len(errors)},
+            {
+                "indexed": indexed,
+                "skipped": skipped,
+                "unsupported": unsupported,
+                "error_count": len(errors),
+            },
         )
     else:
         await task_logger.log_task_success(
diff --git a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
index f4366fb78..7f42f4638 100644
--- a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
@@ -23,7 +23,6 @@ from sqlalchemy import select
 from sqlalchemy.exc import IntegrityError, SQLAlchemyError
 from sqlalchemy.ext.asyncio import AsyncSession

-from app.config import config
 from app.db import (
     Document,
     DocumentStatus,
@@ -153,8 +152,6 @@ def scan_folder(

     return files

-
-
 async def _read_file_content(file_path: str, filename: str) -> str:
     """Read file content via the unified ETL pipeline.

diff --git a/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py
index b26442490..06517f542 100644
--- a/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py
@@ -762,7 +762,11 @@ async def index_onedrive_files(
         await task_logger.log_task_success(
             log_entry,
             f"Successfully completed OneDrive indexing for connector {connector_id}",
-            {"files_processed": total_indexed, "files_skipped": total_skipped, "files_unsupported": total_unsupported},
+            {
+                "files_processed": total_indexed,
+                "files_skipped": total_skipped,
+                "files_unsupported": total_unsupported,
+            },
         )
         logger.info(
             f"OneDrive indexing completed: {total_indexed} indexed, "
diff --git a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py
index a9a6b62be..c765dbd87 100644
--- a/surfsense_backend/app/tasks/document_processors/file_processors.py
+++ b/surfsense_backend/app/tasks/document_processors/file_processors.py
@@ -292,8 +292,10 @@ async def process_file_in_background(
         )

         try:
-            from app.etl_pipeline.file_classifier import FileCategory as EtlFileCategory
-            from app.etl_pipeline.file_classifier import classify_file as etl_classify
+            from app.etl_pipeline.file_classifier import (
+                FileCategory as EtlFileCategory,
+                classify_file as etl_classify,
+            )

             category = etl_classify(filename)

@@ -345,8 +347,10 @@ async def _extract_file_content(
     """
     from app.etl_pipeline.etl_document import EtlRequest
     from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
-    from app.etl_pipeline.file_classifier import FileCategory
-    from app.etl_pipeline.file_classifier import classify_file as etl_classify
+    from app.etl_pipeline.file_classifier import (
+        FileCategory,
+        classify_file as etl_classify,
+    )

     category = etl_classify(filename)
     estimated_pages = 0
diff --git a/surfsense_backend/app/utils/file_extensions.py b/surfsense_backend/app/utils/file_extensions.py
index 5eed36872..8d432ce56 100644
--- a/surfsense_backend/app/utils/file_extensions.py
+++ b/surfsense_backend/app/utils/file_extensions.py
@@ -15,30 +15,83 @@ from pathlib import PurePosixPath
 # Per-parser document extension sets (from official documentation)
 # ---------------------------------------------------------------------------

-DOCLING_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset({
-    ".pdf",
-    ".docx", ".xlsx", ".pptx",
-    ".png", ".jpg", ".jpeg", ".tiff", ".tif", ".bmp", ".webp",
-})
+DOCLING_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset(
+    {
+        ".pdf",
+        ".docx",
+        ".xlsx",
+        ".pptx",
+        ".png",
+        ".jpg",
+        ".jpeg",
+        ".tiff",
+        ".tif",
+        ".bmp",
+        ".webp",
+    }
+)

-LLAMAPARSE_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset({
-    ".pdf",
-    ".docx", ".doc", ".xlsx", ".xls", ".pptx", ".ppt",
-    ".docm", ".dot", ".dotm", ".pptm", ".pot", ".potx",
-    ".xlsm", ".xlsb", ".xlw",
-    ".rtf", ".epub",
-    ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".tif", ".webp", ".svg",
-    ".odt", ".ods", ".odp",
-    ".hwp", ".hwpx",
-})
+LLAMAPARSE_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset(
+    {
+        ".pdf",
+        ".docx",
+        ".doc",
+        ".xlsx",
+        ".xls",
+        ".pptx",
+        ".ppt",
+        ".docm",
+        ".dot",
+        ".dotm",
+        ".pptm",
+        ".pot",
+        ".potx",
+        ".xlsm",
+        ".xlsb",
+        ".xlw",
+        ".rtf",
+        ".epub",
+        ".png",
+        ".jpg",
+        ".jpeg",
+        ".gif",
+        ".bmp",
+        ".tiff",
+        ".tif",
+        ".webp",
+        ".svg",
+        ".odt",
+        ".ods",
+        ".odp",
+        ".hwp",
+        ".hwpx",
+    }
+)

-UNSTRUCTURED_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset({
-    ".pdf",
-    ".docx", ".doc", ".xlsx", ".xls", ".pptx", ".ppt",
-    ".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".tif", ".heic",
-    ".rtf", ".epub", ".odt",
-    ".eml", ".msg", ".p7s",
-})
+UNSTRUCTURED_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset(
+    {
+        ".pdf",
+        ".docx",
+        ".doc",
+        ".xlsx",
+        ".xls",
+        ".pptx",
+        ".ppt",
+        ".png",
+        ".jpg",
+        ".jpeg",
+        ".bmp",
+        ".tiff",
+        ".tif",
+        ".heic",
+        ".rtf",
+        ".epub",
+        ".odt",
+        ".eml",
+        ".msg",
+        ".p7s",
+    }
+)

 # ---------------------------------------------------------------------------
 # Union (used by classify_file for routing) + service lookup
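The three per-parser sets exist because their union, DOCUMENT_EXTENSIONS, only answers whether any supported parser can read a file, while get_document_extensions_for_service narrows that to the parser actually configured; a document can pass the first check and fail the second (.heic is readable by Unstructured but not Docling). A compressed sketch of that two-step decision for a file already routed to the document category (helper names are from this diff; the suffix handling is simplified, and this is not the real should_skip_for_service implementation):

    # Illustrative sketch only, assuming a file already classified as a document.
    from pathlib import PurePosixPath

    from app.utils.file_extensions import (
        DOCUMENT_EXTENSIONS,
        get_document_extensions_for_service,
    )

    def document_is_skippable(filename: str, etl_service: str) -> bool:
        ext = PurePosixPath(filename).suffix.lower()
        if ext not in DOCUMENT_EXTENSIONS:
            return True  # no parser supports it at all
        # Supported by some parser, but maybe not by the one configured.
        return ext not in get_document_extensions_for_service(etl_service)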
""" -import os from unittest.mock import AsyncMock, MagicMock import pytest @@ -21,6 +20,7 @@ _CSV_CONTENT = "name,age\nAlice,30\nBob,25\n" # Helpers # --------------------------------------------------------------------------- + async def _write_file(dest_path: str, content: str) -> None: """Simulate a cloud client writing downloaded bytes to disk.""" with open(dest_path, "w", encoding="utf-8") as f: @@ -43,8 +43,8 @@ def _make_download_side_effect(content: str): # Google Drive # =================================================================== -class TestGoogleDriveContentExtraction: +class TestGoogleDriveContentExtraction: async def test_txt_file_returns_markdown(self): from app.connectors.google_drive.content_extractor import ( download_and_extract_content, @@ -76,7 +76,7 @@ class TestGoogleDriveContentExtraction: file = {"id": "f2", "name": "data.csv", "mimeType": "text/csv"} - markdown, metadata, error = await download_and_extract_content(client, file) + markdown, _metadata, error = await download_and_extract_content(client, file) assert error is None assert "Alice" in markdown @@ -93,7 +93,7 @@ class TestGoogleDriveContentExtraction: file = {"id": "f3", "name": "doc.txt", "mimeType": "text/plain"} - markdown, metadata, error = await download_and_extract_content(client, file) + markdown, _metadata, error = await download_and_extract_content(client, file) assert markdown is None assert error == "Network timeout" @@ -103,8 +103,8 @@ class TestGoogleDriveContentExtraction: # OneDrive # =================================================================== -class TestOneDriveContentExtraction: +class TestOneDriveContentExtraction: async def test_txt_file_returns_markdown(self): from app.connectors.onedrive.content_extractor import ( download_and_extract_content, @@ -144,7 +144,7 @@ class TestOneDriveContentExtraction: "file": {"mimeType": "text/csv"}, } - markdown, metadata, error = await download_and_extract_content(client, file) + markdown, _metadata, error = await download_and_extract_content(client, file) assert error is None assert "Alice" in markdown @@ -164,7 +164,7 @@ class TestOneDriveContentExtraction: "file": {"mimeType": "text/plain"}, } - markdown, metadata, error = await download_and_extract_content(client, file) + markdown, _metadata, error = await download_and_extract_content(client, file) assert markdown is None assert error == "403 Forbidden" @@ -174,8 +174,8 @@ class TestOneDriveContentExtraction: # Dropbox # =================================================================== -class TestDropboxContentExtraction: +class TestDropboxContentExtraction: async def test_txt_file_returns_markdown(self): from app.connectors.dropbox.content_extractor import ( download_and_extract_content, @@ -217,7 +217,7 @@ class TestDropboxContentExtraction: "path_lower": "/data.csv", } - markdown, metadata, error = await download_and_extract_content(client, file) + markdown, _metadata, error = await download_and_extract_content(client, file) assert error is None assert "Alice" in markdown @@ -238,7 +238,7 @@ class TestDropboxContentExtraction: "path_lower": "/big.txt", } - markdown, metadata, error = await download_and_extract_content(client, file) + markdown, _metadata, error = await download_and_extract_content(client, file) assert markdown is None assert error == "Rate limited" diff --git a/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py b/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py index adac90085..f72135d05 100644 --- 
a/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py +++ b/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py @@ -265,6 +265,7 @@ def full_scan_mocks(mock_dropbox_client, monkeypatch): async def _fake_skip(session, file, search_space_id): from app.connectors.dropbox.file_types import should_skip_file as _skip + item_skip, unsup_ext = _skip(file) if item_skip: if unsup_ext: @@ -468,7 +469,11 @@ async def test_selected_files_fetch_failure_isolation(selected_files_mocks): indexed, skipped, _unsupported, errors = await _run_selected( selected_files_mocks, - [("/first.txt", "first.txt"), ("/mid.txt", "mid.txt"), ("/third.txt", "third.txt")], + [ + ("/first.txt", "first.txt"), + ("/mid.txt", "mid.txt"), + ("/third.txt", "third.txt"), + ], ) assert indexed == 2 @@ -526,8 +531,18 @@ async def test_delta_sync_deletions_call_remove_document(monkeypatch): import app.tasks.connector_indexers.dropbox_indexer as _mod entries = [ - {".tag": "deleted", "name": "gone.txt", "path_lower": "/gone.txt", "id": "id:del1"}, - {".tag": "deleted", "name": "also_gone.pdf", "path_lower": "/also_gone.pdf", "id": "id:del2"}, + { + ".tag": "deleted", + "name": "gone.txt", + "path_lower": "/gone.txt", + "id": "id:del1", + }, + { + ".tag": "deleted", + "name": "also_gone.pdf", + "path_lower": "/also_gone.pdf", + "id": "id:del2", + }, ] mock_client = MagicMock() @@ -544,7 +559,7 @@ async def test_delta_sync_deletions_call_remove_document(monkeypatch): mock_task_logger = MagicMock() mock_task_logger.log_task_progress = AsyncMock() - indexed, skipped, unsupported, cursor = await _index_with_delta_sync( + _indexed, _skipped, _unsupported, cursor = await _index_with_delta_sync( mock_client, AsyncMock(), _CONNECTOR_ID, @@ -573,7 +588,9 @@ async def test_delta_sync_upserts_filtered_and_downloaded(monkeypatch): mock_client = MagicMock() mock_client.get_changes = AsyncMock(return_value=(entries, "cursor-v2", None)) - monkeypatch.setattr(_mod, "_should_skip_file", AsyncMock(return_value=(False, None))) + monkeypatch.setattr( + _mod, "_should_skip_file", AsyncMock(return_value=(False, None)) + ) download_mock = AsyncMock(return_value=(2, 0)) monkeypatch.setattr(_mod, "_download_and_index", download_mock) @@ -581,7 +598,7 @@ async def test_delta_sync_upserts_filtered_and_downloaded(monkeypatch): mock_task_logger = MagicMock() mock_task_logger.log_task_progress = AsyncMock() - indexed, skipped, unsupported, cursor = await _index_with_delta_sync( + indexed, skipped, _unsupported, cursor = await _index_with_delta_sync( mock_client, AsyncMock(), _CONNECTOR_ID, @@ -608,8 +625,18 @@ async def test_delta_sync_mix_deletions_and_upserts(monkeypatch): import app.tasks.connector_indexers.dropbox_indexer as _mod entries = [ - {".tag": "deleted", "name": "removed.txt", "path_lower": "/removed.txt", "id": "id:del1"}, - {".tag": "deleted", "name": "trashed.pdf", "path_lower": "/trashed.pdf", "id": "id:del2"}, + { + ".tag": "deleted", + "name": "removed.txt", + "path_lower": "/removed.txt", + "id": "id:del1", + }, + { + ".tag": "deleted", + "name": "trashed.pdf", + "path_lower": "/trashed.pdf", + "id": "id:del2", + }, _make_file_dict("mod1", "updated.txt"), _make_file_dict("new1", "brandnew.docx"), ] @@ -623,7 +650,9 @@ async def test_delta_sync_mix_deletions_and_upserts(monkeypatch): remove_calls.append(file_id) monkeypatch.setattr(_mod, "_remove_document", _fake_remove) - monkeypatch.setattr(_mod, "_should_skip_file", AsyncMock(return_value=(False, None))) + monkeypatch.setattr( + _mod, 
"_should_skip_file", AsyncMock(return_value=(False, None)) + ) download_mock = AsyncMock(return_value=(2, 0)) monkeypatch.setattr(_mod, "_download_and_index", download_mock) @@ -631,7 +660,7 @@ async def test_delta_sync_mix_deletions_and_upserts(monkeypatch): mock_task_logger = MagicMock() mock_task_logger.log_task_progress = AsyncMock() - indexed, skipped, unsupported, cursor = await _index_with_delta_sync( + indexed, skipped, _unsupported, cursor = await _index_with_delta_sync( mock_client, AsyncMock(), _CONNECTOR_ID, @@ -665,7 +694,7 @@ async def test_delta_sync_returns_new_cursor(monkeypatch): mock_task_logger = MagicMock() mock_task_logger.log_task_progress = AsyncMock() - indexed, skipped, unsupported, cursor = await _index_with_delta_sync( + indexed, skipped, _unsupported, cursor = await _index_with_delta_sync( mock_client, AsyncMock(), _CONNECTOR_ID, @@ -723,9 +752,7 @@ def orchestrator_mocks(monkeypatch): mock_client = MagicMock() mock_client.get_latest_cursor = AsyncMock(return_value=("latest-cursor-abc", None)) - monkeypatch.setattr( - _mod, "DropboxClient", MagicMock(return_value=mock_client) - ) + monkeypatch.setattr(_mod, "DropboxClient", MagicMock(return_value=mock_client)) return { "connector": mock_connector, @@ -751,7 +778,7 @@ async def test_orchestrator_uses_delta_sync_when_cursor_and_last_indexed( mock_session = AsyncMock() mock_session.commit = AsyncMock() - indexed, skipped, error, _unsupported = await index_dropbox_files( + _indexed, _skipped, error, _unsupported = await index_dropbox_files( mock_session, _CONNECTOR_ID, _SEARCH_SPACE_ID, @@ -779,7 +806,7 @@ async def test_orchestrator_falls_back_to_full_scan_without_cursor( mock_session = AsyncMock() mock_session.commit = AsyncMock() - indexed, skipped, error, _unsupported = await index_dropbox_files( + _indexed, _skipped, error, _unsupported = await index_dropbox_files( mock_session, _CONNECTOR_ID, _SEARCH_SPACE_ID, diff --git a/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py b/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py index 7fa92ce12..0ae096361 100644 --- a/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py +++ b/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py @@ -366,7 +366,7 @@ async def test_full_scan_three_phase_counts(full_scan_mocks, monkeypatch): full_scan_mocks["download_mock"].return_value = (mock_docs, 0) full_scan_mocks["batch_mock"].return_value = ([], 2, 0) - indexed, skipped, unsupported = await _run_full_scan(full_scan_mocks) + indexed, skipped, _unsupported = await _run_full_scan(full_scan_mocks) assert indexed == 3 # 1 renamed + 2 from batch assert skipped == 1 # 1 unchanged @@ -497,7 +497,7 @@ async def test_delta_sync_removals_serial_rest_parallel(monkeypatch): mock_task_logger = MagicMock() mock_task_logger.log_task_progress = AsyncMock() - indexed, skipped, unsupported = await _index_with_delta_sync( + indexed, skipped, _unsupported = await _index_with_delta_sync( MagicMock(), mock_session, MagicMock(), @@ -589,7 +589,7 @@ async def test_selected_files_single_file_indexed(selected_files_mocks): ) selected_files_mocks["download_and_index_mock"].return_value = (1, 0) - indexed, skipped, unsup, errors = await _run_selected( + indexed, skipped, _unsup, errors = await _run_selected( selected_files_mocks, [("f1", "report.pdf")], ) @@ -613,7 +613,7 @@ async def test_selected_files_fetch_failure_isolation(selected_files_mocks): ) 
diff --git a/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py b/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py
index 7fa92ce12..0ae096361 100644
--- a/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py
+++ b/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py
@@ -366,7 +366,7 @@ async def test_full_scan_three_phase_counts(full_scan_mocks, monkeypatch):
     full_scan_mocks["download_mock"].return_value = (mock_docs, 0)
     full_scan_mocks["batch_mock"].return_value = ([], 2, 0)

-    indexed, skipped, unsupported = await _run_full_scan(full_scan_mocks)
+    indexed, skipped, _unsupported = await _run_full_scan(full_scan_mocks)

     assert indexed == 3  # 1 renamed + 2 from batch
     assert skipped == 1  # 1 unchanged
@@ -497,7 +497,7 @@ async def test_delta_sync_removals_serial_rest_parallel(monkeypatch):
     mock_task_logger = MagicMock()
     mock_task_logger.log_task_progress = AsyncMock()

-    indexed, skipped, unsupported = await _index_with_delta_sync(
+    indexed, skipped, _unsupported = await _index_with_delta_sync(
         MagicMock(),
         mock_session,
         MagicMock(),
@@ -589,7 +589,7 @@ async def test_selected_files_single_file_indexed(selected_files_mocks):
     )
     selected_files_mocks["download_and_index_mock"].return_value = (1, 0)

-    indexed, skipped, unsup, errors = await _run_selected(
+    indexed, skipped, _unsup, errors = await _run_selected(
         selected_files_mocks,
         [("f1", "report.pdf")],
     )
@@ -613,7 +613,7 @@ async def test_selected_files_fetch_failure_isolation(selected_files_mocks):
     )
     selected_files_mocks["download_and_index_mock"].return_value = (2, 0)

-    indexed, skipped, unsup, errors = await _run_selected(
+    indexed, skipped, _unsup, errors = await _run_selected(
         selected_files_mocks,
         [("f1", "first.txt"), ("f2", "mid.txt"), ("f3", "third.txt")],
     )
@@ -647,7 +647,7 @@ async def test_selected_files_skip_rename_counting(selected_files_mocks):

     selected_files_mocks["download_and_index_mock"].return_value = (2, 0)

-    indexed, skipped, unsup, errors = await _run_selected(
+    indexed, skipped, _unsup, errors = await _run_selected(
         selected_files_mocks,
         [
             ("s1", "unchanged.txt"),
diff --git a/surfsense_backend/tests/unit/connector_indexers/test_page_limits.py b/surfsense_backend/tests/unit/connector_indexers/test_page_limits.py
index 58737b20b..573ee43d8 100644
--- a/surfsense_backend/tests/unit/connector_indexers/test_page_limits.py
+++ b/surfsense_backend/tests/unit/connector_indexers/test_page_limits.py
@@ -219,7 +219,9 @@ async def test_gdrive_files_exceeding_quota_rejected(gdrive_selected_mocks):
         None,
     )

-    indexed, _skipped, _unsup, errors = await _run_gdrive_selected(m, [("big", "huge.pdf")])
+    indexed, _skipped, _unsup, errors = await _run_gdrive_selected(
+        m, [("big", "huge.pdf")]
+    )

     assert indexed == 0
     assert len(errors) == 1
@@ -552,7 +554,9 @@ async def test_onedrive_over_quota_rejected(onedrive_selected_mocks):
         None,
     )

-    indexed, _skipped, _unsup, errors = await _run_onedrive_selected(m, [("big", "huge.pdf")])
+    indexed, _skipped, _unsup, errors = await _run_onedrive_selected(
+        m, [("big", "huge.pdf")]
+    )

     assert indexed == 0
     assert len(errors) == 1
diff --git a/surfsense_backend/tests/unit/connectors/test_dropbox_client.py b/surfsense_backend/tests/unit/connectors/test_dropbox_client.py
index efacbcf72..31cafe550 100644
--- a/surfsense_backend/tests/unit/connectors/test_dropbox_client.py
+++ b/surfsense_backend/tests/unit/connectors/test_dropbox_client.py
@@ -19,6 +19,7 @@ def _make_client() -> DropboxClient:

 # ---------- C1: get_latest_cursor ----------

+
 async def test_get_latest_cursor_returns_cursor_string(monkeypatch):
     client = _make_client()

@@ -34,12 +35,17 @@ async def test_get_latest_cursor_returns_cursor_string(monkeypatch):
     assert error is None
     client._request.assert_called_once_with(
         "/2/files/list_folder/get_latest_cursor",
-        {"path": "/my-folder", "recursive": False, "include_non_downloadable_files": True},
+        {
+            "path": "/my-folder",
+            "recursive": False,
+            "include_non_downloadable_files": True,
+        },
     )


 # ---------- C2: get_changes returns entries and new cursor ----------

+
 async def test_get_changes_returns_entries_and_cursor(monkeypatch):
     client = _make_client()

@@ -66,6 +72,7 @@ async def test_get_changes_returns_entries_and_cursor(monkeypatch):

 # ---------- C3: get_changes handles pagination ----------

+
 async def test_get_changes_handles_pagination(monkeypatch):
     client = _make_client()

@@ -98,6 +105,7 @@ async def test_get_changes_handles_pagination(monkeypatch):

 # ---------- C4: get_changes raises on 401 ----------

+
 async def test_get_changes_returns_error_on_401(monkeypatch):
     client = _make_client()
diff --git a/surfsense_backend/tests/unit/connectors/test_dropbox_file_types.py b/surfsense_backend/tests/unit/connectors/test_dropbox_file_types.py
index 74277d47c..b4715e083 100644
--- a/surfsense_backend/tests/unit/connectors/test_dropbox_file_types.py
+++ b/surfsense_backend/tests/unit/connectors/test_dropbox_file_types.py
@@ -41,15 +41,40 @@ def test_non_downloadable_item_is_skipped():
 @pytest.mark.parametrize(
     "filename",
     [
-        "archive.zip", "backup.tar", "data.gz", "stuff.rar", "pack.7z",
-        "program.exe", "lib.dll", "module.so", "image.dmg", "disk.iso",
-        "movie.mov", "clip.avi", "video.mkv", "film.wmv", "stream.flv",
+        "archive.zip",
+        "backup.tar",
+        "data.gz",
+        "stuff.rar",
+        "pack.7z",
+        "program.exe",
+        "lib.dll",
+        "module.so",
+        "image.dmg",
+        "disk.iso",
+        "movie.mov",
+        "clip.avi",
+        "video.mkv",
+        "film.wmv",
+        "stream.flv",
         "favicon.ico",
-        "raw.cr2", "photo.nef", "image.arw", "pic.dng",
-        "design.psd", "vector.ai", "mockup.sketch", "proto.fig",
-        "font.ttf", "font.otf", "font.woff", "font.woff2",
-        "model.stl", "scene.fbx", "mesh.blend",
-        "local.db", "data.sqlite", "access.mdb",
+        "raw.cr2",
+        "photo.nef",
+        "image.arw",
+        "pic.dng",
+        "design.psd",
+        "vector.ai",
+        "mockup.sketch",
+        "proto.fig",
+        "font.ttf",
+        "font.otf",
+        "font.woff",
+        "font.woff2",
+        "model.stl",
+        "scene.fbx",
+        "mesh.blend",
+        "local.db",
+        "data.sqlite",
+        "access.mdb",
     ],
 )
 def test_non_parseable_extensions_are_skipped(filename, mocker):
@@ -63,9 +88,16 @@ def test_non_parseable_extensions_are_skipped(filename, mocker):
 @pytest.mark.parametrize(
     "filename",
     [
-        "report.pdf", "document.docx", "sheet.xlsx", "slides.pptx",
-        "readme.txt", "data.csv", "page.html", "notes.md",
-        "config.json", "feed.xml",
+        "report.pdf",
+        "document.docx",
+        "sheet.xlsx",
+        "slides.pptx",
+        "readme.txt",
+        "data.csv",
+        "page.html",
+        "notes.md",
+        "config.json",
+        "feed.xml",
     ],
 )
 def test_parseable_documents_are_not_skipped(filename, mocker):
@@ -92,30 +124,33 @@ def test_universal_images_are_not_skipped(filename, mocker):
     assert ext is None


-@pytest.mark.parametrize("filename,service,expected_skip", [
-    ("old.doc", "DOCLING", True),
-    ("old.doc", "LLAMACLOUD", False),
-    ("old.doc", "UNSTRUCTURED", False),
-    ("legacy.xls", "DOCLING", True),
-    ("legacy.xls", "LLAMACLOUD", False),
-    ("legacy.xls", "UNSTRUCTURED", False),
-    ("deck.ppt", "DOCLING", True),
-    ("deck.ppt", "LLAMACLOUD", False),
-    ("deck.ppt", "UNSTRUCTURED", False),
-    ("icon.svg", "DOCLING", True),
-    ("icon.svg", "LLAMACLOUD", False),
-    ("anim.gif", "DOCLING", True),
-    ("anim.gif", "LLAMACLOUD", False),
-    ("photo.webp", "DOCLING", False),
-    ("photo.webp", "LLAMACLOUD", False),
-    ("photo.webp", "UNSTRUCTURED", True),
-    ("live.heic", "DOCLING", True),
-    ("live.heic", "UNSTRUCTURED", False),
-    ("macro.docm", "DOCLING", True),
-    ("macro.docm", "LLAMACLOUD", False),
-    ("mail.eml", "DOCLING", True),
-    ("mail.eml", "UNSTRUCTURED", False),
-])
+@pytest.mark.parametrize(
+    "filename,service,expected_skip",
+    [
+        ("old.doc", "DOCLING", True),
+        ("old.doc", "LLAMACLOUD", False),
+        ("old.doc", "UNSTRUCTURED", False),
+        ("legacy.xls", "DOCLING", True),
+        ("legacy.xls", "LLAMACLOUD", False),
+        ("legacy.xls", "UNSTRUCTURED", False),
+        ("deck.ppt", "DOCLING", True),
+        ("deck.ppt", "LLAMACLOUD", False),
+        ("deck.ppt", "UNSTRUCTURED", False),
+        ("icon.svg", "DOCLING", True),
+        ("icon.svg", "LLAMACLOUD", False),
+        ("anim.gif", "DOCLING", True),
+        ("anim.gif", "LLAMACLOUD", False),
+        ("photo.webp", "DOCLING", False),
+        ("photo.webp", "LLAMACLOUD", False),
+        ("photo.webp", "UNSTRUCTURED", True),
+        ("live.heic", "DOCLING", True),
+        ("live.heic", "UNSTRUCTURED", False),
+        ("macro.docm", "DOCLING", True),
+        ("macro.docm", "LLAMACLOUD", False),
+        ("mail.eml", "DOCLING", True),
+        ("mail.eml", "UNSTRUCTURED", False),
+    ],
+)
 def test_parser_specific_extensions(filename, service, expected_skip, mocker):
     mocker.patch("app.config.config.ETL_SERVICE", service)
     item = {".tag": "file", "name": filename}
diff --git a/surfsense_backend/tests/unit/connectors/test_google_drive_file_types.py b/surfsense_backend/tests/unit/connectors/test_google_drive_file_types.py
index 5cd43736b..ab602468d 100644
--- a/surfsense_backend/tests/unit/connectors/test_google_drive_file_types.py
+++ b/surfsense_backend/tests/unit/connectors/test_google_drive_file_types.py
@@ -7,21 +7,37 @@ from app.connectors.google_drive.file_types import should_skip_by_extension
 pytestmark = pytest.mark.unit


-@pytest.mark.parametrize("filename", [
-    "malware.exe", "archive.zip", "video.mov", "font.woff2", "model.blend",
-])
+@pytest.mark.parametrize(
+    "filename",
+    [
+        "malware.exe",
+        "archive.zip",
+        "video.mov",
+        "font.woff2",
+        "model.blend",
+    ],
+)
 def test_unsupported_extensions_are_skipped_regardless_of_service(filename, mocker):
     """Truly unsupported files are skipped no matter which ETL service is configured."""
     for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
         mocker.patch("app.config.config.ETL_SERVICE", service)
-        skip, ext = should_skip_by_extension(filename)
+        skip, _ext = should_skip_by_extension(filename)
         assert skip is True


-@pytest.mark.parametrize("filename", [
-    "report.pdf", "doc.docx", "sheet.xlsx", "slides.pptx",
-    "readme.txt", "data.csv", "photo.png", "notes.md",
-])
+@pytest.mark.parametrize(
+    "filename",
+    [
+        "report.pdf",
+        "doc.docx",
+        "sheet.xlsx",
+        "slides.pptx",
+        "readme.txt",
+        "data.csv",
+        "photo.png",
+        "notes.md",
+    ],
+)
 def test_universal_extensions_are_not_skipped(filename, mocker):
     """Files supported by all parsers (or handled by plaintext/direct_convert) are never skipped."""
     for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
@@ -31,16 +47,19 @@ def test_universal_extensions_are_not_skipped(filename, mocker):
     assert ext is None


-@pytest.mark.parametrize("filename,service,expected_skip", [
-    ("macro.docm", "DOCLING", True),
-    ("macro.docm", "LLAMACLOUD", False),
-    ("mail.eml", "DOCLING", True),
-    ("mail.eml", "UNSTRUCTURED", False),
-    ("photo.gif", "DOCLING", True),
-    ("photo.gif", "LLAMACLOUD", False),
-    ("photo.heic", "UNSTRUCTURED", False),
-    ("photo.heic", "DOCLING", True),
-])
+@pytest.mark.parametrize(
+    "filename,service,expected_skip",
+    [
+        ("macro.docm", "DOCLING", True),
+        ("macro.docm", "LLAMACLOUD", False),
+        ("mail.eml", "DOCLING", True),
+        ("mail.eml", "UNSTRUCTURED", False),
+        ("photo.gif", "DOCLING", True),
+        ("photo.gif", "LLAMACLOUD", False),
+        ("photo.heic", "UNSTRUCTURED", False),
+        ("photo.heic", "DOCLING", True),
+    ],
+)
 def test_parser_specific_extensions(filename, service, expected_skip, mocker):
     mocker.patch("app.config.config.ETL_SERVICE", service)
     skip, ext = should_skip_by_extension(filename)
diff --git a/surfsense_backend/tests/unit/connectors/test_onedrive_file_types.py b/surfsense_backend/tests/unit/connectors/test_onedrive_file_types.py
index 61212b340..1d9124c47 100644
--- a/surfsense_backend/tests/unit/connectors/test_onedrive_file_types.py
+++ b/surfsense_backend/tests/unit/connectors/test_onedrive_file_types.py
@@ -45,9 +45,16 @@ def test_onenote_is_skipped():
 # ---------------------------------------------------------------------------


-@pytest.mark.parametrize("filename", [
-    "malware.exe", "archive.zip", "video.mov", "font.woff2", "model.blend",
-])
+@pytest.mark.parametrize(
+    "filename",
+    [
+        "malware.exe",
+        "archive.zip",
+        "video.mov",
+        "font.woff2",
+        "model.blend",
+    ],
+)
 def test_unsupported_extensions_are_skipped(filename, mocker):
     mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
     item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
@@ -56,10 +63,19 @@ def test_unsupported_extensions_are_skipped(filename, mocker):
     assert ext is not None


-@pytest.mark.parametrize("filename", [
-    "report.pdf", "doc.docx", "sheet.xlsx", "slides.pptx",
-    "readme.txt", "data.csv", "photo.png", "notes.md",
-])
+@pytest.mark.parametrize(
+    "filename",
+    [
+        "report.pdf",
+        "doc.docx",
+        "sheet.xlsx",
+        "slides.pptx",
+        "readme.txt",
+        "data.csv",
+        "photo.png",
+        "notes.md",
+    ],
+)
 def test_universal_files_are_not_skipped(filename, mocker):
     for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"):
         mocker.patch("app.config.config.ETL_SERVICE", service)
@@ -69,14 +85,17 @@ def test_universal_files_are_not_skipped(filename, mocker):
     assert ext is None


-@pytest.mark.parametrize("filename,service,expected_skip", [
-    ("macro.docm", "DOCLING", True),
-    ("macro.docm", "LLAMACLOUD", False),
-    ("mail.eml", "DOCLING", True),
-    ("mail.eml", "UNSTRUCTURED", False),
-    ("photo.heic", "UNSTRUCTURED", False),
-    ("photo.heic", "DOCLING", True),
-])
+@pytest.mark.parametrize(
+    "filename,service,expected_skip",
+    [
+        ("macro.docm", "DOCLING", True),
+        ("macro.docm", "LLAMACLOUD", False),
+        ("mail.eml", "DOCLING", True),
+        ("mail.eml", "UNSTRUCTURED", False),
+        ("photo.heic", "UNSTRUCTURED", False),
+        ("photo.heic", "DOCLING", True),
+    ],
+)
 def test_parser_specific_extensions(filename, service, expected_skip, mocker):
     mocker.patch("app.config.config.ETL_SERVICE", service)
     item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}
diff --git a/surfsense_backend/tests/unit/etl_pipeline/conftest.py b/surfsense_backend/tests/unit/etl_pipeline/conftest.py
index 6059caa01..082ab9771 100644
--- a/surfsense_backend/tests/unit/etl_pipeline/conftest.py
+++ b/surfsense_backend/tests/unit/etl_pipeline/conftest.py
@@ -24,6 +24,4 @@ def _stub_package(dotted: str, fs_dir: Path) -> None:

 _stub_package("app", _BACKEND / "app")
 _stub_package("app.etl_pipeline", _BACKEND / "app" / "etl_pipeline")
-_stub_package(
-    "app.etl_pipeline.parsers", _BACKEND / "app" / "etl_pipeline" / "parsers"
-)
+_stub_package("app.etl_pipeline.parsers", _BACKEND / "app" / "etl_pipeline" / "parsers")
diff --git a/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py b/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py
index e90847e3a..769b1dc53 100644
--- a/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py
+++ b/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py
@@ -144,7 +144,7 @@ async def test_extract_mp3_returns_transcription(tmp_path, mocker):

 # ---------------------------------------------------------------------------
-# Slice 7 – DOCLING document parsing
+# Slice 7 - DOCLING document parsing
 # ---------------------------------------------------------------------------

@@ -172,7 +172,7 @@ async def test_extract_pdf_with_docling(tmp_path, mocker):

 # ---------------------------------------------------------------------------
-# Slice 8 – UNSTRUCTURED document parsing
+# Slice 8 - UNSTRUCTURED document parsing
 # ---------------------------------------------------------------------------

@@ -208,7 +208,7 @@ async def test_extract_pdf_with_unstructured(tmp_path, mocker):

 # ---------------------------------------------------------------------------
-# Slice 9 – LLAMACLOUD document parsing
+# Slice 9 - LLAMACLOUD document parsing
 # ---------------------------------------------------------------------------

@@ -241,9 +241,7 @@ async def test_extract_pdf_with_llamacloud(tmp_path, mocker):
     )

     result = await EtlPipelineService().extract(
-        EtlRequest(
-            file_path=str(pdf_file), filename="report.pdf", estimated_pages=5
-        )
+        EtlRequest(file_path=str(pdf_file), filename="report.pdf", estimated_pages=5)
     )

     assert result.markdown_content == "# LlamaCloud parsed"
@@ -252,7 +250,7 @@ async def test_extract_pdf_with_llamacloud(tmp_path, mocker):

 # ---------------------------------------------------------------------------
-# Slice 10 – unknown extension falls through to document ETL
+# Slice 10 - unknown extension falls through to document ETL
 # ---------------------------------------------------------------------------

@@ -279,18 +277,18 @@ async def test_unknown_extension_uses_document_etl(tmp_path, mocker):

 # ---------------------------------------------------------------------------
-# Slice 11 – EtlRequest validation
+# Slice 11 - EtlRequest validation
 # ---------------------------------------------------------------------------


 def test_etl_request_requires_filename():
     """EtlRequest rejects missing filename."""
-    with pytest.raises(Exception):
+    with pytest.raises(ValueError, match="filename must not be empty"):
         EtlRequest(file_path="/tmp/some.txt", filename="")


 # ---------------------------------------------------------------------------
-# Slice 12 – unknown ETL_SERVICE raises EtlServiceUnavailableError
+# Slice 12 - unknown ETL_SERVICE raises EtlServiceUnavailableError
 # ---------------------------------------------------------------------------

@@ -310,7 +308,7 @@ async def test_unknown_etl_service_raises(tmp_path, mocker):

 # ---------------------------------------------------------------------------
-# Slice 13 – unsupported file types are rejected before reaching any parser
+# Slice 13 - unsupported file types are rejected before reaching any parser
 # ---------------------------------------------------------------------------

@@ -321,10 +319,19 @@ def test_unknown_extension_classified_as_unsupported():
     assert classify_file("random.xyz") == FileCategory.UNSUPPORTED


-@pytest.mark.parametrize("filename", [
-    "malware.exe", "archive.zip", "video.mov", "font.woff2",
-    "model.blend", "data.parquet", "package.deb", "firmware.bin",
-])
+@pytest.mark.parametrize(
+    "filename",
+    [
+        "malware.exe",
+        "archive.zip",
+        "video.mov",
+        "font.woff2",
+        "model.blend",
+        "data.parquet",
+        "package.deb",
+        "firmware.bin",
+    ],
+)
 def test_unsupported_extensions_classified_correctly(filename):
     """Extensions not in any allowlist are classified as UNSUPPORTED."""
     from app.etl_pipeline.file_classifier import FileCategory, classify_file
@@ -332,18 +339,21 @@ def test_unsupported_extensions_classified_correctly(filename):
     assert classify_file(filename) == FileCategory.UNSUPPORTED


-@pytest.mark.parametrize("filename,expected", [
-    ("report.pdf", "document"),
-    ("doc.docx", "document"),
-    ("slides.pptx", "document"),
-    ("sheet.xlsx", "document"),
-    ("photo.png", "document"),
-    ("photo.jpg", "document"),
-    ("book.epub", "document"),
-    ("letter.odt", "document"),
-    ("readme.md", "plaintext"),
-    ("data.csv", "direct_convert"),
-])
+@pytest.mark.parametrize(
+    "filename,expected",
+    [
+        ("report.pdf", "document"),
+        ("doc.docx", "document"),
+        ("slides.pptx", "document"),
+        ("sheet.xlsx", "document"),
+        ("photo.png", "document"),
+        ("photo.jpg", "document"),
+        ("book.epub", "document"),
+        ("letter.odt", "document"),
+        ("readme.md", "plaintext"),
+        ("data.csv", "direct_convert"),
+    ],
+)
 def test_parseable_extensions_classified_correctly(filename, expected):
     """Parseable files are classified into their correct category."""
     from app.etl_pipeline.file_classifier import FileCategory, classify_file
@@ -380,31 +390,34 @@ async def test_extract_zip_raises_unsupported_error(tmp_path):

 # ---------------------------------------------------------------------------
-# Slice 14 – should_skip_for_service (per-parser document filtering)
+# Slice 14 - should_skip_for_service (per-parser document filtering)
 # ---------------------------------------------------------------------------


-@pytest.mark.parametrize("filename,etl_service,expected_skip", [
-    ("file.eml", "DOCLING", True),
-    ("file.eml", "UNSTRUCTURED", False),
-    ("file.docm", "LLAMACLOUD", False),
-    ("file.docm", "DOCLING", True),
-    ("file.txt", "DOCLING", False),
-    ("file.csv", "LLAMACLOUD", False),
-    ("file.mp3", "UNSTRUCTURED", False),
-    ("file.exe", "LLAMACLOUD", True),
-    ("file.pdf", "DOCLING", False),
-    ("file.webp", "DOCLING", False),
-    ("file.webp", "UNSTRUCTURED", True),
-    ("file.gif", "LLAMACLOUD", False),
-    ("file.gif", "DOCLING", True),
-    ("file.heic", "UNSTRUCTURED", False),
-    ("file.heic", "DOCLING", True),
-    ("file.svg", "LLAMACLOUD", False),
-    ("file.svg", "DOCLING", True),
-    ("file.p7s", "UNSTRUCTURED", False),
-    ("file.p7s", "LLAMACLOUD", True),
-])
+@pytest.mark.parametrize(
+    "filename,etl_service,expected_skip",
+    [
+        ("file.eml", "DOCLING", True),
+        ("file.eml", "UNSTRUCTURED", False),
+        ("file.docm", "LLAMACLOUD", False),
+        ("file.docm", "DOCLING", True),
+        ("file.txt", "DOCLING", False),
+        ("file.csv", "LLAMACLOUD", False),
+        ("file.mp3", "UNSTRUCTURED", False),
+        ("file.exe", "LLAMACLOUD", True),
+        ("file.pdf", "DOCLING", False),
+        ("file.webp", "DOCLING", False),
+        ("file.webp", "UNSTRUCTURED", True),
+        ("file.gif", "LLAMACLOUD", False),
+        ("file.gif", "DOCLING", True),
+        ("file.heic", "UNSTRUCTURED", False),
+        ("file.heic", "DOCLING", True),
+        ("file.svg", "LLAMACLOUD", False),
+        ("file.svg", "DOCLING", True),
+        ("file.p7s", "UNSTRUCTURED", False),
+        ("file.p7s", "LLAMACLOUD", True),
+    ],
+)
 def test_should_skip_for_service(filename, etl_service, expected_skip):
     from app.etl_pipeline.file_classifier import should_skip_for_service

@@ -414,7 +427,7 @@ def test_should_skip_for_service(filename, etl_service, expected_skip):

 # ---------------------------------------------------------------------------
-# Slice 14b – ETL pipeline rejects per-parser incompatible documents
+# Slice 14b - ETL pipeline rejects per-parser incompatible documents
 # ---------------------------------------------------------------------------
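The dispatch behavior these tests pin down is a single branch on the configured ETL_SERVICE inside EtlPipelineService.extract: plaintext and direct-convert files bypass the parsers, documents go to the configured parser, and anything else raises. A condensed view of that contract (the llamacloud call and the error message are verbatim from this diff; the docling/unstructured parser names are assumptions based on the module layout):

    # Illustrative contract summary -- not the real extract() implementation.
    async def parse_document(etl_service: str, file_path: str, estimated_pages: int) -> str:
        if etl_service == "LLAMACLOUD":
            return await parse_with_llamacloud(file_path, estimated_pages)
        if etl_service == "DOCLING":
            return await parse_with_docling(file_path)  # assumed name
        if etl_service == "UNSTRUCTURED":
            return await parse_with_unstructured(file_path)  # assumed name
        raise EtlServiceUnavailableError(f"Unknown ETL_SERVICE: {etl_service}")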
diff --git a/surfsense_backend/tests/unit/services/test_docling_image_support.py b/surfsense_backend/tests/unit/services/test_docling_image_support.py
index 430adbaf2..11ffc0ed1 100644
--- a/surfsense_backend/tests/unit/services/test_docling_image_support.py
+++ b/surfsense_backend/tests/unit/services/test_docling_image_support.py
@@ -30,26 +30,29 @@ def test_docling_service_does_not_restrict_allowed_formats():

     fake_pdf_format_option_cls = MagicMock()

-    with patch.dict("sys.modules", {
-        "docling": MagicMock(),
-        "docling.backend": MagicMock(),
-        "docling.backend.pypdfium2_backend": MagicMock(
-            PyPdfiumDocumentBackend=mock_backend
-        ),
-        "docling.datamodel": MagicMock(),
-        "docling.datamodel.base_models": MagicMock(
-            InputFormat=_FakeInputFormat
-        ),
-        "docling.datamodel.pipeline_options": MagicMock(
-            PdfPipelineOptions=fake_pipeline_options_cls
-        ),
-        "docling.document_converter": MagicMock(
-            DocumentConverter=mock_converter_cls,
-            PdfFormatOption=fake_pdf_format_option_cls,
-        ),
-    }):
-        import app.services.docling_service as mod
+    with patch.dict(
+        "sys.modules",
+        {
+            "docling": MagicMock(),
+            "docling.backend": MagicMock(),
+            "docling.backend.pypdfium2_backend": MagicMock(
+                PyPdfiumDocumentBackend=mock_backend
+            ),
+            "docling.datamodel": MagicMock(),
+            "docling.datamodel.base_models": MagicMock(InputFormat=_FakeInputFormat),
+            "docling.datamodel.pipeline_options": MagicMock(
+                PdfPipelineOptions=fake_pipeline_options_cls
+            ),
+            "docling.document_converter": MagicMock(
+                DocumentConverter=mock_converter_cls,
+                PdfFormatOption=fake_pdf_format_option_cls,
+            ),
+        },
+    ):
         from importlib import reload
+
+        import app.services.docling_service as mod
+
         reload(mod)

         mod.DoclingService()
diff --git a/surfsense_backend/tests/unit/utils/test_file_extensions.py b/surfsense_backend/tests/unit/utils/test_file_extensions.py
index acd8945ce..c33b39f05 100644
--- a/surfsense_backend/tests/unit/utils/test_file_extensions.py
+++ b/surfsense_backend/tests/unit/utils/test_file_extensions.py
@@ -17,36 +17,74 @@ def test_exe_is_not_supported_document():
     assert is_supported_document_extension("malware.exe") is False


-@pytest.mark.parametrize("filename", [
-    "report.pdf", "doc.docx", "old.doc",
-    "sheet.xlsx", "legacy.xls",
-    "slides.pptx", "deck.ppt",
-    "macro.docm", "macro.xlsm", "macro.pptm",
-    "photo.png", "photo.jpg", "photo.jpeg", "scan.bmp", "scan.tiff", "scan.tif",
-    "photo.webp", "anim.gif", "iphone.heic",
-    "manual.rtf", "book.epub",
-    "letter.odt", "data.ods", "presentation.odp",
-    "inbox.eml", "outlook.msg",
-    "korean.hwpx", "korean.hwp",
-    "template.dot", "template.dotm",
-    "template.pot", "template.potx",
-    "binary.xlsb", "workspace.xlw",
-    "vector.svg", "signature.p7s",
-])
+@pytest.mark.parametrize(
+    "filename",
+    [
+        "report.pdf",
+        "doc.docx",
+        "old.doc",
+        "sheet.xlsx",
+        "legacy.xls",
+        "slides.pptx",
+        "deck.ppt",
+        "macro.docm",
+        "macro.xlsm",
+        "macro.pptm",
+        "photo.png",
+        "photo.jpg",
+        "photo.jpeg",
+        "scan.bmp",
+        "scan.tiff",
+        "scan.tif",
+        "photo.webp",
+        "anim.gif",
+        "iphone.heic",
+        "manual.rtf",
+        "book.epub",
+        "letter.odt",
+        "data.ods",
+        "presentation.odp",
+        "inbox.eml",
+        "outlook.msg",
+        "korean.hwpx",
+        "korean.hwp",
+        "template.dot",
+        "template.dotm",
+        "template.pot",
+        "template.potx",
+        "binary.xlsb",
+        "workspace.xlw",
+        "vector.svg",
+        "signature.p7s",
+    ],
+)
 def test_document_extensions_are_supported(filename):
     from app.utils.file_extensions import is_supported_document_extension

-    assert is_supported_document_extension(filename) is True, f"{filename} should be supported"
+    assert is_supported_document_extension(filename) is True, (
+        f"{filename} should be supported"
+    )


-@pytest.mark.parametrize("filename", [
-    "malware.exe", "archive.zip", "video.mov", "font.woff2",
-    "model.blend", "random.xyz", "data.parquet", "package.deb",
-])
+@pytest.mark.parametrize(
+    "filename",
+    [
+        "malware.exe",
+        "archive.zip",
+        "video.mov",
+        "font.woff2",
+        "model.blend",
+        "random.xyz",
+        "data.parquet",
+        "package.deb",
+    ],
+)
 def test_non_document_extensions_are_not_supported(filename):
     from app.utils.file_extensions import is_supported_document_extension

-    assert is_supported_document_extension(filename) is False, f"{filename} should NOT be supported"
+    assert is_supported_document_extension(filename) is False, (
+        f"{filename} should NOT be supported"
+    )


 # ---------------------------------------------------------------------------
@@ -67,7 +105,7 @@ def test_union_equals_all_three_sets():
         | LLAMAPARSE_DOCUMENT_EXTENSIONS
         | UNSTRUCTURED_DOCUMENT_EXTENSIONS
     )
-    assert DOCUMENT_EXTENSIONS == expected
+    assert expected == DOCUMENT_EXTENSIONS


 def test_get_extensions_for_docling():
diff --git a/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/DesktopContent.tsx b/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/DesktopContent.tsx
index 1522e153f..957ae9dae 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/DesktopContent.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/DesktopContent.tsx
@@ -3,8 +3,8 @@ import { useEffect, useState } from "react";

 import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card";
 import { Label } from "@/components/ui/label";
-import { Switch } from "@/components/ui/switch";
 import { Spinner } from "@/components/ui/spinner";
+import { Switch } from "@/components/ui/switch";

 export function DesktopContent() {
 	const [isElectron, setIsElectron] = useState(false);
@@ -66,11 +66,7 @@ export function DesktopContent() {
[hunk body lost in extraction: the JSX markup was stripped; the surviving text shows the help copy "Show suggestions while typing in other applications." / "If SurfSense doesn't appear in the list, click + and select it from Applications." being reflowed onto shorter lines]
diff --git a/surfsense_web/app/desktop/suggestion/layout.tsx b/surfsense_web/app/desktop/suggestion/layout.tsx
index 36b7e037b..fd8faf099 100644
--- a/surfsense_web/app/desktop/suggestion/layout.tsx
+++ b/surfsense_web/app/desktop/suggestion/layout.tsx
@@ -4,10 +4,6 @@ export const metadata = {
 	title: "SurfSense Suggestion",
 };

-export default function SuggestionLayout({
-	children,
-}: {
-	children: React.ReactNode;
-}) {
+export default function SuggestionLayout({ children }: { children: React.ReactNode }) {
 	return
[trailing context lost in extraction: stripped JSX leaves only the fragments "{suggestion}" and "Upload and sync your documents to your search space", apparently from adjacent web file diffs whose headers did not survive]
diff --git a/surfsense_web/components/assistant-ui/image.tsx b/surfsense_web/components/assistant-ui/image.tsx
index c147eede4..59781abcf 100644
--- a/surfsense_web/components/assistant-ui/image.tsx
+++ b/surfsense_web/components/assistant-ui/image.tsx
@@ -3,10 +3,10 @@
import type { ImageMessagePartComponent } from "@assistant-ui/react";
import { cva, type VariantProps } from "class-variance-authority";
import { ImageIcon, ImageOffIcon } from "lucide-react";
+import NextImage from "next/image";
import { memo, type PropsWithChildren, useEffect, useRef, useState } from "react";
import { createPortal } from "react-dom";
import { cn } from "@/lib/utils";
-import NextImage from 'next/image';
const imageVariants = cva("aui-image-root relative overflow-hidden rounded-lg", {
variants: {
@@ -88,23 +88,23 @@ function ImagePreview({
[The remaining web hunks are unrecoverable: their JSX markup was stripped in extraction, leaving only text fragments and duplicated lines. What survives indicates formatting-only reflows, with no behavior changes, in four places: the ImagePreview dialog in components/assistant-ui/image.tsx; a chat-thread card rendering {thread.title || "New Chat"}, {relativeTime}, and a "{t("updated") || "Updated"}: <formatted date>" line; DocumentUploadTab (@@ -485,10 +485,15 @@, around the {t("file_size_limit")} hint); and StackedCitations (@@ -341,18 +340,18 @@, the citation favicon/fallback-icon ternary with style={{ zIndex: maxIcons - index }}).]
{citation.title} @@ -341,18 +340,18 @@ function StackedCitations({ id, citations, className, onNavigate }: StackedCitat style={{ zIndex: maxIcons - index }} > {citation.favicon ? ( - - ) : ( - - )} + + ) : ( + + )}