diff --git a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py index fbd2e4e73..5f1495cdb 100644 --- a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py +++ b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py @@ -15,6 +15,9 @@ from app.etl_pipeline.parsers.plaintext import read_plaintext class EtlPipelineService: """Single pipeline for extracting markdown from files. All callers use this.""" + def __init__(self, *, vision_llm=None): + self._vision_llm = vision_llm + async def extract(self, request: EtlRequest) -> EtlResult: category = classify_file(request.filename) @@ -47,6 +50,28 @@ class EtlPipelineService: content_type="audio", ) + if category == FileCategory.IMAGE: + return await self._extract_image(request) + + return await self._extract_document(request) + + async def _extract_image(self, request: EtlRequest) -> EtlResult: + if self._vision_llm: + from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm + + content = await parse_with_vision_llm( + request.file_path, request.filename, self._vision_llm + ) + return EtlResult( + markdown_content=content, + etl_service="VISION_LLM", + content_type="image", + ) + + logging.info( + "No vision LLM provided, falling back to document parser for %s", + request.filename, + ) return await self._extract_document(request) async def _extract_document(self, request: EtlRequest) -> EtlResult: diff --git a/surfsense_backend/app/etl_pipeline/file_classifier.py b/surfsense_backend/app/etl_pipeline/file_classifier.py index 4e690bcdc..120369a27 100644 --- a/surfsense_backend/app/etl_pipeline/file_classifier.py +++ b/surfsense_backend/app/etl_pipeline/file_classifier.py @@ -3,6 +3,7 @@ from pathlib import PurePosixPath from app.utils.file_extensions import ( DOCUMENT_EXTENSIONS, + IMAGE_EXTENSIONS, get_document_extensions_for_service, ) @@ -105,6 +106,7 @@ class FileCategory(Enum): PLAINTEXT = "plaintext" AUDIO = "audio" 
DIRECT_CONVERT = "direct_convert" + IMAGE = "image" UNSUPPORTED = "unsupported" DOCUMENT = "document" @@ -117,6 +119,8 @@ def classify_file(filename: str) -> FileCategory: return FileCategory.AUDIO if suffix in DIRECT_CONVERT_EXTENSIONS: return FileCategory.DIRECT_CONVERT + if suffix in IMAGE_EXTENSIONS: + return FileCategory.IMAGE if suffix in DOCUMENT_EXTENSIONS: return FileCategory.DOCUMENT return FileCategory.UNSUPPORTED @@ -126,12 +130,14 @@ def should_skip_for_service(filename: str, etl_service: str | None) -> bool: """Return True if *filename* cannot be processed by *etl_service*. Plaintext, audio, and direct-convert files are parser-agnostic and never - skipped. Document files are checked against the per-parser extension set. + skipped. Image and document files are checked against the per-parser + extension set (images fall back to the document parser when no vision LLM + is available, so the same service constraint applies). """ category = classify_file(filename) if category == FileCategory.UNSUPPORTED: return True - if category == FileCategory.DOCUMENT: + if category in (FileCategory.DOCUMENT, FileCategory.IMAGE): suffix = PurePosixPath(filename).suffix.lower() return suffix not in get_document_extensions_for_service(etl_service) return False diff --git a/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py b/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py new file mode 100644 index 000000000..e75f81c4b --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py @@ -0,0 +1,37 @@ +import base64 +import mimetypes + +from langchain_core.messages import HumanMessage + +_PROMPT = ( + "Analyze this image thoroughly and produce a detailed markdown description.\n\n" + "Include:\n" + "- All visible text, transcribed verbatim\n" + "- Description of diagrams, charts, tables, or visual structures\n" + "- Key subjects, objects, or scenes depicted\n\n" + "Output only the markdown content, no preamble." 
+) + + +def _image_to_data_url(file_path: str) -> str: + mime_type, _ = mimetypes.guess_type(file_path) + if not mime_type or not mime_type.startswith("image/"): + mime_type = "image/png" + with open(file_path, "rb") as f: + encoded = base64.b64encode(f.read()).decode("ascii") + return f"data:{mime_type};base64,{encoded}" + + +async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str: + data_url = _image_to_data_url(file_path) + message = HumanMessage( + content=[ + {"type": "text", "text": _PROMPT}, + {"type": "image_url", "image_url": {"url": data_url}}, + ] + ) + response = await llm.ainvoke([message]) + text = response.content if hasattr(response, "content") else str(response) + if not text or not text.strip(): + raise ValueError(f"Vision LLM returned empty content for {filename}") + return text.strip() diff --git a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py index c765dbd87..9992231e0 100644 --- a/surfsense_backend/app/tasks/document_processors/file_processors.py +++ b/surfsense_backend/app/tasks/document_processors/file_processors.py @@ -333,6 +333,7 @@ async def process_file_in_background( async def _extract_file_content( file_path: str, filename: str, + search_space_id: int, session: AsyncSession, user_id: str, task_logger: TaskLoggingService, @@ -360,6 +361,7 @@ async def _extract_file_content( FileCategory.PLAINTEXT: "Reading file", FileCategory.DIRECT_CONVERT: "Converting file", FileCategory.AUDIO: "Transcribing audio", + FileCategory.IMAGE: "Analyzing image", FileCategory.UNSUPPORTED: "Unsupported file type", FileCategory.DOCUMENT: "Extracting content", } @@ -383,7 +385,13 @@ async def _extract_file_content( estimated_pages = _estimate_pages_safe(page_limit_service, file_path) await page_limit_service.check_page_limit(user_id, estimated_pages) - result = await EtlPipelineService().extract( + vision_llm = None + if category == FileCategory.IMAGE:
+ from app.services.llm_service import get_vision_llm + + vision_llm = await get_vision_llm(session, search_space_id) + + result = await EtlPipelineService(vision_llm=vision_llm).extract( EtlRequest( file_path=file_path, filename=filename, @@ -439,6 +447,7 @@ async def process_file_in_background_with_document( markdown_content, etl_service = await _extract_file_content( file_path, filename, + search_space_id, session, user_id, task_logger, diff --git a/surfsense_backend/app/utils/file_extensions.py b/surfsense_backend/app/utils/file_extensions.py index 16ac585b7..e8be1b83a 100644 --- a/surfsense_backend/app/utils/file_extensions.py +++ b/surfsense_backend/app/utils/file_extensions.py @@ -7,10 +7,33 @@ Extensions already covered by PLAINTEXT_EXTENSIONS, AUDIO_EXTENSIONS, or DIRECT_CONVERT_EXTENSIONS in file_classifier are NOT repeated here -- these sets are exclusively for the "document" ETL path (Docling / LlamaParse / Unstructured). + +Image extensions intentionally remain in the per-parser sets for fallback +compatibility. IMAGE_EXTENSIONS is used only for routing classification. 
""" from pathlib import PurePosixPath +# --------------------------------------------------------------------------- +# Image extensions (used by file_classifier for routing to vision LLM) +# --------------------------------------------------------------------------- + +IMAGE_EXTENSIONS: frozenset[str] = frozenset( + { + ".png", + ".jpg", + ".jpeg", + ".gif", + ".bmp", + ".tiff", + ".tif", + ".webp", + ".svg", + ".heic", + ".heif", + } +) + # --------------------------------------------------------------------------- # Per-parser document extension sets (from official documentation) # --------------------------------------------------------------------------- diff --git a/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py b/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py index 9608b011d..4e1d603a3 100644 --- a/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py +++ b/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py @@ -549,8 +549,11 @@ def test_unsupported_extensions_classified_correctly(filename): ("doc.docx", "document"), ("slides.pptx", "document"), ("sheet.xlsx", "document"), - ("photo.png", "document"), - ("photo.jpg", "document"), + ("photo.png", "image"), + ("photo.jpg", "image"), + ("photo.webp", "image"), + ("photo.gif", "image"), + ("photo.heic", "image"), ("book.epub", "document"), ("letter.odt", "document"), ("readme.md", "plaintext"), @@ -680,3 +683,57 @@ async def test_extract_eml_with_docling_raises_unsupported(tmp_path, mocker): await EtlPipelineService().extract( EtlRequest(file_path=str(eml_file), filename="mail.eml") ) + + +# --------------------------------------------------------------------------- +# Image extraction via vision LLM +# --------------------------------------------------------------------------- + + +async def test_extract_image_with_vision_llm(tmp_path): + """An image file is analyzed by the vision LLM when provided.""" + from unittest.mock 
import AsyncMock, MagicMock + + img_file = tmp_path / "photo.png" + img_file.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 50) + + fake_response = MagicMock() + fake_response.content = "# A photo of a sunset over the ocean" + fake_llm = AsyncMock() + fake_llm.ainvoke.return_value = fake_response + + service = EtlPipelineService(vision_llm=fake_llm) + result = await service.extract( + EtlRequest(file_path=str(img_file), filename="photo.png") + ) + + assert result.markdown_content == "# A photo of a sunset over the ocean" + assert result.etl_service == "VISION_LLM" + assert result.content_type == "image" + fake_llm.ainvoke.assert_called_once() + + +async def test_extract_image_falls_back_to_document_without_vision_llm( + tmp_path, mocker +): + """Without a vision LLM, image files fall back to the document parser.""" + mocker.patch("app.config.config.ETL_SERVICE", "DOCLING") + + fake_docling = mocker.AsyncMock() + fake_docling.process_document.return_value = {"content": "# OCR text from image"} + mocker.patch( + "app.services.docling_service.create_docling_service", + return_value=fake_docling, + ) + + img_file = tmp_path / "scan.png" + img_file.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 50) + + service = EtlPipelineService() + result = await service.extract( + EtlRequest(file_path=str(img_file), filename="scan.png") + ) + + assert result.markdown_content == "# OCR text from image" + assert result.etl_service == "DOCLING" + assert result.content_type == "document" diff --git a/surfsense_backend/tests/unit/utils/test_file_extensions.py b/surfsense_backend/tests/unit/utils/test_file_extensions.py index 43dfef5f0..ccf5eb70f 100644 --- a/surfsense_backend/tests/unit/utils/test_file_extensions.py +++ b/surfsense_backend/tests/unit/utils/test_file_extensions.py @@ -154,3 +154,40 @@ def test_get_extensions_for_none_returns_union(): ) assert get_document_extensions_for_service(None) == DOCUMENT_EXTENSIONS + + +# 
--------------------------------------------------------------------------- +# IMAGE_EXTENSIONS +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "ext", + [ + ".png", + ".jpg", + ".jpeg", + ".gif", + ".bmp", + ".tiff", + ".tif", + ".webp", + ".svg", + ".heic", + ".heif", + ], +) +def test_image_extensions_contains_expected(ext): + from app.utils.file_extensions import IMAGE_EXTENSIONS + + assert ext in IMAGE_EXTENSIONS + + +def test_image_extensions_are_subset_of_document_extensions(): + """Image extensions used for routing should also be in DOCUMENT_EXTENSIONS for fallback.""" + from app.utils.file_extensions import DOCUMENT_EXTENSIONS, IMAGE_EXTENSIONS + + missing = IMAGE_EXTENSIONS - DOCUMENT_EXTENSIONS + assert not missing, ( + f"Image extensions missing from document sets (breaks fallback): {missing}" + )