Route uploaded images to vision LLM with document-parser fallback

2026-07-12 22:42:13 +02:00 · 2026-04-09 14:33:33 +02:00 · 2026-04-09 14:33:33 +02:00 · 7e90a8ed3c
commit 7e90a8ed3c
parent 78fa2d926a
7 changed files with 199 additions and 5 deletions
--- a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py
+++ b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py
@ -15,6 +15,9 @@ from app.etl_pipeline.parsers.plaintext import read_plaintext
 class EtlPipelineService:
    """Single pipeline for extracting markdown from files. All callers use this."""

+    def __init__(self, *, vision_llm=None):
+        # Optional vision-capable LLM (keyword-only).  When set, image files
+        # are sent to it by _extract_image; when left as None, images go
+        # through the document parser instead.
+        self._vision_llm = vision_llm
+
    async def extract(self, request: EtlRequest) -> EtlResult:
        category = classify_file(request.filename)

@ -47,6 +50,28 @@ class EtlPipelineService:
                content_type="audio",
            )

+        if category == FileCategory.IMAGE:
+            return await self._extract_image(request)
+
+        return await self._extract_document(request)
+
+    async def _extract_image(self, request: EtlRequest) -> EtlResult:
+        """Extract markdown from an image file.
+
+        When a vision LLM was passed to the constructor, the image is sent to
+        it and the result is tagged ``etl_service="VISION_LLM"`` /
+        ``content_type="image"``.  Otherwise the fallback is logged and the
+        request is handed to ``_extract_document``.
+
+        Note: an exception raised by the vision LLM parser propagates to the
+        caller; it does not trigger the document-parser fallback.
+        """
+        if self._vision_llm:
+            # Imported lazily, only on the branch that actually needs it.
+            from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm
+
+            content = await parse_with_vision_llm(
+                request.file_path, request.filename, self._vision_llm
+            )
+            return EtlResult(
+                markdown_content=content,
+                etl_service="VISION_LLM",
+                content_type="image",
+            )
+
+        # No vision LLM configured: treat the image like any other document.
+        logging.info(
+            "No vision LLM provided, falling back to document parser for %s",
+            request.filename,
+        )
        return await self._extract_document(request)

    async def _extract_document(self, request: EtlRequest) -> EtlResult:
--- a/surfsense_backend/app/etl_pipeline/file_classifier.py
+++ b/surfsense_backend/app/etl_pipeline/file_classifier.py
@ -3,6 +3,7 @@ from pathlib import PurePosixPath

 from app.utils.file_extensions import (
    DOCUMENT_EXTENSIONS,
+    IMAGE_EXTENSIONS,
    get_document_extensions_for_service,
 )

@ -105,6 +106,7 @@ class FileCategory(Enum):
    PLAINTEXT = "plaintext"
    AUDIO = "audio"
    DIRECT_CONVERT = "direct_convert"
+    IMAGE = "image"
    UNSUPPORTED = "unsupported"
    DOCUMENT = "document"

@ -117,6 +119,8 @@ def classify_file(filename: str) -> FileCategory:
        return FileCategory.AUDIO
    if suffix in DIRECT_CONVERT_EXTENSIONS:
        return FileCategory.DIRECT_CONVERT
+    if suffix in IMAGE_EXTENSIONS:
+        return FileCategory.IMAGE
    if suffix in DOCUMENT_EXTENSIONS:
        return FileCategory.DOCUMENT
    return FileCategory.UNSUPPORTED
@ -126,12 +130,14 @@ def should_skip_for_service(filename: str, etl_service: str | None) -> bool:
    """Return True if *filename* cannot be processed by *etl_service*.

    Plaintext, audio, and direct-convert files are parser-agnostic and never
-    skipped.  Document files are checked against the per-parser extension set.
+    skipped.  Image and document files are checked against the per-parser
+    extension set (images fall back to the document parser when no vision LLM
+    is available, so the same service constraint applies).
    """
    category = classify_file(filename)
    if category == FileCategory.UNSUPPORTED:
        return True
-    if category == FileCategory.DOCUMENT:
+    if category in (FileCategory.DOCUMENT, FileCategory.IMAGE):
        suffix = PurePosixPath(filename).suffix.lower()
        return suffix not in get_document_extensions_for_service(etl_service)
    return False
--- a/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py
+++ b/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py
@ -0,0 +1,37 @@
+import base64
+import mimetypes
+
+from langchain_core.messages import HumanMessage
+
+# Instruction text sent together with the image in parse_with_vision_llm.
+# It asks for markdown only (no preamble) because the reply is stored
+# directly as the extracted content.
+_PROMPT = (
+    "Analyze this image thoroughly and produce a detailed markdown description.\n\n"
+    "Include:\n"
+    "- All visible text, transcribed verbatim\n"
+    "- Description of diagrams, charts, tables, or visual structures\n"
+    "- Key subjects, objects, or scenes depicted\n\n"
+    "Output only the markdown content, no preamble."
+)
+
+
+def _image_to_data_url(file_path: str) -> str:
+    """Return the file at *file_path* encoded as a ``data:`` URL.
+
+    The MIME type is guessed from the file name; the whole file is read into
+    memory and base64-encoded.
+    """
+    mime_type, _ = mimetypes.guess_type(file_path)
+    # Unknown or non-image guesses are labelled as PNG.
+    # NOTE(review): for formats mimetypes does not recognise, this label may
+    # not match the actual bytes -- confirm the target models tolerate that.
+    if not mime_type or not mime_type.startswith("image/"):
+        mime_type = "image/png"
+    with open(file_path, "rb") as f:
+        encoded = base64.b64encode(f.read()).decode("ascii")
+    return f"data:{mime_type};base64,{encoded}"
+
+
+async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
+    """Describe an image as markdown using a vision-capable chat model.
+
+    Args:
+        file_path: Path of the image on disk; it is read and embedded in the
+            request as a base64 data URL.
+        filename: Name used only in the error message.
+        llm: Object exposing an async ``ainvoke(messages)`` method.
+
+    Returns:
+        The model's reply with surrounding whitespace stripped.
+
+    Raises:
+        ValueError: If the model returns no text.
+    """
+    data_url = _image_to_data_url(file_path)
+    message = HumanMessage(
+        content=[
+            {"type": "text", "text": _PROMPT},
+            {"type": "image_url", "image_url": {"url": data_url}},
+        ]
+    )
+    response = await llm.ainvoke([message])
+    text = response.content if hasattr(response, "content") else str(response)
+    if isinstance(text, list):
+        # Message content may be a list of blocks (plain strings or dicts
+        # such as {"type": "text", "text": ...}) instead of a single string.
+        # Keep only the textual parts so the checks below operate on a str.
+        text = "".join(
+            block if isinstance(block, str) else str(block.get("text") or "")
+            for block in text
+            if isinstance(block, (str, dict))
+        )
+    if not text or not text.strip():
+        raise ValueError(f"Vision LLM returned empty content for {filename}")
+    return text.strip()
--- a/surfsense_backend/app/tasks/document_processors/file_processors.py
+++ b/surfsense_backend/app/tasks/document_processors/file_processors.py
@ -333,6 +333,7 @@ async def process_file_in_background(
 async def _extract_file_content(
    file_path: str,
    filename: str,
+    search_space_id: int,
    session: AsyncSession,
    user_id: str,
    task_logger: TaskLoggingService,
@ -360,6 +361,7 @@ async def _extract_file_content(
            FileCategory.PLAINTEXT: "Reading file",
            FileCategory.DIRECT_CONVERT: "Converting file",
            FileCategory.AUDIO: "Transcribing audio",
+            FileCategory.IMAGE: "Analyzing image",
            FileCategory.UNSUPPORTED: "Unsupported file type",
            FileCategory.DOCUMENT: "Extracting content",
        }
@ -383,7 +385,13 @@ async def _extract_file_content(
        estimated_pages = _estimate_pages_safe(page_limit_service, file_path)
        await page_limit_service.check_page_limit(user_id, estimated_pages)

-    result = await EtlPipelineService().extract(
+    vision_llm = None
+    if category == FileCategory.IMAGE:
+        from app.services.llm_service import get_vision_llm
+
+        vision_llm = await get_vision_llm(session, search_space_id)
+
+    result = await EtlPipelineService(vision_llm=vision_llm).extract(
        EtlRequest(
            file_path=file_path,
            filename=filename,
@ -439,6 +447,7 @@ async def process_file_in_background_with_document(
        markdown_content, etl_service = await _extract_file_content(
            file_path,
            filename,
+            search_space_id,
            session,
            user_id,
            task_logger,
--- a/surfsense_backend/app/utils/file_extensions.py
+++ b/surfsense_backend/app/utils/file_extensions.py
@ -7,10 +7,33 @@ Extensions already covered by PLAINTEXT_EXTENSIONS, AUDIO_EXTENSIONS, or
 DIRECT_CONVERT_EXTENSIONS in file_classifier are NOT repeated here -- these
 sets are exclusively for the "document" ETL path (Docling / LlamaParse /
 Unstructured).
+
+Image extensions intentionally remain in the per-parser sets for fallback
+compatibility.  IMAGE_EXTENSIONS is used only for routing classification.
 """

 from pathlib import PurePosixPath

+# ---------------------------------------------------------------------------
+# Image extensions (used by file_classifier for routing to vision LLM)
+# ---------------------------------------------------------------------------
+
+# Lowercase, dot-prefixed suffixes that classify_file treats as images.
+# Every entry must also be present in DOCUMENT_EXTENSIONS (a unit test
+# asserts this) so that images can still be handled by the document parser
+# when no vision LLM is available.
+IMAGE_EXTENSIONS: frozenset[str] = frozenset(
+    {
+        ".png",
+        ".jpg",
+        ".jpeg",
+        ".gif",
+        ".bmp",
+        ".tiff",
+        ".tif",
+        ".webp",
+        ".svg",
+        ".heic",
+        ".heif",
+    }
+)
+
 # ---------------------------------------------------------------------------
 # Per-parser document extension sets (from official documentation)
 # ---------------------------------------------------------------------------
--- a/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py
+++ b/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py
@ -549,8 +549,11 @@ def test_unsupported_extensions_classified_correctly(filename):
        ("doc.docx", "document"),
        ("slides.pptx", "document"),
        ("sheet.xlsx", "document"),
-        ("photo.png", "document"),
-        ("photo.jpg", "document"),
+        ("photo.png", "image"),
+        ("photo.jpg", "image"),
+        ("photo.webp", "image"),
+        ("photo.gif", "image"),
+        ("photo.heic", "image"),
        ("book.epub", "document"),
        ("letter.odt", "document"),
        ("readme.md", "plaintext"),
@ -680,3 +683,57 @@ async def test_extract_eml_with_docling_raises_unsupported(tmp_path, mocker):
        await EtlPipelineService().extract(
            EtlRequest(file_path=str(eml_file), filename="mail.eml")
        )
+
+
+# ---------------------------------------------------------------------------
+# Image extraction via vision LLM
+# ---------------------------------------------------------------------------
+
+
+async def test_extract_image_with_vision_llm(tmp_path):
+    """An image file is analyzed by the vision LLM when provided."""
+    from unittest.mock import AsyncMock, MagicMock
+
+    # PNG signature plus padding: enough to exist on disk with a .png name;
+    # the mocked LLM never decodes the bytes.
+    img_file = tmp_path / "photo.png"
+    img_file.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 50)
+
+    # Stand-in for a chat model: ainvoke() resolves to an object whose
+    # .content is the markdown the service should return.
+    fake_response = MagicMock()
+    fake_response.content = "# A photo of a sunset over the ocean"
+    fake_llm = AsyncMock()
+    fake_llm.ainvoke.return_value = fake_response
+
+    service = EtlPipelineService(vision_llm=fake_llm)
+    result = await service.extract(
+        EtlRequest(file_path=str(img_file), filename="photo.png")
+    )
+
+    # The result carries the LLM text and is tagged as a vision-LLM image.
+    assert result.markdown_content == "# A photo of a sunset over the ocean"
+    assert result.etl_service == "VISION_LLM"
+    assert result.content_type == "image"
+    fake_llm.ainvoke.assert_called_once()
+
+
+async def test_extract_image_falls_back_to_document_without_vision_llm(
+    tmp_path, mocker
+):
+    """Without a vision LLM, image files fall back to the document parser."""
+    # Select the DOCLING document parser and stub its service factory so no
+    # real parsing runs.
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    fake_docling = mocker.AsyncMock()
+    fake_docling.process_document.return_value = {"content": "# OCR text from image"}
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=fake_docling,
+    )
+
+    img_file = tmp_path / "scan.png"
+    img_file.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 50)
+
+    # No vision_llm argument: the service must take the fallback path.
+    service = EtlPipelineService()
+    result = await service.extract(
+        EtlRequest(file_path=str(img_file), filename="scan.png")
+    )
+
+    # The fallback result is reported as a DOCLING document, not an image.
+    assert result.markdown_content == "# OCR text from image"
+    assert result.etl_service == "DOCLING"
+    assert result.content_type == "document"
--- a/surfsense_backend/tests/unit/utils/test_file_extensions.py
+++ b/surfsense_backend/tests/unit/utils/test_file_extensions.py
@ -154,3 +154,40 @@ def test_get_extensions_for_none_returns_union():
    )

    assert get_document_extensions_for_service(None) == DOCUMENT_EXTENSIONS
+
+
+# ---------------------------------------------------------------------------
+# IMAGE_EXTENSIONS
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize(
+    "ext",
+    [
+        ".png",
+        ".jpg",
+        ".jpeg",
+        ".gif",
+        ".bmp",
+        ".tiff",
+        ".tif",
+        ".webp",
+        ".svg",
+        ".heic",
+        ".heif",
+    ],
+)
+def test_image_extensions_contains_expected(ext):
+    """Each listed extension is a member of IMAGE_EXTENSIONS."""
+    from app.utils.file_extensions import IMAGE_EXTENSIONS
+
+    assert ext in IMAGE_EXTENSIONS
+
+
+def test_image_extensions_are_subset_of_document_extensions():
+    """Image extensions used for routing should also be in DOCUMENT_EXTENSIONS for fallback."""
+    from app.utils.file_extensions import DOCUMENT_EXTENSIONS, IMAGE_EXTENSIONS
+
+    # Set difference: any image suffix not present in DOCUMENT_EXTENSIONS.
+    missing = IMAGE_EXTENSIONS - DOCUMENT_EXTENSIONS
+    assert not missing, (
+        f"Image extensions missing from document sets (breaks fallback): {missing}"
+    )