Route uploaded images to vision LLM with document-parser fallback

2026-06-26 21:39:43 +02:00 · 2026-04-09 14:33:33 +02:00 · 2026-04-09 14:33:33 +02:00 · 7e90a8ed3c
commit 7e90a8ed3c
parent 78fa2d926a
7 changed files with 199 additions and 5 deletions
--- a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py
+++ b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py
@ -15,6 +15,9 @@ from app.etl_pipeline.parsers.plaintext import read_plaintext
 class EtlPipelineService:
    """Single pipeline for extracting markdown from files. All callers use this."""

+    def __init__(self, *, vision_llm=None):
+        """Create the pipeline service.
+
+        Args:
+            vision_llm: Optional, keyword-only.  Stored as-is and consulted
+                by ``_extract_image``; when it is falsy, image files are
+                handed to the document parser instead.
+        """
+        self._vision_llm = vision_llm
+
    async def extract(self, request: EtlRequest) -> EtlResult:
        category = classify_file(request.filename)

@ -47,6 +50,28 @@ class EtlPipelineService:
                content_type="audio",
            )

+        if category == FileCategory.IMAGE:
+            return await self._extract_image(request)
+
+        return await self._extract_document(request)
+
+    async def _extract_image(self, request: EtlRequest) -> EtlResult:
+        """Extract markdown for a file classified as an image.
+
+        When a vision LLM was supplied at construction time, the image is
+        described by ``parse_with_vision_llm`` and the result is tagged with
+        ``etl_service="VISION_LLM"`` and ``content_type="image"``.  Otherwise
+        the fallback is logged and the request is delegated to
+        ``_extract_document``.
+
+        NOTE(review): an exception raised by ``parse_with_vision_llm`` is not
+        caught here and propagates to the caller; the document-parser
+        fallback is taken only when no vision LLM was provided -- confirm
+        that this is the intended behaviour.
+        """
+        if self._vision_llm:
+            # Imported inside the branch, so the parser module is only loaded
+            # when a vision LLM is actually in use.
+            from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm
+
+            content = await parse_with_vision_llm(
+                request.file_path, request.filename, self._vision_llm
+            )
+            return EtlResult(
+                markdown_content=content,
+                etl_service="VISION_LLM",
+                content_type="image",
+            )
+
+        # No vision LLM: treat the image like any other document.
+        logging.info(
+            "No vision LLM provided, falling back to document parser for %s",
+            request.filename,
+        )
        return await self._extract_document(request)

    async def _extract_document(self, request: EtlRequest) -> EtlResult:
--- a/surfsense_backend/app/etl_pipeline/file_classifier.py
+++ b/surfsense_backend/app/etl_pipeline/file_classifier.py
@ -3,6 +3,7 @@ from pathlib import PurePosixPath

 from app.utils.file_extensions import (
    DOCUMENT_EXTENSIONS,
+    IMAGE_EXTENSIONS,
    get_document_extensions_for_service,
 )

@ -105,6 +106,7 @@ class FileCategory(Enum):
    PLAINTEXT = "plaintext"
    AUDIO = "audio"
    DIRECT_CONVERT = "direct_convert"
+    IMAGE = "image"
    UNSUPPORTED = "unsupported"
    DOCUMENT = "document"

@ -117,6 +119,8 @@ def classify_file(filename: str) -> FileCategory:
        return FileCategory.AUDIO
    if suffix in DIRECT_CONVERT_EXTENSIONS:
        return FileCategory.DIRECT_CONVERT
+    if suffix in IMAGE_EXTENSIONS:
+        return FileCategory.IMAGE
    if suffix in DOCUMENT_EXTENSIONS:
        return FileCategory.DOCUMENT
    return FileCategory.UNSUPPORTED
@ -126,12 +130,14 @@ def should_skip_for_service(filename: str, etl_service: str | None) -> bool:
    """Return True if *filename* cannot be processed by *etl_service*.

    Plaintext, audio, and direct-convert files are parser-agnostic and never
-    skipped.  Document files are checked against the per-parser extension set.
+    skipped.  Image and document files are checked against the per-parser
+    extension set (images fall back to the document parser when no vision LLM
+    is available, so the same service constraint applies).
    """
    category = classify_file(filename)
    if category == FileCategory.UNSUPPORTED:
        return True
-    if category == FileCategory.DOCUMENT:
+    if category in (FileCategory.DOCUMENT, FileCategory.IMAGE):
        suffix = PurePosixPath(filename).suffix.lower()
        return suffix not in get_document_extensions_for_service(etl_service)
    return False
--- a/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py
+++ b/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py
@ -0,0 +1,37 @@
+import base64
+import mimetypes
+
+from langchain_core.messages import HumanMessage
+
+# Instruction text sent as the text part of the message, next to the image,
+# by parse_with_vision_llm.  It asks for markdown only, with no preamble.
+_PROMPT = (
+    "Analyze this image thoroughly and produce a detailed markdown description.\n\n"
+    "Include:\n"
+    "- All visible text, transcribed verbatim\n"
+    "- Description of diagrams, charts, tables, or visual structures\n"
+    "- Key subjects, objects, or scenes depicted\n\n"
+    "Output only the markdown content, no preamble."
+)
+
+
+def _image_to_data_url(file_path: str) -> str:
+    """Return the file at *file_path* encoded as a base64 ``data:`` URL.
+
+    The MIME type is guessed from the path.  When no type can be guessed, or
+    the guess is not an ``image/*`` type, ``image/png`` is used instead.
+
+    The whole file is read into memory with a synchronous ``open``/``read``.
+    """
+    mime_type, _ = mimetypes.guess_type(file_path)
+    if not mime_type or not mime_type.startswith("image/"):
+        # Fall back to a fixed image type rather than failing on unknown
+        # extensions.
+        mime_type = "image/png"
+    with open(file_path, "rb") as f:
+        encoded = base64.b64encode(f.read()).decode("ascii")
+    return f"data:{mime_type};base64,{encoded}"
+
+
+async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
+    """Ask *llm* to describe the image at *file_path* as markdown.
+
+    Args:
+        file_path: Path of the image file; it is read and embedded in the
+            request as a base64 data URL.
+        filename: Used only in the error message.
+        llm: Object exposing an awaitable ``ainvoke(messages)``.
+
+    Returns:
+        The response text with surrounding whitespace stripped.
+
+    Raises:
+        ValueError: If the response text is empty or whitespace-only.
+    """
+    data_url = _image_to_data_url(file_path)
+    # A single human message carrying two parts: the prompt and the image.
+    message = HumanMessage(
+        content=[
+            {"type": "text", "text": _PROMPT},
+            {"type": "image_url", "image_url": {"url": data_url}},
+        ]
+    )
+    response = await llm.ainvoke([message])
+    # Responses without a ``content`` attribute are stringified instead.
+    # NOTE(review): this assumes ``response.content`` is a ``str``; if a
+    # provider returns a list of content blocks, ``.strip()`` below would
+    # fail -- confirm against the models in use.
+    text = response.content if hasattr(response, "content") else str(response)
+    if not text or not text.strip():
+        raise ValueError(f"Vision LLM returned empty content for {filename}")
+    return text.strip()
--- a/surfsense_backend/app/tasks/document_processors/file_processors.py
+++ b/surfsense_backend/app/tasks/document_processors/file_processors.py
@ -333,6 +333,7 @@ async def process_file_in_background(
 async def _extract_file_content(
    file_path: str,
    filename: str,
+    search_space_id: int,
    session: AsyncSession,
    user_id: str,
    task_logger: TaskLoggingService,
@ -360,6 +361,7 @@ async def _extract_file_content(
            FileCategory.PLAINTEXT: "Reading file",
            FileCategory.DIRECT_CONVERT: "Converting file",
            FileCategory.AUDIO: "Transcribing audio",
+            FileCategory.IMAGE: "Analyzing image",
            FileCategory.UNSUPPORTED: "Unsupported file type",
            FileCategory.DOCUMENT: "Extracting content",
        }
@ -383,7 +385,13 @@ async def _extract_file_content(
        estimated_pages = _estimate_pages_safe(page_limit_service, file_path)
        await page_limit_service.check_page_limit(user_id, estimated_pages)

-    result = await EtlPipelineService().extract(
+    vision_llm = None
+    if category == FileCategory.IMAGE:
+        from app.services.llm_service import get_vision_llm
+
+        vision_llm = await get_vision_llm(session, search_space_id)
+
+    result = await EtlPipelineService(vision_llm=vision_llm).extract(
        EtlRequest(
            file_path=file_path,
            filename=filename,
@ -439,6 +447,7 @@ async def process_file_in_background_with_document(
        markdown_content, etl_service = await _extract_file_content(
            file_path,
            filename,
+            search_space_id,
            session,
            user_id,
            task_logger,
--- a/surfsense_backend/app/utils/file_extensions.py
+++ b/surfsense_backend/app/utils/file_extensions.py
@ -7,10 +7,33 @@ Extensions already covered by PLAINTEXT_EXTENSIONS, AUDIO_EXTENSIONS, or
 DIRECT_CONVERT_EXTENSIONS in file_classifier are NOT repeated here -- these
 sets are exclusively for the "document" ETL path (Docling / LlamaParse /
 Unstructured).
+
+Image extensions intentionally remain in the per-parser sets for fallback
+compatibility.  IMAGE_EXTENSIONS is used only for routing classification.
 """

 from pathlib import PurePosixPath

+# ---------------------------------------------------------------------------
+# Image extensions (used by file_classifier for routing to vision LLM)
+# ---------------------------------------------------------------------------
+
+# Entries are lowercase and include the leading dot.
+IMAGE_EXTENSIONS: frozenset[str] = frozenset(
+    {
+        ".png",
+        ".jpg",
+        ".jpeg",
+        ".gif",
+        ".bmp",
+        ".tiff",
+        ".tif",
+        ".webp",
+        ".svg",
+        ".heic",
+        ".heif",
+    }
+)
+
 # ---------------------------------------------------------------------------
 # Per-parser document extension sets (from official documentation)
 # ---------------------------------------------------------------------------