SurfSense/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py

import base64
import mimetypes
import os

from langchain_core.messages import HumanMessage

# Instruction text sent to the vision model alongside the image: a markdown
# description plus verbatim transcription of any visible text.
_PROMPT = " ".join(
    (
        "Describe this image in markdown.",
        "Transcribe any visible text verbatim.",
        "Be concise but complete — let the image content guide the level of detail.",
    )
)

_MAX_IMAGE_BYTES = 5 * 1024 * 1024  # 5 MB (Anthropic Claude's limit, the most restrictive)


def _sniff_image_mime(header: bytes, default: str = "image/png") -> str:
    """Return the image MIME type implied by a file's leading bytes.

    Recognizes PNG, JPEG, GIF and WebP signatures. Anything else yields
    *default*.
    """
    if header.startswith(b"\x89PNG\r\n\x1a\n"):
        return "image/png"
    if header.startswith(b"\xff\xd8\xff"):
        return "image/jpeg"
    if header.startswith((b"GIF87a", b"GIF89a")):
        return "image/gif"
    # WebP is a RIFF container: "RIFF" <4-byte size> "WEBP".
    if header[:4] == b"RIFF" and header[8:12] == b"WEBP":
        return "image/webp"
    return default


def _image_to_data_url(file_path: str) -> str:
    """Read an image file and return it as a base64 ``data:`` URL.

    The MIME type comes from the file extension when that maps to an
    ``image/*`` type. Otherwise the file's magic bytes are inspected, with
    ``image/png`` as the last-resort default, so an upload stored without a
    usable extension is not mislabeled.

    Args:
        file_path: Path of the image file on disk.

    Returns:
        A string of the form ``data:<mime>;base64,<payload>``.

    Raises:
        ValueError: If the file is larger than ``_MAX_IMAGE_BYTES``.
        OSError: If the file cannot be opened or read.
    """
    with open(file_path, "rb") as f:
        # Stat the open handle so the size check and the read see the same file.
        file_size = os.fstat(f.fileno()).st_size
        # Bounded read: even if the file grows after the stat, never load more
        # than one byte past the limit into memory.
        raw = b"" if file_size > _MAX_IMAGE_BYTES else f.read(_MAX_IMAGE_BYTES + 1)
    file_size = max(file_size, len(raw))
    if file_size > _MAX_IMAGE_BYTES:
        raise ValueError(
            f"Image too large for vision LLM ({file_size / (1024 * 1024):.1f} MB, "
            f"limit {_MAX_IMAGE_BYTES // (1024 * 1024)} MB): {file_path}"
        )

    mime_type, _ = mimetypes.guess_type(file_path)
    if not mime_type or not mime_type.startswith("image/"):
        # Extension is missing or not an image type; trust the content instead.
        mime_type = _sniff_image_mime(raw[:12])

    encoded = base64.b64encode(raw).decode("ascii")
    return f"data:{mime_type};base64,{encoded}"


def _content_to_text(content) -> str:
    """Flatten a chat-model ``content`` payload into plain text.

    Accepts either a plain string or a list of content blocks, where each
    block is a string or a dict shaped like ``{"type": "text", "text": ...}``.
    Non-text blocks are skipped. ``None`` becomes an empty string.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        parts = []
        for block in content:
            if isinstance(block, str):
                parts.append(block)
            elif (
                isinstance(block, dict)
                and block.get("type", "text") == "text"
                and isinstance(block.get("text"), str)
            ):
                parts.append(block["text"])
        return "".join(parts)
    return str(content)


async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
    """Ask a vision-capable chat model to describe an image as markdown.

    The image is inlined as a base64 data URL and sent together with the
    module's prompt in a single human message.

    Args:
        file_path: Path of the image file on disk.
        filename: Name used in the error message when the model returns nothing.
        llm: Chat model exposing an async ``ainvoke(messages)`` method.

    Returns:
        The model's description with surrounding whitespace stripped.

    Raises:
        ValueError: If the model returns empty or whitespace-only content, or
            if the image exceeds the module's size limit.
    """
    data_url = _image_to_data_url(file_path)
    message = HumanMessage(
        content=[
            {"type": "text", "text": _PROMPT},
            {"type": "image_url", "image_url": {"url": data_url}},
        ]
    )
    response = await llm.ainvoke([message])
    raw = response.content if hasattr(response, "content") else str(response)
    # Content may be a list of blocks rather than a str; normalize before
    # stripping so list-shaped responses do not fail on ``.strip()``.
    text = _content_to_text(raw).strip()
    if not text:
        raise ValueError(f"Vision LLM returned empty content for {filename}")
    return text
Route uploaded images to vision LLM with document-parser fallback 2026-04-09 14:33:33 +02:00			`import base64`
			`import mimetypes`
Add 5MB file size guard before base64 encoding for vision LLM 2026-04-09 15:17:08 +02:00			`import os`
Route uploaded images to vision LLM with document-parser fallback 2026-04-09 14:33:33 +02:00
			`from langchain_core.messages import HumanMessage`

			`_PROMPT = (`
Simplify vision LLM image description prompt 2026-04-09 14:56:18 +02:00			`"Describe this image in markdown. "`
			`"Transcribe any visible text verbatim. "`
			`"Be concise but complete — let the image content guide the level of detail."`
Route uploaded images to vision LLM with document-parser fallback 2026-04-09 14:33:33 +02:00			`)`

Add 5MB file size guard before base64 encoding for vision LLM 2026-04-09 15:17:08 +02:00			`_MAX_IMAGE_BYTES = 5 * 1024 * 1024 # 5 MB (Anthropic Claude's limit, the most restrictive)`

Route uploaded images to vision LLM with document-parser fallback 2026-04-09 14:33:33 +02:00
			`def _image_to_data_url(file_path: str) -> str:`
Add 5MB file size guard before base64 encoding for vision LLM 2026-04-09 15:17:08 +02:00			`file_size = os.path.getsize(file_path)`
			`if file_size > _MAX_IMAGE_BYTES:`
			`raise ValueError(`
			`f"Image too large for vision LLM ({file_size / (1024 * 1024):.1f} MB, "`
			`f"limit {_MAX_IMAGE_BYTES // (1024 * 1024)} MB): {file_path}"`
			`)`
Route uploaded images to vision LLM with document-parser fallback 2026-04-09 14:33:33 +02:00			`mime_type, _ = mimetypes.guess_type(file_path)`
			`if not mime_type or not mime_type.startswith("image/"):`
			`mime_type = "image/png"`
			`with open(file_path, "rb") as f:`
			`encoded = base64.b64encode(f.read()).decode("ascii")`
			`return f"data:{mime_type};base64,{encoded}"`


			`async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:`
			`data_url = _image_to_data_url(file_path)`
			`message = HumanMessage(`
			`content=[`
			`{"type": "text", "text": _PROMPT},`
			`{"type": "image_url", "image_url": {"url": data_url}},`
			`]`
			`)`
			`response = await llm.ainvoke([message])`
			`text = response.content if hasattr(response, "content") else str(response)`
			`if not text or not text.strip():`
			`raise ValueError(f"Vision LLM returned empty content for {filename}")`
			`return text.strip()`