Route uploaded images to vision LLM with document-parser fallback

2026-05-08 23:32:40 +02:00 · 2026-04-09 14:33:33 +02:00 · 2026-04-09 14:33:33 +02:00 · 7e90a8ed3c
commit 7e90a8ed3c
parent 78fa2d926a
7 changed files with 199 additions and 5 deletions
--- a/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py
+++ b/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py
@@ -0,0 +1,37 @@
+import base64
+import mimetypes
+
+from langchain_core.messages import HumanMessage
+
+_PROMPT = (
+    "Analyze this image thoroughly and produce a detailed markdown description.\n\n"
+    "Include:\n"
+    "- All visible text, transcribed verbatim\n"
+    "- Description of diagrams, charts, tables, or visual structures\n"
+    "- Key subjects, objects, or scenes depicted\n\n"
+    "Output only the markdown content, no preamble."
+)
+
+
+def _image_to_data_url(file_path: str) -> str:
+    mime_type, _ = mimetypes.guess_type(file_path)
+    if not mime_type or not mime_type.startswith("image/"):
+        mime_type = "image/png"
+    with open(file_path, "rb") as f:
+        encoded = base64.b64encode(f.read()).decode("ascii")
+    return f"data:{mime_type};base64,{encoded}"
+
+
+async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
+    data_url = _image_to_data_url(file_path)
+    message = HumanMessage(
+        content=[
+            {"type": "text", "text": _PROMPT},
+            {"type": "image_url", "image_url": {"url": data_url}},
+        ]
+    )
+    response = await llm.ainvoke([message])
+    text = response.content if hasattr(response, "content") else str(response)
+    if not text or not text.strip():
+        raise ValueError(f"Vision LLM returned empty content for {filename}")
+    return text.strip()