chore: evals

2026-05-21 18:55:16 +02:00 · 2026-05-13 14:02:26 -07:00 · 2026-05-13 14:02:26 -07:00 · 3737118050
commit 3737118050
parent 2402b730fa
122 changed files with 22598 additions and 13 deletions
--- a/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py
+++ b/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py
@ -4,12 +4,34 @@ import os

 from langchain_core.messages import HumanMessage

+# Single-shot prompt for standalone image uploads (.png, .jpg, etc.);
+# sent by ``parse_with_vision_llm``. A standalone image IS the document,
+# so we ask for everything: the visual content plus any text the model
+# can read off it. The output is combined markdown that the chunker
+# treats as the full document body.
 _PROMPT = (
     "Describe this image in markdown. "
     "Transcribe any visible text verbatim. "
     "Be concise but complete — let the image content guide the level of detail."
 )

+# Per-image-in-PDF prompt; sent by ``parse_image_for_description``.
+# Here the image sits *inside* a larger document, and the ETL service
+# (Docling/Azure DI/LlamaCloud/...) is already running OCR over the
+# whole page — including text rendered into images. So we explicitly
+# tell the model NOT to transcribe text and to focus only on visual
+# interpretation. This avoids paying output tokens for OCR content the
+# ETL pipeline already captured.
+#
+# The prompt is two paragraphs separated by a blank line ("\n\n"):
+# first what to describe, then the no-transcription instruction.
+_DESCRIPTION_PROMPT = (
+    "Describe what this image visually depicts in concise markdown. "
+    "Focus on visual content — anatomy, structures, charts, diagrams, "
+    "spatial relationships, colors, modality (e.g. axial CT, ECG strip, "
+    "histology slide), and any clinically or structurally relevant "
+    "findings.\n\n"
+    "Do NOT transcribe text from the image. Any text in the image "
+    "(axis labels, annotations, scale bars, lab values, etc.) is "
+    "already extracted by a separate OCR pipeline; duplicating it "
+    "here would be redundant. Stick to the visual interpretation."
+)
+
+# Image size limit in bytes: 5 * 1024 * 1024 = 5,242,880.
+# NOTE(review): the code that enforces this limit is outside the
+# visible hunk — confirm where it is checked before changing the value.
 _MAX_IMAGE_BYTES = (
     5 * 1024 * 1024
 )  # 5 MB (Anthropic Claude's limit, the most restrictive)
@ -47,11 +69,10 @@ def _image_to_data_url(file_path: str) -> str:
    return f"data:{mime_type};base64,{encoded}"


-async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
-    data_url = _image_to_data_url(file_path)
+async def _invoke_vision(llm, prompt: str, data_url: str, filename: str) -> str:
    message = HumanMessage(
        content=[
-            {"type": "text", "text": _PROMPT},
+            {"type": "text", "text": prompt},
            {"type": "image_url", "image_url": {"url": data_url}},
        ]
    )
@ -62,3 +83,36 @@ async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
    if not text or not text.strip():
        raise ValueError(f"Vision LLM returned empty content for {filename}")
    return text.strip()
+
+
+async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
+    """Describe and transcribe a directly uploaded image in one LLM call.
+
+    This is the path for image files (jpg/png/etc.) that are uploaded on
+    their own. Such an image is the whole document, so ``_PROMPT`` asks
+    for a visual description and a verbatim transcription together.
+
+    The file at ``file_path`` is turned into a data URL and handed, with
+    ``filename``, to ``_invoke_vision``; whatever markdown that helper
+    returns is returned unchanged.
+    """
+    image_url = _image_to_data_url(file_path)
+    markdown = await _invoke_vision(llm, _PROMPT, image_url, filename)
+    return markdown
+
+
+async def parse_image_for_description(
+    file_path: str, filename: str, llm
+) -> str:
+    """Describe an image embedded in a larger document, without OCR.
+
+    Called by ``picture_describer`` for pictures found inside a parsed
+    document. The model is sent ``_DESCRIPTION_PROMPT``, which asks for
+    a markdown description of what the image visually depicts and tells
+    it not to transcribe any text. Text inside the image is left to the
+    ETL service (Docling, Azure DI, LlamaCloud, ...), which already
+    runs OCR over the entire page; transcribing it here as well would
+    duplicate that content.
+
+    The file at ``file_path`` is turned into a data URL and handed, with
+    ``filename``, to ``_invoke_vision``; whatever markdown that helper
+    returns is returned unchanged.
+    """
+    image_url = _image_to_data_url(file_path)
+    description = await _invoke_vision(
+        llm, _DESCRIPTION_PROMPT, image_url, filename
+    )
+    return description
+
+
+# Public API of this module: one entry point per prompt.
+__all__ = [
+    "parse_image_for_description",
+    "parse_with_vision_llm",
+]