chore: evals

This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-05-13 14:02:26 -07:00
parent 2402b730fa
commit 3737118050
122 changed files with 22598 additions and 13 deletions

View file

@ -134,12 +134,92 @@ class EtlPipelineService:
else:
raise EtlServiceUnavailableError(f"Unknown ETL_SERVICE: {etl_service}")
# When the operator opts into vision-LLM at ingest, walk the
# original file's embedded images and add a per-image block to the
# markdown: inline where the parser left an image marker, otherwise
# in an appended "Image Content" section. The parser's own OCR
# (Docling do_ocr=True, Azure DI prebuilt-read, etc.) handles
# text-in-image; this side adds the *visual* description, which the
# parsers all drop today.
content = await self._maybe_append_picture_descriptions(request, content)
return EtlResult(
markdown_content=content,
etl_service=etl_service,
content_type="document",
)
async def _maybe_append_picture_descriptions(
    self, request: EtlRequest, markdown: str
) -> str:
    """Enrich parser markdown with descriptions of the file's embedded images.

    ``markdown`` is returned untouched when the service has no vision LLM,
    when picture description raises, or when no image ends up described.
    Otherwise the result of ``merge_descriptions_into_markdown`` is
    returned.
    """
    vision_llm = self._vision_llm
    if vision_llm is None:
        return markdown

    from app.etl_pipeline.picture_describer import (
        describe_pictures,
        merge_descriptions_into_markdown,
    )

    async def _ocr_image(image_path: str, image_name: str) -> str:
        # OCR callback handed to describe_pictures: push one extracted
        # image back through the ETL pipeline as a standalone file. The
        # throwaway service is built with vision_llm=None, so the nested
        # extract() call returns from this method at the guard above and
        # cannot recurse into picture description again.
        try:
            ocr_only_service = EtlPipelineService(vision_llm=None)
            standalone = EtlRequest(file_path=image_path, filename=image_name)
            extracted = await ocr_only_service.extract(standalone)
        except (EtlUnsupportedFileError, EtlServiceUnavailableError) as exc:
            # Expected when the configured backend cannot handle this
            # image; stay at debug level and report "no OCR text".
            logging.debug("Skipping per-image OCR for %s: %s", image_name, exc)
            return ""
        else:
            return extracted.markdown_content

    try:
        extraction = await describe_pictures(
            request.file_path,
            request.filename,
            vision_llm,
            ocr_runner=_ocr_image,
        )
    except Exception:
        # Descriptions are a bonus on top of a successful parse; a
        # failure here must not fail the document.
        logging.warning(
            "Picture description failed for %s, returning parser output unchanged",
            request.filename,
            exc_info=True,
        )
        return markdown

    described = extraction.descriptions
    if not described:
        return markdown

    enriched = merge_descriptions_into_markdown(markdown, extraction)
    logging.info(
        "Vision LLM described %d image(s) in %s "
        "(skipped: %d small / %d large / %d duplicate, %d failed)",
        len(described),
        request.filename,
        extraction.skipped_too_small,
        extraction.skipped_too_large,
        extraction.skipped_duplicate,
        extraction.failed,
    )
    return enriched
async def _extract_with_llamacloud(self, request: EtlRequest) -> str:
"""Try Azure Document Intelligence first (when configured) then LlamaCloud.

View file

@ -4,12 +4,34 @@ import os
from langchain_core.messages import HumanMessage
# Single-shot prompt used by standalone image uploads (.png/.jpg/etc).
# A standalone image IS the document, so we want everything: visual
# content plus any text the model can read off it. The output is
# combined markdown that the chunker treats as the full document body.
_PROMPT = (
"Describe this image in markdown. "
"Transcribe any visible text verbatim. "
"Be concise but complete — let the image content guide the level of detail."
)
# Per-image-in-PDF prompt. Here the image is *inside* a larger
# document, and the ETL service (Docling/Azure DI/LlamaCloud/...) is
# already running OCR over the whole page — including text rendered
# into images. So we explicitly tell the model NOT to transcribe text
# and to focus only on visual interpretation. This avoids paying
# output tokens for OCR content the ETL pipeline already captured.
_DESCRIPTION_PROMPT = (
"Describe what this image visually depicts in concise markdown. "
"Focus on visual content — anatomy, structures, charts, diagrams, "
"spatial relationships, colors, modality (e.g. axial CT, ECG strip, "
"histology slide), and any clinically or structurally relevant "
"findings.\n\n"
"Do NOT transcribe text from the image. Any text in the image "
"(axis labels, annotations, scale bars, lab values, etc.) is "
"already extracted by a separate OCR pipeline; duplicating it "
"here would be redundant. Stick to the visual interpretation."
)
_MAX_IMAGE_BYTES = (
5 * 1024 * 1024
) # 5 MB (Anthropic Claude's limit, the most restrictive)
@ -47,11 +69,10 @@ def _image_to_data_url(file_path: str) -> str:
return f"data:{mime_type};base64,{encoded}"
async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
data_url = _image_to_data_url(file_path)
async def _invoke_vision(llm, prompt: str, data_url: str, filename: str) -> str:
message = HumanMessage(
content=[
{"type": "text", "text": _PROMPT},
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": data_url}},
]
)
@ -62,3 +83,36 @@ async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
if not text or not text.strip():
raise ValueError(f"Vision LLM returned empty content for {filename}")
return text.strip()
async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
    """Turn a standalone image upload into markdown with one vision call.

    The image at ``file_path`` is encoded as a data URL and sent with
    ``_PROMPT``, which asks for a description of the image plus a verbatim
    transcription of any visible text. ``filename`` is forwarded to
    ``_invoke_vision`` unchanged.
    """
    encoded_image = _image_to_data_url(file_path)
    markdown = await _invoke_vision(llm, _PROMPT, encoded_image, filename)
    return markdown
async def parse_image_for_description(
    file_path: str, filename: str, llm
) -> str:
    """Ask the vision LLM for a visual-only description of one image.

    Intended for images embedded in a larger document (the
    ``picture_describer`` path). The request uses ``_DESCRIPTION_PROMPT``,
    which tells the model not to transcribe text, on the premise that the
    ETL service's own OCR already covers text rendered into the page.
    """
    encoded_image = _image_to_data_url(file_path)
    visual_markdown = await _invoke_vision(
        llm, _DESCRIPTION_PROMPT, encoded_image, filename
    )
    return visual_markdown
__all__ = [
"parse_image_for_description",
"parse_with_vision_llm",
]

View file

@ -0,0 +1,678 @@
"""Extract embedded images from PDFs, describe them, and inject the
descriptions inline into the parser's markdown.
When the operator passes ``use_vision_llm=True`` for a PDF, the document
parsers (DOCLING / LLAMACLOUD / Azure DI / UNSTRUCTURED) extract text
but mostly drop the actual image content -- a CT scan inside a clinical
PDF becomes (at best) a ``<!-- image -->`` placeholder in the markdown,
and the caption text below it.
This module fills that gap. After the document parser produces markdown
text, we:
1. Walk the original PDF with :mod:`pypdf`, pulling out each embedded
image (deduped by sha256, size-capped to match the vision LLM's own
limits).
2. Run the vision LLM on each unique image (visual description) and,
in parallel when an OCR runner is provided, re-feed the same image
through the ETL service for per-image OCR.
3. **Inject** a horizontal-rule-delimited markdown section -- with
named "OCR text" and "Visual description" sub-sections -- where the
image actually appears in the parser's markdown. Two splice modes,
chosen by which marker the parser emitted:
- **Replace** Docling-style ``<!-- image -->`` placeholders (and an
optional ``Image: <filename>`` caption line). The placeholder
carries no useful content of its own, so we substitute our block
for it.
- **Append after** layout-aware ``<figure>...</figure>`` blocks
(Azure DI ``prebuilt-layout``, LlamaCloud premium). Those blocks
already contain parser-extracted chart values / OCR'd labels /
captions, which are themselves useful for retrieval -- so we
PRESERVE the figure verbatim and add our vision-LLM block
immediately after it. The chunk then contains both the parser's
structured numbers AND the VLM's semantic interpretation.
Either way, the image content stays in context with the surrounding
document body rather than getting orphaned at the end -- crucial for
retrieval, where a single chunk should contain the question, the
image content, and the answer options together.
If no placeholders, figures, or captions can be matched (e.g. an
unusual parser output), we fall back to appending an
``## Image Content`` section so no image content is silently lost.
"""
from __future__ import annotations
import asyncio
import contextlib
import hashlib
import logging
import re
import tempfile
from collections.abc import Awaitable, Callable
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
# Type alias for the OCR callback. Takes (file_path, filename), returns
# the OCR'd markdown text -- or empty string if no text was found, or
# raises if OCR failed unrecoverably (which the describer catches and
# treats as "no OCR for this image" rather than failing the whole doc).
OcrRunner = Callable[[str, str], Awaitable[str]]
logger = logging.getLogger(__name__)
# Bound how many vision LLM calls we make in parallel for a single
# document. Vision models are typically rate-limited; 4 concurrent
# calls is a safe default that respects most provider limits while
# keeping wall-clock manageable for image-heavy PDFs.
_VISION_CONCURRENCY = 4
# Match parse_with_vision_llm's per-image cap so we don't even attempt
# images that the vision LLM would reject anyway (Anthropic's 5 MB
# limit is the most restrictive among the major providers).
_MAX_IMAGE_BYTES = 5 * 1024 * 1024
# Skip degenerate images: tracking pixels, very small decorative dots,
# scanner-introduced artefacts. We can't cheaply check pixel dimensions
# without decoding the image, so we approximate: anything under 1 KB is
# almost certainly not informative content.
_MIN_IMAGE_BYTES = 1024
@dataclass
class PictureDescription:
    """One embedded image plus the text produced for it.

    The two content fields come from different tools:

    - ``description`` is the vision LLM's markdown account of what the
      image shows.
    - ``ocr_text`` is text read off the image by the OCR runner (the
      image re-fed through the configured ETL service). It stays ``None``
      when no runner was supplied, when OCR failed, or when OCR returned
      only whitespace.
    """

    # Where the image was found in the source document.
    page_number: int  # 1-indexed page
    ordinal_in_page: int  # 0-indexed position among the page's images
    # Identity of the image.
    name: str  # name reported by pypdf (e.g. "Im0")
    sha256: str  # hex digest of the raw image bytes
    # Generated content.
    description: str  # visual description (markdown)
    ocr_text: str | None = None  # per-image OCR text, when available
@dataclass
class PictureExtractionResult:
    """Outcome of one picture-extraction pass over a document.

    ``descriptions`` holds the images that were described; the integer
    counters record how many images were left out and why.
    """

    descriptions: list[PictureDescription] = field(default_factory=list)
    skipped_too_small: int = 0  # below the minimum byte size
    skipped_too_large: int = 0  # above the maximum byte size
    skipped_duplicate: int = 0  # same sha256 as an earlier image
    failed: int = 0  # describing the image returned nothing

    @property
    def has_content(self) -> bool:
        """True when at least one image was described."""
        return len(self.descriptions) > 0
def _is_pdf(filename: str) -> bool:
    """Return True when ``filename`` ends in ``.pdf``, ignoring case."""
    return filename.lower()[-4:] == ".pdf"
def _pick_suffix(name: str) -> str:
    """Choose a temp-file suffix for an image from its name.

    A recognised image extension is returned as-is, except ``.jpg`` which
    is normalised to ``.jpeg``. Names with no recognised extension get
    ``.png``. Matching is case-insensitive.
    """
    known_extensions = (
        ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif", ".webp",
    )
    lowered = name.lower()
    matched = next(
        (ext for ext in known_extensions if lowered.endswith(ext)), None
    )
    if matched is None:
        return ".png"
    if matched == ".jpg":
        return ".jpeg"
    return matched
def _extract_pdf_images(file_path: str) -> list[tuple[int, int, str, bytes]]:
    """Collect the embedded images of a PDF via pypdf.

    Each entry is ``(page_number, ordinal_in_page, name, data)`` with a
    1-indexed page number and a 0-indexed position within the page. An
    image without a ``name`` gets a synthetic ``page<N>_img<M>`` one.

    Failures are logged and skipped at three levels: opening the file
    (returns an empty list), listing a page's images (skips the page) and
    reading one image (skips that image).
    """
    from pypdf import PdfReader

    extracted: list[tuple[int, int, str, bytes]] = []

    try:
        reader = PdfReader(file_path)
    except Exception:
        logger.warning(
            "pypdf failed to open %s for image extraction",
            file_path,
            exc_info=True,
        )
        return extracted

    for page_no, page in enumerate(reader.pages, start=1):
        try:
            page_images = list(page.images)
        except Exception:
            logger.warning(
                "pypdf failed to enumerate images on page %d of %s",
                page_no,
                file_path,
                exc_info=True,
            )
            continue

        for position, image in enumerate(page_images):
            try:
                label = getattr(image, "name", None) or f"page{page_no}_img{position}"
                payload = image.data
            except Exception:
                logger.warning(
                    "pypdf failed to read image %d on page %d of %s",
                    position,
                    page_no,
                    file_path,
                    exc_info=True,
                )
                continue
            extracted.append((page_no, position, label, payload))

    return extracted
async def _describe_one(
    page_number: int,
    ordinal: int,
    name: str,
    sha256: str,
    data: bytes,
    vision_llm: Any,
    semaphore: asyncio.Semaphore,
    ocr_runner: OcrRunner | None,
) -> PictureDescription | None:
    """Describe one extracted image and, when a runner is given, OCR it.

    The image bytes are written to a temporary file that both the vision
    LLM helper and ``ocr_runner`` open by path. The two calls run together
    under ``semaphore``.

    Returns:
        A ``PictureDescription``, or ``None`` when the vision LLM call
        failed. An OCR failure is logged and only leaves ``ocr_text`` as
        ``None``. OCR output that is empty after stripping is also
        recorded as ``None``.
    """
    from app.etl_pipeline.parsers.vision_llm import parse_image_for_description

    # delete=False because the helpers reopen the file by path; removal is
    # our job, in the ``finally`` below.
    tmp = tempfile.NamedTemporaryFile(suffix=_pick_suffix(name), delete=False)
    tmp_path = tmp.name
    try:
        # The write sits inside the try so a failed write (e.g. disk
        # full) still reaches the cleanup and leaves no stray temp file.
        with tmp:
            tmp.write(data)

        async with semaphore:
            tasks: list[Awaitable[Any]] = [
                parse_image_for_description(tmp_path, name, vision_llm),
            ]
            if ocr_runner is not None:
                tasks.append(ocr_runner(tmp_path, name))
            # return_exceptions=True so a failure in one branch (most
            # often OCR) does not discard the other branch's result.
            results = await asyncio.gather(*tasks, return_exceptions=True)

        description_result = results[0]
        if isinstance(description_result, BaseException):
            logger.warning(
                "Vision LLM failed for image %s on page %d, skipping",
                name,
                page_number,
                exc_info=description_result,
            )
            return None
        description = str(description_result)

        ocr_text: str | None = None
        if ocr_runner is not None and len(results) > 1:
            ocr_result = results[1]
            if isinstance(ocr_result, BaseException):
                logger.warning(
                    "Per-image OCR failed for image %s on page %d, "
                    "omitting OCR field for this image",
                    name,
                    page_number,
                    exc_info=ocr_result,
                )
            else:
                # Whitespace-only OCR means no text was found; store None
                # so the rendered block carries no empty OCR section.
                ocr_text = str(ocr_result).strip() or None
    finally:
        with contextlib.suppress(OSError):
            Path(tmp_path).unlink()

    return PictureDescription(
        page_number=page_number,
        ordinal_in_page=ordinal,
        name=name,
        sha256=sha256,
        description=description,
        ocr_text=ocr_text,
    )
async def describe_pictures(
    file_path: str,
    filename: str,
    vision_llm: Any,
    *,
    ocr_runner: OcrRunner | None = None,
) -> PictureExtractionResult:
    """Describe every eligible image embedded in a PDF.

    Images above ``_MAX_IMAGE_BYTES``, below ``_MIN_IMAGE_BYTES``, or
    byte-identical to an earlier image (by sha256) are counted as skipped
    and not sent anywhere. The rest are handed to ``_describe_one``, with
    at most ``_VISION_CONCURRENCY`` in flight at a time. When
    ``ocr_runner`` is given, it is passed through so each image is also
    OCR'd.

    Only PDFs are handled: a non-``.pdf`` ``filename`` or a ``None``
    ``vision_llm`` yields an empty result.
    """
    outcome = PictureExtractionResult()
    if not _is_pdf(filename) or vision_llm is None:
        return outcome

    hashes_seen: set[str] = set()
    candidates: list[tuple[int, int, str, str, bytes]] = []
    for page_number, ordinal, name, data in _extract_pdf_images(file_path):
        size = len(data)
        if size > _MAX_IMAGE_BYTES:
            outcome.skipped_too_large += 1
        elif size < _MIN_IMAGE_BYTES:
            outcome.skipped_too_small += 1
        else:
            digest = hashlib.sha256(data).hexdigest()
            if digest in hashes_seen:
                outcome.skipped_duplicate += 1
            else:
                hashes_seen.add(digest)
                candidates.append((page_number, ordinal, name, digest, data))

    if not candidates:
        return outcome

    gate = asyncio.Semaphore(_VISION_CONCURRENCY)
    described = await asyncio.gather(
        *(
            _describe_one(page, pos, label, digest, payload, vision_llm, gate, ocr_runner)
            for (page, pos, label, digest, payload) in candidates
        )
    )

    # _describe_one returns None for an image whose description failed.
    outcome.descriptions.extend(item for item in described if item is not None)
    outcome.failed += sum(1 for item in described if item is None)
    return outcome
# ---------------------------------------------------------------------------
# Rendering: build the per-image markdown block + inject inline.
# ---------------------------------------------------------------------------
def _format_image_block(
    name: str,
    description: str,
    ocr_text: str | None = None,
) -> str:
    """Render one image as a horizontal-rule-delimited markdown section.

    The block uses only top-level markdown (bold labels plus a free-form
    body) rather than raw HTML/XML or a blockquote, so lists and tables
    inside ``description`` render normally.

    Layout (the OCR part is omitted when ``ocr_text`` is None or blank)::

        ---

        **Embedded image:** `MM-130-a.jpeg`

        **OCR text:**
        Slice 24 / 60
        L
        R

        **Visual description:**

        - Axial contrast-enhanced CT showing a large cystic mass...

        ---

    Args:
        name: File or object name shown after the "Embedded image" label.
        description: Vision-LLM markdown; surrounding whitespace is
            stripped.
        ocr_text: Optional OCR text. Blank lines are dropped and each
            remaining line is right-stripped.

    Returns:
        The block wrapped in blank lines and ``---`` rules on both sides,
        so it never merges with neighbouring paragraphs after splicing.
    """
    # A Markdown hard line break needs at least TWO trailing spaces; one
    # trailing space is a soft wrap and the OCR lines would run together
    # into a single paragraph. Built with `* 2` so the width stays
    # explicit even if an editor or diff view collapses whitespace.
    hard_break = " " * 2

    parts: list[str] = [f"**Embedded image:** `{name}`"]

    ocr_lines = [
        ln.rstrip() for ln in (ocr_text or "").strip().splitlines() if ln.strip()
    ]
    if ocr_lines:
        parts.append("")
        # The label ends with a hard break so the first OCR line sits
        # directly beneath it instead of starting a new paragraph.
        parts.append(f"**OCR text:**{hard_break}")
        # Every OCR line but the last ends with a hard break, so
        # multi-line OCR renders line by line without a fenced code block.
        parts.extend(f"{ln}{hard_break}" for ln in ocr_lines[:-1])
        parts.append(ocr_lines[-1])

    parts.extend(["", "**Visual description:**", "", description.strip()])

    body = "\n".join(parts)
    return "\n\n---\n\n" + body + "\n\n---\n\n"
# Patterns we'll try to splice into. Each pattern captures the
# original-PDF filename when one is available (group 1).
#
# Replace-style markers (the matched span is substituted with our block
# because it carries no useful content of its own):
#
# 1. Docling's image placeholder followed by an "Image: <filename>"
# caption line. This is what our medxpertqa renderer produces:
# reportlab places the JPEG, then a caption, and Docling outputs
# the placeholder + caption.
# 2. Docling's image placeholder alone (filename unknown -- we fall
# back to pypdf's name).
# 3. A bare "Image: <filename>" caption line with no preceding
# placeholder. Rare in practice, but covers parsers that drop the
# placeholder entirely.
_PLACEHOLDER_WITH_CAPTION = re.compile(
r"<!--\s*image\s*-->\s*\n\s*Image:\s*(\S+)\s*(?:\n|$)",
re.IGNORECASE,
)
_PLACEHOLDER_ONLY = re.compile(
r"<!--\s*image\s*-->",
re.IGNORECASE,
)
_CAPTION_ONLY = re.compile(
r"^[ \t]*Image:\s*(\S+)\s*$",
re.IGNORECASE | re.MULTILINE,
)
# Append-after marker (the matched span is preserved verbatim and our
# block is inserted immediately after it):
#
# 4. ``<figure>...</figure>`` as emitted by layout-aware parsers (Azure
# Document Intelligence ``prebuilt-layout``, LlamaCloud premium).
# The figure's own contents -- chart bar values, axis labels,
# inline ``<figcaption>``, embedded ``<table>`` for tabular figures
# -- are themselves specialist OCR output, so we keep them and add
# our vision-LLM block alongside. ``[^>]*`` in the open tag tolerates
# optional attributes like ``<figure id="...">``; ``re.DOTALL``
# lets ``.`` cross the newlines inside the block.
_FIGURE_BLOCK = re.compile(
r"<figure\b[^>]*>.*?</figure>",
re.DOTALL | re.IGNORECASE,
)
def _replace_one_match(
    markdown: str,
    pattern: re.Pattern[str],
    descriptions: list[PictureDescription],
    desc_idx: int,
) -> tuple[str, int]:
    """Swap the first ``pattern`` hit for the block of ``descriptions[desc_idx]``.

    When the pattern has a capture group and it captured text, that text
    is used as the displayed image name; otherwise the description's own
    ``name`` is used.

    Returns ``(markdown, desc_idx)`` unchanged when there is no
    description left or the pattern does not occur; otherwise the edited
    markdown and ``desc_idx + 1``.
    """
    hit = pattern.search(markdown) if desc_idx < len(descriptions) else None
    if hit is None:
        return markdown, desc_idx

    current = descriptions[desc_idx]
    from_marker = hit.group(1) if hit.groups() else None
    label = from_marker or current.name
    rendered = _format_image_block(label, current.description, current.ocr_text)

    head = markdown[: hit.start()]
    tail = markdown[hit.end():]
    return head + rendered + tail, desc_idx + 1
def _splice_after_figures(
    markdown: str,
    descriptions: list[PictureDescription],
    desc_idx: int,
) -> tuple[str, int]:
    """Insert an image block right after each ``<figure>...</figure>``.

    The figure markup is kept verbatim and the vision-LLM block is added
    after its closing tag. Figures and descriptions are paired in document
    order, starting at ``descriptions[desc_idx]``, until either runs out.

    Returns the new markdown and the index of the first description that
    was not placed.
    """
    remaining = len(descriptions) - desc_idx
    if remaining <= 0:
        return markdown, desc_idx

    # Only as many figures as there are descriptions left get a block.
    figure_ends = [m.end() for m in _FIGURE_BLOCK.finditer(markdown)][:remaining]
    if not figure_ends:
        return markdown, desc_idx

    # Assemble the output front to back from slices of the untouched
    # input, so every offset refers to the original string.
    pieces: list[str] = []
    cursor = 0
    for offset, end in enumerate(figure_ends):
        desc = descriptions[desc_idx + offset]
        pieces.append(markdown[cursor:end])
        pieces.append(
            _format_image_block(desc.name, desc.description, desc.ocr_text)
        )
        cursor = end
    pieces.append(markdown[cursor:])

    return "".join(pieces), desc_idx + len(figure_ends)
def inject_descriptions_inline(
    markdown: str,
    result: PictureExtractionResult,
) -> tuple[str, int]:
    """Splice per-image markdown blocks into the document at image positions.

    Descriptions are consumed in order by two strategies:

    1. **Append-after** for ``<figure>...</figure>`` blocks (see
       ``_splice_after_figures``). Figures are handled first.
    2. **Replace** for Docling-style markers: ``<!-- image -->`` followed
       by an ``Image: <filename>`` caption, the placeholder alone, or a
       bare caption line.

    Step 2 is a single left-to-right scan. At each step the marker that
    starts EARLIEST after the previous splice is replaced, whichever of
    the three patterns it is. The list order of the patterns only breaks
    ties at the same offset, so a captioned placeholder is taken as a
    whole rather than as a bare placeholder. This keeps the N-th
    description with the N-th marker when captioned and uncaptioned
    placeholders are mixed. The scan resumes after the consumed marker,
    so text inside a block inserted by this step is never matched as a
    new marker.

    Returns:
        ``(new_markdown, n_inlined)``, the count of descriptions placed
        inline. The caller decides what to do with the leftovers.
    """
    if not result.descriptions:
        return markdown, 0

    descriptions = result.descriptions

    # Step 1: layout-aware figures, one batch over the whole document.
    out, desc_idx = _splice_after_figures(markdown, descriptions, 0)

    # Step 2: Docling-style replacement markers, in document order.
    replace_patterns = (
        _PLACEHOLDER_WITH_CAPTION,
        _PLACEHOLDER_ONLY,
        _CAPTION_ONLY,
    )
    pieces: list[str] = []
    cursor = 0
    while desc_idx < len(descriptions):
        earliest = None
        for pattern in replace_patterns:
            candidate = pattern.search(out, cursor)
            # Strict "<" keeps the higher-priority pattern on a tie.
            if candidate is not None and (
                earliest is None or candidate.start() < earliest.start()
            ):
                earliest = candidate
        if earliest is None:
            # No more positions to splice into.
            break

        desc = descriptions[desc_idx]
        captured_name = earliest.group(1) if earliest.groups() else None
        block = _format_image_block(
            captured_name or desc.name, desc.description, desc.ocr_text
        )
        pieces.append(out[cursor:earliest.start()])
        pieces.append(block)
        cursor = earliest.end()
        desc_idx += 1

    pieces.append(out[cursor:])
    return "".join(pieces), desc_idx
def render_appended_section(
    descriptions: list[PictureDescription],
    *,
    skip_notes: PictureExtractionResult | None = None,
    heading: str = "## Image Content (vision-LLM extracted)",
) -> str:
    """Build the fallback section for descriptions that were not inlined.

    The section is ``heading`` followed by one image block per
    description. When ``skip_notes`` is supplied and any of its counters
    is non-zero, an italic note summarising the skipped images is added at
    the end. Returns ``""`` when there are no descriptions and no
    ``skip_notes``.
    """
    if not descriptions and not skip_notes:
        return ""

    lines: list[str] = ["", heading, ""]
    for desc in descriptions:
        lines.extend(
            (_format_image_block(desc.name, desc.description, desc.ocr_text), "")
        )

    if skip_notes is not None:
        counters = (
            (skip_notes.skipped_too_large, "too large (> 5 MB)"),
            (skip_notes.skipped_too_small, "too small (< 1 KB)"),
            (skip_notes.skipped_duplicate, "duplicate"),
            (skip_notes.failed, "failed"),
        )
        reasons = [f"{count} {label}" for count, label in counters if count]
        if reasons:
            lines.append(f"_Note: {', '.join(reasons)} image(s) skipped._")

    return "\n".join(lines)
def merge_descriptions_into_markdown(
    markdown: str,
    result: PictureExtractionResult,
) -> str:
    """Place image descriptions inline where possible and append the rest.

    Entry point used by the ETL pipeline. Descriptions that
    ``inject_descriptions_inline`` could not position are rendered in a
    trailing section. Its heading says whether nothing was inlined at all
    or only some descriptions lacked a marker. With no descriptions the
    input is returned unchanged.
    """
    if not result.descriptions:
        return markdown

    spliced, placed = inject_descriptions_inline(markdown, result)
    unplaced = result.descriptions[placed:]
    if not unplaced:
        return spliced

    # Zero inlined means no markers were matched at all; otherwise only
    # the surplus descriptions lacked a marker.
    if placed == 0:
        heading = "## Image Content (vision-LLM extracted)"
    else:
        heading = "## Image Content (additional, no inline marker found)"

    tail = render_appended_section(unplaced, heading=heading)
    if not tail:
        return spliced
    return f"{spliced.rstrip()}\n\n{tail.lstrip()}\n"
__all__ = [
"PictureDescription",
"PictureExtractionResult",
"describe_pictures",
"inject_descriptions_inline",
"merge_descriptions_into_markdown",
"render_appended_section",
]

View file

@ -77,10 +77,16 @@ class DoclingService:
# Create pipeline options with version-safe attribute checking
pipeline_options = PdfPipelineOptions()
# Disable OCR (user request)
# Enable OCR so text-in-image (chart axes, ECG annotations,
# lab tables embedded as images, scanned pages, etc.) is
# lifted into the main markdown stream. This pairs with the
# vision-LLM picture-description pass downstream — OCR
# captures literal text; vision LLM captures the visual
# content. Together they give a faithful representation of
# PDFs that mix text and images.
if hasattr(pipeline_options, "do_ocr"):
pipeline_options.do_ocr = False
logger.info("⚠️ OCR disabled by user request")
pipeline_options.do_ocr = True
logger.info("✅ OCR enabled for embedded text-in-image extraction")
else:
logger.warning("⚠️ OCR attribute not available in this Docling version")

View file

@ -123,10 +123,6 @@ async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | No
"""Extract content from a non-document file (plaintext/direct_convert/audio/image) via the unified ETL pipeline."""
from app.etl_pipeline.etl_document import EtlRequest
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
from app.etl_pipeline.file_classifier import (
FileCategory,
classify_file as etl_classify,
)
await _notify(ctx, "parsing", "Processing file")
await ctx.task_logger.log_task_progress(
@ -135,8 +131,12 @@ async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | No
{"processing_stage": "extracting"},
)
# Fetch the vision LLM whenever the operator opts in. The ETL
# pipeline decides what to do with it: image files run through the
# vision LLM directly; document files (PDFs) get per-image
# descriptions appended via picture_describer.
vision_llm = None
if ctx.use_vision_llm and etl_classify(ctx.filename) == FileCategory.IMAGE:
if ctx.use_vision_llm:
from app.services.llm_service import get_vision_llm
vision_llm = await get_vision_llm(ctx.session, ctx.search_space_id)
@ -230,7 +230,16 @@ async def _process_document_upload(ctx: _ProcessingContext) -> Document | None:
await _notify(ctx, "parsing", "Extracting content")
etl_result = await EtlPipelineService().extract(
# Document uploads get vision LLM treatment too: when vision_llm is
# provided, the ETL pipeline adds per-image descriptions to the
# parser's markdown. Embedded-image extraction is currently PDF-only
# (see picture_describer.describe_pictures); other document types
# come back unchanged.
vision_llm = None
if ctx.use_vision_llm:
from app.services.llm_service import get_vision_llm
vision_llm = await get_vision_llm(ctx.session, ctx.search_space_id)
etl_result = await EtlPipelineService(vision_llm=vision_llm).extract(
EtlRequest(
file_path=ctx.file_path,
filename=ctx.filename,
@ -418,8 +427,12 @@ async def _extract_file_content(
billable_pages = estimated_pages * mode.page_multiplier
await page_limit_service.check_page_limit(user_id, billable_pages)
# Vision LLM is provided to the ETL pipeline for any file category
# when the operator opts in. Image files run through it directly;
# document files (PDFs) get per-image descriptions appended via
# picture_describer.
vision_llm = None
if use_vision_llm and category == FileCategory.IMAGE:
if use_vision_llm:
from app.services.llm_service import get_vision_llm
vision_llm = await get_vision_llm(session, search_space_id)

View file

@ -741,6 +741,372 @@ async def test_extract_image_falls_back_to_document_without_vision_llm(
assert result.content_type == "document"
# ---------------------------------------------------------------------------
# Document path with vision LLM: per-image descriptions are appended
# ---------------------------------------------------------------------------
def _fake_extraction_result(*descriptions):
    """Build a ``PictureExtractionResult`` from plain dict specs.

    Each spec must carry ``page``, ``name`` and ``desc``; ``ordinal``
    defaults to 0 and ``sha`` to ``"deadbeef"``. No OCR text is set on the
    resulting descriptions.
    """
    from app.etl_pipeline.picture_describer import (
        PictureDescription,
        PictureExtractionResult,
    )

    built = []
    for spec in descriptions:
        built.append(
            PictureDescription(
                page_number=spec["page"],
                ordinal_in_page=spec.get("ordinal", 0),
                name=spec["name"],
                sha256=spec.get("sha", "deadbeef"),
                description=spec["desc"],
            )
        )
    return PictureExtractionResult(descriptions=built)
async def test_extract_pdf_with_vision_llm_inlines_image_blocks(tmp_path, mocker):
    """A placeholder plus caption in the parser output is replaced inline.

    The parser markdown contains ``<!-- image -->`` followed by an
    ``Image: <name>`` caption. With a vision LLM configured, that pair must
    be swapped for the rendered image block in place, without an appended
    ``## Image Content`` section, and the surrounding case text and answer
    options must survive untouched.
    """
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake content")

    parser_markdown = (
        "# MedXpertQA-MM MM-130\n\n"
        "## Clinical case\n\nA 44-year-old man...\n\n"
        "<!-- image -->\nImage: MM-130-a.jpeg\n\n"
        "## Answer choices\n\nA) ...\n"
    )
    docling_stub = mocker.AsyncMock()
    docling_stub.process_document.return_value = {"content": parser_markdown}
    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
    mocker.patch(
        "app.services.docling_service.create_docling_service",
        return_value=docling_stub,
    )

    describe_stub = mocker.AsyncMock(
        return_value=_fake_extraction_result(
            {
                "page": 1,
                "name": "Im0",
                "desc": "Axial CT showing a large cystic mass.",
            }
        )
    )
    mocker.patch(
        "app.etl_pipeline.picture_describer.describe_pictures",
        new=describe_stub,
    )

    service = EtlPipelineService(vision_llm=mocker.MagicMock())
    outcome = await service.extract(
        EtlRequest(file_path=str(source_pdf), filename="report.pdf")
    )
    md = outcome.markdown_content

    must_be_present = (
        # The rendered block, labelled with the captioned filename.
        "**Embedded image:** `MM-130-a.jpeg`",
        "**Visual description:**",
        "Axial CT showing a large cystic mass.",
        # Surrounding case text and answer options are preserved.
        "A 44-year-old man...",
        "## Answer choices",
        "A) ...",
    )
    must_be_absent = (
        # Placeholder and caption were consumed by the replacement.
        "<!-- image -->",
        "Image: MM-130-a.jpeg",
        # The fake extraction carries no ocr_text, so no OCR section.
        "**OCR text:**",
        # No raw HTML / XML tags or blockquote wrapping leak through.
        "<image",
        "> **Embedded image:**",
        # Nothing was pushed into an appended section.
        "## Image Content",
    )
    for fragment in must_be_present:
        assert fragment in md
    for fragment in must_be_absent:
        assert fragment not in md
async def test_extract_pdf_with_vision_llm_appends_when_no_marker(tmp_path, mocker):
    """Descriptions are appended when the parser output has no image marker.

    Fallback path for parser markdown without any image placeholder: the
    description must still reach the final markdown, under the labelled
    "Image Content" heading rather than inline.
    """
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake content")

    docling_stub = mocker.AsyncMock()
    docling_stub.process_document.return_value = {
        "content": "# Parsed PDF text\n\nNo image markers anywhere.\n"
    }
    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
    mocker.patch(
        "app.services.docling_service.create_docling_service",
        return_value=docling_stub,
    )

    describe_stub = mocker.AsyncMock(
        return_value=_fake_extraction_result(
            {"page": 1, "name": "Im0", "desc": "An image description."}
        )
    )
    mocker.patch(
        "app.etl_pipeline.picture_describer.describe_pictures",
        new=describe_stub,
    )

    service = EtlPipelineService(vision_llm=mocker.MagicMock())
    outcome = await service.extract(
        EtlRequest(file_path=str(source_pdf), filename="report.pdf")
    )
    md = outcome.markdown_content

    for fragment in (
        "# Parsed PDF text",
        "## Image Content (vision-LLM extracted)",
        "**Embedded image:** `Im0`",
        "An image description.",
    ):
        assert fragment in md
async def test_extract_pdf_without_vision_llm_skips_picture_descriptions(
    tmp_path, mocker
):
    """Without a vision LLM the parser markdown is returned verbatim."""
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake content")

    docling_stub = mocker.AsyncMock()
    docling_stub.process_document.return_value = {"content": "# Parsed PDF text"}
    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
    mocker.patch(
        "app.services.docling_service.create_docling_service",
        return_value=docling_stub,
    )

    # Patched only so we can prove the describer is never reached.
    describe_spy = mocker.AsyncMock()
    mocker.patch(
        "app.etl_pipeline.picture_describer.describe_pictures",
        new=describe_spy,
    )

    service = EtlPipelineService()
    outcome = await service.extract(
        EtlRequest(file_path=str(source_pdf), filename="report.pdf")
    )

    md = outcome.markdown_content
    assert md == "# Parsed PDF text"
    assert "<image" not in md
    describe_spy.assert_not_called()
async def test_extract_pdf_with_vision_llm_swallows_describe_failure(
    tmp_path, mocker
):
    """An exception from ``describe_pictures`` must not fail the extraction.

    The describer is made to raise ``RuntimeError``; the service is still
    expected to return the untouched parser markdown and report DOCLING.
    """
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake content")

    docling_stub = mocker.AsyncMock()
    docling_stub.process_document.return_value = {"content": "# Parsed PDF text"}
    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
    mocker.patch(
        "app.services.docling_service.create_docling_service",
        return_value=docling_stub,
    )

    exploding_describer = mocker.AsyncMock(
        side_effect=RuntimeError("pypdf exploded")
    )
    mocker.patch(
        "app.etl_pipeline.picture_describer.describe_pictures",
        new=exploding_describer,
    )

    service = EtlPipelineService(vision_llm=mocker.MagicMock())
    outcome = await service.extract(
        EtlRequest(file_path=str(source_pdf), filename="report.pdf")
    )

    assert outcome.markdown_content == "# Parsed PDF text"
    assert outcome.etl_service == "DOCLING"
async def test_extract_pdf_with_vision_llm_no_images_returns_parser_text(
    tmp_path, mocker
):
    """An empty extraction result leaves the parser markdown unchanged."""
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake content")

    docling_stub = mocker.AsyncMock()
    docling_stub.process_document.return_value = {
        "content": "# Just text, no images"
    }
    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
    mocker.patch(
        "app.services.docling_service.create_docling_service",
        return_value=docling_stub,
    )

    # The describer reports zero descriptions.
    describe_stub = mocker.AsyncMock(return_value=_fake_extraction_result())
    mocker.patch(
        "app.etl_pipeline.picture_describer.describe_pictures",
        new=describe_stub,
    )

    service = EtlPipelineService(vision_llm=mocker.MagicMock())
    outcome = await service.extract(
        EtlRequest(file_path=str(source_pdf), filename="report.pdf")
    )

    md = outcome.markdown_content
    assert md == "# Just text, no images"
    assert "<image" not in md
# ---------------------------------------------------------------------------
# Per-image OCR runner: wiring + behaviour
#
# When extracting a PDF with a vision LLM, the ETL service must ALSO
# pass an ``ocr_runner`` to picture_describer. The runner is a closure
# that re-feeds each extracted image through a vision-LLM-less
# EtlPipelineService -- i.e. the same OCR engine that handles
# standalone image uploads (Docling/Azure DI/LlamaCloud) gets a crack
# at each embedded image, with the text attached to the inline block.
# ---------------------------------------------------------------------------
async def test_extract_pdf_passes_ocr_runner_to_describe_pictures(
    tmp_path, mocker
):
    """``describe_pictures`` is awaited once with a callable ``ocr_runner``."""
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake content")

    docling_stub = mocker.AsyncMock()
    docling_stub.process_document.return_value = {"content": "# Parsed PDF text"}
    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
    mocker.patch(
        "app.services.docling_service.create_docling_service",
        return_value=docling_stub,
    )

    describe_spy = mocker.AsyncMock(return_value=_fake_extraction_result())
    mocker.patch(
        "app.etl_pipeline.picture_describer.describe_pictures",
        new=describe_spy,
    )

    service = EtlPipelineService(vision_llm=mocker.MagicMock())
    await service.extract(
        EtlRequest(file_path=str(source_pdf), filename="report.pdf")
    )

    describe_spy.assert_awaited_once()
    passed_kwargs = describe_spy.await_args.kwargs
    assert "ocr_runner" in passed_kwargs
    assert callable(passed_kwargs["ocr_runner"])
async def test_extract_pdf_ocr_runner_invokes_document_parser_on_image(
    tmp_path, mocker
):
    """The OCR runner re-extracts an image through the document parser.

    The runner handed to ``describe_pictures`` is captured and then invoked
    by hand with an image path. It must return the fake Docling content,
    and Docling must have been awaited a second time. That second call is
    what shows the runner goes through a sub-pipeline without a vision LLM.
    """
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake content")
    embedded_png = tmp_path / "Im0.png"
    embedded_png.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 100)

    docling_stub = mocker.AsyncMock()
    docling_stub.process_document.return_value = {"content": "Slice 24 / 60 L R"}
    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
    mocker.patch(
        "app.services.docling_service.create_docling_service",
        return_value=docling_stub,
    )

    seen_runners = []

    async def capture_runner(*args, **kwargs):
        seen_runners.append(kwargs["ocr_runner"])
        return _fake_extraction_result()

    mocker.patch(
        "app.etl_pipeline.picture_describer.describe_pictures",
        new=capture_runner,
    )

    service = EtlPipelineService(vision_llm=mocker.MagicMock())
    await service.extract(
        EtlRequest(file_path=str(source_pdf), filename="report.pdf")
    )

    ocr_runner = seen_runners[-1]
    ocr_text = await ocr_runner(str(embedded_png), "Im0.png")

    assert ocr_text == "Slice 24 / 60 L R"
    # Two Docling calls in total: the PDF itself, then the image that was
    # re-fed through the runner.
    assert docling_stub.process_document.await_count == 2
async def test_extract_pdf_ocr_runner_returns_empty_on_unsupported_image(
    tmp_path, mocker
):
    """The OCR runner returns ``""`` for an unsupported image, not an error.

    One embedded image in a format the pipeline cannot OCR (a ``.jp2`` file
    here) must not spoil the whole PDF extraction. The runner is expected
    to absorb the failure and return an empty string, so the image keeps
    its description and simply carries no OCR text.
    """
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake content")
    jp2_image = tmp_path / "Im0.jp2"  # JPEG2000, unlikely to be supported
    jp2_image.write_bytes(b"\x00\x00\x00\x0CjP" + b"\x00" * 50)

    docling_stub = mocker.AsyncMock()
    docling_stub.process_document.return_value = {"content": "# Parsed PDF text"}
    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
    mocker.patch(
        "app.services.docling_service.create_docling_service",
        return_value=docling_stub,
    )

    seen_runners = []

    async def capture_runner(*args, **kwargs):
        seen_runners.append(kwargs["ocr_runner"])
        return _fake_extraction_result()

    mocker.patch(
        "app.etl_pipeline.picture_describer.describe_pictures",
        new=capture_runner,
    )

    service = EtlPipelineService(vision_llm=mocker.MagicMock())
    await service.extract(
        EtlRequest(file_path=str(source_pdf), filename="report.pdf")
    )

    ocr_runner = seen_runners[-1]
    assert await ocr_runner(str(jp2_image), "Im0.jp2") == ""
# ---------------------------------------------------------------------------
# Processing Mode enum tests
# ---------------------------------------------------------------------------

View file

@ -0,0 +1,967 @@
"""Unit tests for the picture_describer module.
Covers:
- :func:`describe_pictures` -- the PDF image walker + per-image vision
LLM call (structured output split into ``ocr_text`` and
``description``);
- :func:`inject_descriptions_inline` -- in-place replacement of image
placeholders / captions in the parser markdown;
- :func:`merge_descriptions_into_markdown` -- the top-level helper
that inlines what it can and appends what it can't;
- :func:`render_appended_section` -- the appended-fallback renderer.
"""
from __future__ import annotations
from unittest.mock import AsyncMock, MagicMock
import pytest
from app.etl_pipeline.picture_describer import (
PictureDescription,
PictureExtractionResult,
describe_pictures,
inject_descriptions_inline,
merge_descriptions_into_markdown,
render_appended_section,
)
pytestmark = pytest.mark.unit
def _make_image_obj(name: str, data: bytes):
"""Mimic pypdf's ImageFile object shape for the bits we use."""
img = MagicMock()
img.name = name
img.data = data
return img
# ---------------------------------------------------------------------------
# describe_pictures: short-circuits
# ---------------------------------------------------------------------------
async def test_describe_pictures_no_op_for_non_pdf(tmp_path):
    """A non-PDF input yields an empty result and never touches the LLM."""
    docx_path = tmp_path / "report.docx"
    docx_path.write_bytes(b"PK fake docx")
    vision_stub = AsyncMock()

    outcome = await describe_pictures(str(docx_path), "report.docx", vision_stub)

    assert outcome.descriptions == []
    assert outcome.skipped_too_large == 0
    vision_stub.ainvoke.assert_not_called()
async def test_describe_pictures_no_op_when_vision_llm_is_none(tmp_path):
    """Passing ``None`` as the vision LLM yields no descriptions for a PDF."""
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake")

    outcome = await describe_pictures(str(source_pdf), "report.pdf", None)

    assert outcome.descriptions == []
async def test_describe_pictures_no_op_for_pdf_with_no_images(tmp_path, mocker):
    """A readable PDF whose pages hold no images produces an empty result."""
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake")

    # Two pages, neither with any image.
    reader_stub = MagicMock()
    reader_stub.pages = [MagicMock(images=[]) for _ in range(2)]
    mocker.patch("pypdf.PdfReader", return_value=reader_stub)
    vision_stub = AsyncMock()

    outcome = await describe_pictures(str(source_pdf), "report.pdf", vision_stub)

    assert outcome.descriptions == []
    vision_stub.ainvoke.assert_not_called()
# ---------------------------------------------------------------------------
# describe_pictures: happy paths
# ---------------------------------------------------------------------------
async def test_describe_pictures_runs_vision_llm_per_image(tmp_path, mocker):
    """Two eligible images on two pages lead to exactly two vision calls."""
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake")

    jpeg_image = _make_image_obj("Im0.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
    png_image = _make_image_obj("Im1.png", b"\x89PNG\r\n\x1a\n" + b"\xcd" * 2000)
    reader_stub = MagicMock()
    reader_stub.pages = [
        MagicMock(images=[jpeg_image]),
        MagicMock(images=[png_image]),
    ]
    mocker.patch("pypdf.PdfReader", return_value=reader_stub)

    vision_call = AsyncMock(side_effect=["Description A", "Description B"])
    mocker.patch(
        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
        new=vision_call,
    )

    outcome = await describe_pictures(str(source_pdf), "report.pdf", MagicMock())

    assert len(outcome.descriptions) == 2
    described = {}
    for entry in outcome.descriptions:
        described[entry.name] = entry.description
    assert described == {"Im0.jpeg": "Description A", "Im1.png": "Description B"}
    for entry in outcome.descriptions:
        assert entry.page_number in (1, 2)
    assert vision_call.await_count == 2
async def test_describe_pictures_dedups_by_hash(tmp_path, mocker):
    """Identical image bytes on three pages are described only once."""
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake")

    logo_bytes = b"\x89PNG\r\n\x1a\n" + b"\x42" * 2000
    reader_stub = MagicMock()
    # Three pages, each with its own image object carrying the same payload.
    reader_stub.pages = [
        MagicMock(images=[_make_image_obj("logo.png", logo_bytes)])
        for _ in range(3)
    ]
    mocker.patch("pypdf.PdfReader", return_value=reader_stub)

    vision_call = AsyncMock(return_value="Logo desc")
    mocker.patch(
        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
        new=vision_call,
    )

    outcome = await describe_pictures(str(source_pdf), "report.pdf", MagicMock())

    assert len(outcome.descriptions) == 1
    assert outcome.skipped_duplicate == 2
    assert vision_call.await_count == 1
async def test_describe_pictures_skips_too_small_images(tmp_path, mocker):
    """A tiny image is counted as too small; the larger one is described."""
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake")

    page_images = [
        _make_image_obj("dot.png", b"\x89PNG\r\n\x1a\n"),
        _make_image_obj("ct.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 3000),
    ]
    reader_stub = MagicMock()
    reader_stub.pages = [MagicMock(images=page_images)]
    mocker.patch("pypdf.PdfReader", return_value=reader_stub)

    vision_call = AsyncMock(return_value="CT scan")
    mocker.patch(
        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
        new=vision_call,
    )

    outcome = await describe_pictures(str(source_pdf), "report.pdf", MagicMock())

    assert len(outcome.descriptions) == 1
    assert outcome.descriptions[0].name == "ct.jpeg"
    assert outcome.skipped_too_small == 1
    assert vision_call.await_count == 1
async def test_describe_pictures_skips_too_large_images(tmp_path, mocker):
    """A 6 MiB image is counted as too large; the small one is described."""
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake")

    page_images = [
        _make_image_obj("huge.jpeg", b"\xff" * (6 * 1024 * 1024)),
        _make_image_obj("ok.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000),
    ]
    reader_stub = MagicMock()
    reader_stub.pages = [MagicMock(images=page_images)]
    mocker.patch("pypdf.PdfReader", return_value=reader_stub)

    vision_call = AsyncMock(return_value="OK image")
    mocker.patch(
        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
        new=vision_call,
    )

    outcome = await describe_pictures(str(source_pdf), "report.pdf", MagicMock())

    assert len(outcome.descriptions) == 1
    assert outcome.descriptions[0].name == "ok.jpeg"
    assert outcome.skipped_too_large == 1
    assert vision_call.await_count == 1
async def test_describe_pictures_swallows_per_image_failure(tmp_path, mocker):
    """One failing vision call is counted; the other image is still described."""
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake")

    page_images = [
        _make_image_obj("a.jpeg", b"\xff\xd8" + b"\xab" * 2000),
        _make_image_obj("b.jpeg", b"\xff\xd8" + b"\xcd" * 2000),
    ]
    reader_stub = MagicMock()
    reader_stub.pages = [MagicMock(images=page_images)]
    mocker.patch("pypdf.PdfReader", return_value=reader_stub)

    # First vision call raises, second one succeeds.
    vision_call = AsyncMock(side_effect=[RuntimeError("vision blew up"), "Success"])
    mocker.patch(
        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
        new=vision_call,
    )

    outcome = await describe_pictures(str(source_pdf), "report.pdf", MagicMock())

    assert len(outcome.descriptions) == 1
    assert outcome.descriptions[0].description == "Success"
    assert outcome.failed == 1
async def test_describe_pictures_handles_pypdf_open_failure(tmp_path, mocker):
    """If ``pypdf.PdfReader`` raises on open, the result has no descriptions."""
    broken_pdf = tmp_path / "broken.pdf"
    broken_pdf.write_bytes(b"not a pdf")
    open_error = ValueError("EOF marker not found")
    mocker.patch("pypdf.PdfReader", side_effect=open_error)

    outcome = await describe_pictures(str(broken_pdf), "broken.pdf", MagicMock())

    assert outcome.descriptions == []
# ---------------------------------------------------------------------------
# inject_descriptions_inline: replacement patterns
# ---------------------------------------------------------------------------
def _desc(name="Im0", description="A CT scan."):
    """Build a ``PictureDescription`` fixture without OCR text.

    Page number, ordinal and hash are fixed at ``1``, ``0`` and ``"aa"``;
    only ``name`` and ``description`` vary between tests.
    """
    fields = {
        "page_number": 1,
        "ordinal_in_page": 0,
        "name": name,
        "sha256": "aa",
        "description": description,
    }
    return PictureDescription(**fields)
def test_inject_no_op_when_no_descriptions():
    """An empty extraction result returns the markdown as-is with count 0."""
    original = "# Title\n\nbody text\n"

    rewritten, inlined = inject_descriptions_inline(
        original, PictureExtractionResult()
    )

    assert rewritten == original
    assert inlined == 0
def test_inject_replaces_placeholder_with_caption():
    """``<!-- image -->`` followed by ``Image: <name>`` becomes one block.

    Both the placeholder and the caption line are consumed, and the block is
    labelled with the captioned filename. The block is delimited by
    horizontal rules and holds a named "Visual description" section. It
    carries no OCR section (the fixture has no ``ocr_text``), and no HTML
    tags or blockquote prefixes.
    """
    source_markdown = (
        "# Case\n\n"
        "Clinical text...\n\n"
        "<!-- image -->\nImage: MM-130-a.jpeg\n\n"
        "Answer choices: A) ...\n"
    )
    extraction = PictureExtractionResult(descriptions=[_desc(name="Im0")])

    out, inlined = inject_descriptions_inline(source_markdown, extraction)

    assert inlined == 1

    must_be_present = (
        "**Embedded image:** `MM-130-a.jpeg`",
        "**Visual description:**",
        "A CT scan.",
        # Horizontal rules set the block apart from nearby paragraphs.
        "\n---\n",
        # Surrounding context is preserved.
        "Clinical text...",
        "Answer choices: A) ...",
    )
    must_be_absent = (
        "<!-- image -->",
        "Image: MM-130-a.jpeg",  # caption consumed
        "**OCR text:**",
        "<image",
        "</image>",
        "> **Embedded image:**",  # the block is not wrapped in `>`
    )
    for fragment in must_be_present:
        assert fragment in out
    for fragment in must_be_absent:
        assert fragment not in out
def test_inject_uses_pypdf_name_when_no_caption():
    """A lone ``<!-- image -->`` is labelled with the description's own name."""
    source_markdown = "# Case\n\n<!-- image -->\n\nMore text\n"
    extraction = PictureExtractionResult(descriptions=[_desc(name="Im0")])

    out, inlined = inject_descriptions_inline(source_markdown, extraction)

    assert inlined == 1
    assert "**Embedded image:** `Im0`" in out
def test_inject_replaces_bare_caption():
    """An ``Image: <name>`` line with no placeholder is replaced as well."""
    source_markdown = "# Case\n\nText...\nImage: scan.jpeg\nMore text\n"
    extraction = PictureExtractionResult(descriptions=[_desc(name="Im0")])

    out, inlined = inject_descriptions_inline(source_markdown, extraction)

    assert inlined == 1
    assert "**Embedded image:** `scan.jpeg`" in out
    assert "Image: scan.jpeg" not in out
def test_inject_handles_multiple_images_in_order():
    """Two markers and two descriptions are paired up in document order."""
    source_markdown = (
        "Page 1\n\n<!-- image -->\nImage: a.jpeg\n\n"
        "Between\n\n<!-- image -->\nImage: b.jpeg\n\nEnd\n"
    )
    first = PictureDescription(
        page_number=1,
        ordinal_in_page=0,
        name="Im0",
        sha256="aa",
        description="Desc A",
    )
    second = PictureDescription(
        page_number=2,
        ordinal_in_page=0,
        name="Im1",
        sha256="bb",
        description="Desc B",
    )
    extraction = PictureExtractionResult(descriptions=[first, second])

    out, inlined = inject_descriptions_inline(source_markdown, extraction)

    assert inlined == 2
    for label in ("**Embedded image:** `a.jpeg`", "**Embedded image:** `b.jpeg`"):
        assert label in out
    assert out.index("a.jpeg") < out.index("b.jpeg")
    assert "Desc A" in out
    assert "Desc B" in out
def test_inject_returns_remaining_count_when_more_descriptions_than_markers():
    """Three descriptions, one marker -> one inlined, two left over.

    The returned count is the number of descriptions that were inlined.
    Both leftover descriptions are checked, so a regression that inlines
    the third description but not the second is caught too.
    """
    markdown = "Just one <!-- image --> here.\n"
    result = PictureExtractionResult(
        descriptions=[
            _desc(name="Im0", description="First"),
            _desc(name="Im1", description="Second"),
            _desc(name="Im2", description="Third"),
        ]
    )

    out, n = inject_descriptions_inline(markdown, result)

    assert n == 1
    assert "**Embedded image:** `Im0`" in out
    # Neither leftover may appear in the inline output.
    for leftover in ("Im1", "Im2"):
        assert f"**Embedded image:** `{leftover}`" not in out
def test_inject_returns_zero_when_no_markers_present():
    """With no image marker in the markdown, nothing is inlined or changed."""
    original = "# Title\n\nJust text. No images mentioned at all.\n"
    extraction = PictureExtractionResult(descriptions=[_desc(name="Im0")])

    rewritten, inlined = inject_descriptions_inline(original, extraction)

    assert inlined == 0
    assert rewritten == original
# ---------------------------------------------------------------------------
# render_appended_section
# ---------------------------------------------------------------------------
def test_render_appended_empty_when_nothing_passed():
    """An empty description list renders to the empty string."""
    rendered = render_appended_section([])
    assert rendered == ""
def test_render_appended_renders_each_image_as_block():
    """Each description gets its own rule-delimited block under the heading."""
    rendered = render_appended_section(
        [
            _desc(name="MM-130-a.jpeg", description="CT scan"),
            _desc(name="MM-130-b.jpeg", description="Bar chart"),
        ]
    )

    for fragment in (
        "## Image Content (vision-LLM extracted)",
        "**Embedded image:** `MM-130-a.jpeg`",
        "CT scan",
        "**Embedded image:** `MM-130-b.jpeg`",
        "Bar chart",
    ):
        assert fragment in rendered

    # Every image block is delimited by horizontal rules.
    assert rendered.count("\n---\n") >= 2

    # No raw HTML / XML, no blockquote prefix, no OCR section.
    for fragment in ("<image", "> **Embedded image:**", "**OCR text:**"):
        assert fragment not in rendered
def test_render_appended_includes_skip_notes():
    """Skip and failure counters passed as ``skip_notes`` show up in a note."""
    described = [_desc()]
    counters = PictureExtractionResult(
        descriptions=described,
        skipped_too_small=2,
        skipped_too_large=1,
        skipped_duplicate=3,
        failed=1,
    )

    rendered = render_appended_section(described, skip_notes=counters)

    for fragment in (
        "_Note:",
        "2 too small",
        "1 too large",
        "3 duplicate",
        "1 failed",
    ):
        assert fragment in rendered
# ---------------------------------------------------------------------------
# merge_descriptions_into_markdown: top-level
# ---------------------------------------------------------------------------
def test_merge_inlines_when_marker_present():
    """One marker and one description: inlined, with no appended section."""
    source_markdown = "Text...\n\n<!-- image -->\nImage: scan.jpeg\n\nMore text\n"
    extraction = PictureExtractionResult(descriptions=[_desc(name="Im0")])

    merged = merge_descriptions_into_markdown(source_markdown, extraction)

    assert "**Embedded image:** `scan.jpeg`" in merged
    # Everything went inline, so the appended-section heading must be absent.
    assert "## Image Content" not in merged
def test_merge_appends_when_no_marker_present():
    """With no marker, the description is placed in the appended section."""
    source_markdown = "Pure text doc, no image markers.\n"
    extraction = PictureExtractionResult(
        descriptions=[_desc(name="Im0", description="An image desc.")]
    )

    merged = merge_descriptions_into_markdown(source_markdown, extraction)

    for fragment in (
        "Pure text doc",
        "## Image Content (vision-LLM extracted)",
        "**Embedded image:** `Im0`",
    ):
        assert fragment in merged
def test_merge_appends_leftovers_with_distinct_heading():
    """One marker, two descriptions: the extra one is appended as a leftover.

    The first description is inlined at the marker. The second goes under
    a separate heading that says no inline marker was found for it.
    """
    source_markdown = "Text\n\n<!-- image -->\nImage: a.jpeg\n\nEnd\n"
    extraction = PictureExtractionResult(
        descriptions=[
            _desc(name="Im0", description="First"),
            _desc(name="Im1", description="Second"),
        ]
    )

    merged = merge_descriptions_into_markdown(source_markdown, extraction)

    expectations = (
        "**Embedded image:** `a.jpeg`",  # inlined
        "## Image Content (additional, no inline marker found)",
        "**Embedded image:** `Im1`",  # appended
    )
    for fragment in expectations:
        assert fragment in merged
# ---------------------------------------------------------------------------
# describe_pictures: ocr_runner integration
#
# These tests cover the per-image OCR side-channel: when the caller
# supplies an ``ocr_runner`` callable, each extracted image is sent
# both to the vision LLM (visual description) and to the OCR runner
# (text-in-image), in parallel. The OCR text -- if any -- is recorded
# on the PictureDescription and rendered in the inline block.
# ---------------------------------------------------------------------------
async def test_describe_pictures_calls_ocr_runner_per_image(tmp_path, mocker):
    """A supplied ``ocr_runner`` is awaited once for each eligible image."""
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake")

    page_images = [
        _make_image_obj("Im0.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000),
        _make_image_obj("Im1.png", b"\x89PNG\r\n\x1a\n" + b"\xcd" * 2000),
    ]
    reader_stub = MagicMock()
    reader_stub.pages = [MagicMock(images=page_images)]
    mocker.patch("pypdf.PdfReader", return_value=reader_stub)

    vision_call = AsyncMock(side_effect=["Visual A", "Visual B"])
    mocker.patch(
        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
        new=vision_call,
    )
    ocr_runner = AsyncMock(side_effect=["OCR text A", "OCR text B"])

    outcome = await describe_pictures(
        str(source_pdf), "report.pdf", MagicMock(), ocr_runner=ocr_runner
    )

    assert ocr_runner.await_count == 2
    ocr_by_name = {}
    for entry in outcome.descriptions:
        ocr_by_name[entry.name] = entry.ocr_text
    assert ocr_by_name == {"Im0.jpeg": "OCR text A", "Im1.png": "OCR text B"}
async def test_describe_pictures_runs_vision_and_ocr_in_parallel(
    tmp_path, mocker
):
    """Vision LLM and OCR run concurrently per image, not sequentially.

    Concurrency is proven with a rendezvous, not wall-clock timing, so the
    test does not depend on scheduler latency or CI load. Each fake
    announces that it has started and then waits for the other to start.

    - Concurrent execution: both fakes start, both events are set, and both
      return immediately.
    - Sequential execution: whichever fake runs first waits for a partner
      that has not been scheduled yet and hits the timeout. The image then
      ends up with no description or no OCR text, and an assertion fails.
    """
    import asyncio

    pdf_file = tmp_path / "report.pdf"
    pdf_file.write_bytes(b"%PDF-1.4 fake")

    img = _make_image_obj("Im0.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
    fake_reader = MagicMock()
    fake_reader.pages = [MagicMock(images=[img])]
    mocker.patch("pypdf.PdfReader", return_value=fake_reader)

    vision_started = asyncio.Event()
    ocr_started = asyncio.Event()
    # Only reached on the failure path; the passing path never waits.
    rendezvous_timeout = 2.0

    async def vision_waits_for_ocr(*args, **kwargs):
        vision_started.set()
        await asyncio.wait_for(ocr_started.wait(), timeout=rendezvous_timeout)
        return "Visual"

    async def ocr_waits_for_vision(*args, **kwargs):
        ocr_started.set()
        await asyncio.wait_for(vision_started.wait(), timeout=rendezvous_timeout)
        return "OCR"

    mocker.patch(
        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
        new=vision_waits_for_ocr,
    )

    fake_llm = MagicMock()
    result = await describe_pictures(
        str(pdf_file), "report.pdf", fake_llm, ocr_runner=ocr_waits_for_vision
    )

    assert vision_started.is_set(), "vision LLM was never invoked"
    assert ocr_started.is_set(), "OCR runner was never invoked"
    assert len(result.descriptions) == 1, (
        "vision call did not complete; vision+OCR appear to be sequential"
    )
    assert result.descriptions[0].ocr_text == "OCR", (
        "OCR call did not complete; vision+OCR appear to be sequential"
    )
async def test_describe_pictures_treats_empty_ocr_as_none(tmp_path, mocker):
    """Whitespace-only OCR output is stored as ``None``.

    This keeps an empty "OCR text" section out of the rendered block for
    images that contain no text at all.
    """
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake")

    scan = _make_image_obj("scan.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
    reader_stub = MagicMock()
    reader_stub.pages = [MagicMock(images=[scan])]
    mocker.patch("pypdf.PdfReader", return_value=reader_stub)

    vision_call = AsyncMock(return_value="A radiograph.")
    mocker.patch(
        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
        new=vision_call,
    )
    blank_ocr = AsyncMock(return_value="   \n  \n")

    outcome = await describe_pictures(
        str(source_pdf), "report.pdf", MagicMock(), ocr_runner=blank_ocr
    )

    assert len(outcome.descriptions) == 1
    assert outcome.descriptions[0].ocr_text is None
async def test_describe_pictures_swallows_ocr_runner_failure(tmp_path, mocker):
    """A raising OCR runner costs only the OCR text, not the description.

    The vision description is the primary payload and OCR is supplementary.
    The image keeps its description, ``ocr_text`` is ``None``, and the
    ``failed`` counter stays at zero.
    """
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake")

    scan = _make_image_obj("scan.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
    reader_stub = MagicMock()
    reader_stub.pages = [MagicMock(images=[scan])]
    mocker.patch("pypdf.PdfReader", return_value=reader_stub)

    vision_call = AsyncMock(return_value="A radiograph.")
    mocker.patch(
        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
        new=vision_call,
    )
    broken_ocr = AsyncMock(side_effect=RuntimeError("OCR backend down"))

    outcome = await describe_pictures(
        str(source_pdf), "report.pdf", MagicMock(), ocr_runner=broken_ocr
    )

    assert len(outcome.descriptions) == 1
    only_entry = outcome.descriptions[0]
    assert only_entry.description == "A radiograph."
    assert only_entry.ocr_text is None
    assert outcome.failed == 0  # the IMAGE didn't fail; only its OCR did
async def test_describe_pictures_vision_failure_with_ocr_runner_skips_image(
    tmp_path, mocker
):
    """A vision failure drops the image even though its OCR succeeded.

    An OCR-only block would suggest the vision pipeline ran when it did
    not, so a vision failure counts as an image failure whatever the OCR
    outcome.
    """
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake")

    scan = _make_image_obj("scan.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
    reader_stub = MagicMock()
    reader_stub.pages = [MagicMock(images=[scan])]
    mocker.patch("pypdf.PdfReader", return_value=reader_stub)

    failing_vision = AsyncMock(side_effect=RuntimeError("vision blew up"))
    mocker.patch(
        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
        new=failing_vision,
    )
    working_ocr = AsyncMock(return_value="OCR text")

    outcome = await describe_pictures(
        str(source_pdf), "report.pdf", MagicMock(), ocr_runner=working_ocr
    )

    assert outcome.descriptions == []
    assert outcome.failed == 1
async def test_describe_pictures_no_ocr_runner_keeps_ocr_text_none(
    tmp_path, mocker
):
    """Backward compat: with no ``ocr_runner`` the blocks are description-only."""
    pdf_path = tmp_path / "report.pdf"
    pdf_path.write_bytes(b"%PDF-1.4 fake")

    # Fake reader exposing a single page with a single embedded image.
    embedded = _make_image_obj("Im0.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
    reader = MagicMock()
    reader.pages = [MagicMock(images=[embedded])]
    mocker.patch("pypdf.PdfReader", return_value=reader)
    mocker.patch(
        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
        new=AsyncMock(return_value="Visual"),
    )
    llm = MagicMock()

    # Deliberately called without the ``ocr_runner`` keyword.
    outcome = await describe_pictures(str(pdf_path), "report.pdf", llm)

    assert len(outcome.descriptions) == 1
    assert outcome.descriptions[0].ocr_text is None
# ---------------------------------------------------------------------------
# Rendering: "OCR text" section appears iff PictureDescription.ocr_text is set
# ---------------------------------------------------------------------------
def _desc_with_ocr(name="Im0", description="A CT scan.", ocr_text="L R 10mm"):
    """Build a ``PictureDescription`` fixture that carries OCR text.

    Page number, ordinal and hash are fixed placeholder values; only the
    name, the visual description and the OCR text vary per test.
    """
    fields = {
        "page_number": 1,
        "ordinal_in_page": 0,
        "name": name,
        "sha256": "aa",
        "description": description,
        "ocr_text": ocr_text,
    }
    return PictureDescription(**fields)
def test_inject_renders_ocr_section_when_ocr_text_present():
    """A description with OCR text renders an OCR part ahead of the visual part."""
    source_md = "Text\n\n<!-- image -->\nImage: scan.jpeg\n\nMore\n"
    extraction = PictureExtractionResult(
        descriptions=[_desc_with_ocr(name="Im0", ocr_text="L R 10mm")]
    )

    rendered, injected = inject_descriptions_inline(source_md, extraction)

    assert injected == 1
    for expected in (
        "**Embedded image:** `scan.jpeg`",
        "**OCR text:**",
        "L R 10mm",
    ):
        assert expected in rendered

    # Ordering: literal text first, interpretation second.
    ocr_at = rendered.index("**OCR text:**")
    visual_at = rendered.index("**Visual description:**")
    assert ocr_at < visual_at

    # Critical: no nested-block constructs (fenced code, blockquote).
    # Earlier formats relied on them and both broke in Streamdown /
    # PlateJS by escaping their container and dropping content.
    for forbidden in ("```", "> **"):
        assert forbidden not in rendered
def test_inject_renders_multiline_ocr_with_hard_breaks():
    """Multi-line OCR is rendered with Markdown hard line breaks.

    Each non-final OCR line must end with two trailing spaces before the
    newline (a Markdown hard break) so every line renders on its own row
    without a fragile fenced code block or blockquote wrapper. The final
    OCR line must NOT carry the hard break, otherwise a stray ``<br>``
    would be rendered after it.
    """
    markdown = "Text\n\n<!-- image -->\nImage: scan.jpeg\n\nMore\n"
    ocr_lines = ("Slice 24 / 60", "L", "R", "10 mm")
    result = PictureExtractionResult(
        descriptions=[_desc_with_ocr(name="Im0", ocr_text="\n".join(ocr_lines))]
    )

    out, _ = inject_descriptions_inline(markdown, result)

    # Every OCR line is present.
    for line in ocr_lines:
        assert line in out

    # Build the hard break programmatically: exactly two spaces then a
    # newline. A literal with a single space would never match two-space
    # output, and repeated spaces inside a literal are easy to lose when
    # the file is reformatted or copied.
    hard_break = " " * 2 + "\n"

    # Non-last OCR lines get the trailing two-space hard break.
    assert "Slice 24 / 60" + hard_break in out
    assert "\nL" + hard_break in out
    assert "\nR" + hard_break in out

    # Last OCR line must NOT carry the two-space hard break (no stray <br>).
    assert "10 mm" + hard_break not in out
    assert "10 mm\n" in out
def test_render_appended_renders_ocr_section_when_ocr_text_present():
    """The appended section shows the OCR heading, OCR text and description."""
    picture = _desc_with_ocr(
        name="MM-130-a.jpeg",
        description="Axial CT.",
        ocr_text="Slice 24 / 60",
    )

    section = render_appended_section([picture])

    for expected in ("**OCR text:**", "Slice 24 / 60", "Axial CT."):
        assert expected in section
def test_render_omits_ocr_section_when_ocr_text_is_none():
    """A description built without OCR text renders no "OCR text" part."""
    picture = _desc(name="Im0", description="A clean radiograph.")

    section = render_appended_section([picture])

    assert "**Embedded image:** `Im0`" in section
    assert "**OCR text:**" not in section
    assert "**Visual description:**" in section
    # No raw HTML / blockquote prefixes.
    for forbidden in ("<image", "> **"):
        assert forbidden not in section
# ---------------------------------------------------------------------------
# inject_descriptions_inline: <figure> blocks (layout-aware parsers)
#
# Azure Document Intelligence's ``prebuilt-layout`` and LlamaCloud
# premium both emit ``<figure>...</figure>`` blocks that already contain
# the parser's own OCR of the figure (chart bar values, axis labels,
# inline ``<figcaption>``, embedded ``<table>`` for tabular figures).
# That parser-side content is useful for retrieval on its own, so we
# PRESERVE the figure verbatim and append our vision-LLM block
# immediately after rather than substituting for it.
# ---------------------------------------------------------------------------
def test_inject_appends_block_after_figure_preserving_parser_content():
    """The figure block is kept intact and the vision-LLM block follows it."""
    source_md = "".join(
        (
            "Some narrative text.\n\n",
            "<figure>\n\n",
            "Republican\n68\nDemocrat\n30\n",
            "\n</figure>\n\n",
            "Following paragraph.\n",
        )
    )
    extraction = PictureExtractionResult(
        descriptions=[_desc(name="Im0", description="Bar chart of party ID.")]
    )

    rendered, injected = inject_descriptions_inline(source_md, extraction)

    assert injected == 1

    # Original figure is preserved verbatim -- the parser's OCR'd
    # numbers must still be searchable.
    assert "<figure>" in rendered
    assert "</figure>" in rendered
    assert "Republican" in rendered
    assert "68" in rendered

    # Our vision-LLM block follows the figure, not before / inside it.
    assert "**Embedded image:** `Im0`" in rendered
    assert "Bar chart of party ID." in rendered
    close_at = rendered.index("</figure>")
    block_at = rendered.index("**Embedded image:** `Im0`")
    assert close_at < block_at, "block must be appended AFTER </figure>"

    # Surrounding narrative is preserved.
    for narrative in ("Some narrative text.", "Following paragraph."):
        assert narrative in rendered
def test_inject_handles_multiple_figures_in_document_order():
    """N figures + N descriptions: every pair lands at the right figure."""
    source_md = "".join(
        (
            "Page 1\n\n<figure>\nChart A bars\n</figure>\n\n",
            "Between\n\n<figure>\nChart B bars\n</figure>\n\n",
            "End.\n",
        )
    )
    chart_a = PictureDescription(
        page_number=1,
        ordinal_in_page=0,
        name="Im0",
        sha256="aa",
        description="Description of chart A.",
    )
    chart_b = PictureDescription(
        page_number=2,
        ordinal_in_page=0,
        name="Im1",
        sha256="bb",
        description="Description of chart B.",
    )
    extraction = PictureExtractionResult(descriptions=[chart_a, chart_b])

    rendered, injected = inject_descriptions_inline(source_md, extraction)

    assert injected == 2

    # Both figures preserved; both descriptions inlined; order matches.
    assert rendered.count("<figure>") == 2
    assert rendered.count("</figure>") == 2
    assert "Description of chart A." in rendered
    assert "Description of chart B." in rendered
    assert rendered.index("Description of chart A.") < rendered.index(
        "Description of chart B."
    )

    # Each description appears AFTER its corresponding </figure>.
    close_a = rendered.index("</figure>")
    assert close_a < rendered.index("Description of chart A.")
    close_b = rendered.index("</figure>", close_a + 1)
    assert close_b < rendered.index("Description of chart B.")
def test_inject_figures_with_attributes_and_nested_tags():
    """A ``<figure>`` carrying attributes and nested tags is matched and kept."""
    source_md = "".join(
        (
            '<figure id="fig-3" class="chart">\n',
            '<figcaption>Source: Pew Research</figcaption>\n',
            "<table><tr><td>Republican</td><td>57</td></tr></table>\n",
            "</figure>\n",
        )
    )
    extraction = PictureExtractionResult(
        descriptions=[_desc(name="Im0", description="Survey table.")]
    )

    rendered, injected = inject_descriptions_inline(source_md, extraction)

    assert injected == 1

    # All nested HTML is preserved (chunking will pick it up).
    assert 'id="fig-3"' in rendered
    assert "<figcaption>Source: Pew Research</figcaption>" in rendered
    for fragment in ("<table>", "Republican", "57"):
        assert fragment in rendered

    # Our block sits after the closing tag.
    close_at = rendered.index("</figure>")
    block_at = rendered.index("**Embedded image:** `Im0`")
    assert close_at < block_at
def test_inject_figures_more_descriptions_than_figures_returns_remaining():
    """Three descriptions, one figure: one is inlined, two are left over."""
    source_md = "Text.\n<figure>\nbar values\n</figure>\nMore.\n"
    pictures = [
        _desc(name="Im0", description="First desc."),
        _desc(name="Im1", description="Second desc."),
        _desc(name="Im2", description="Third desc."),
    ]
    extraction = PictureExtractionResult(descriptions=pictures)

    rendered, injected = inject_descriptions_inline(source_md, extraction)

    assert injected == 1
    assert "First desc." in rendered
    # Leftovers are the caller's job; inject_descriptions_inline does
    # not append them on its own.
    for leftover in ("Second desc.", "Third desc."):
        assert leftover not in rendered
def test_inject_figures_more_figures_than_descriptions_leaves_extras_untouched():
    """Two figures, one description: the first is enriched, the second stays raw."""
    source_md = "".join(
        (
            "<figure>\nfigure 1 content\n</figure>\n",
            "<figure>\nfigure 2 content\n</figure>\n",
        )
    )
    extraction = PictureExtractionResult(
        descriptions=[_desc(name="Im0", description="Only description.")]
    )

    rendered, injected = inject_descriptions_inline(source_md, extraction)

    assert injected == 1

    # Both figures still present; only the first one was enriched.
    assert rendered.count("<figure>") == 2
    assert "Only description." in rendered

    # Second figure has no embedded-image block immediately after it.
    first_open = rendered.index("<figure>")
    second_open = rendered.index("<figure>", first_open + 1)
    second_close = rendered.index("</figure>", second_open)
    tail = rendered[second_close:]
    assert "**Embedded image:**" not in tail
def test_merge_inlines_at_figure_boundary():
    """The top-level merge helper inlines at a figure and adds no leftover section."""
    source_md = "Lead.\n<figure>\nbars\n</figure>\nTrailer.\n"
    extraction = PictureExtractionResult(
        descriptions=[_desc(name="Im0", description="Bar chart.")]
    )

    merged = merge_descriptions_into_markdown(source_md, extraction)

    # Inline succeeded -> no appended-section heading.
    assert "## Image Content" not in merged
    assert "Bar chart." in merged
    assert "<figure>" in merged
    assert "</figure>" in merged
def test_inject_figures_then_falls_through_to_docling_marker():
    """Mixed-marker document: the figure is used first, then the Docling marker.

    This is defensive. A single document is usually one parser's output,
    but if a pipeline ever stitches two parsers' markdown together the
    inliner should still place every description.
    """
    source_md = "".join(
        (
            "<figure>\nChart bars: 50, 40, 30\n</figure>\n\n",
            "Later in the doc:\n\n",
            "<!-- image -->\nImage: scan.jpeg\n\n",
            "End.\n",
        )
    )
    pictures = [
        _desc(name="Im0", description="Chart description."),
        _desc(name="Im1", description="Scan description."),
    ]
    extraction = PictureExtractionResult(descriptions=pictures)

    rendered, injected = inject_descriptions_inline(source_md, extraction)

    assert injected == 2

    # Figure preserved + augmented.
    assert "<figure>" in rendered
    assert "Chart bars: 50, 40, 30" in rendered
    assert "Chart description." in rendered

    # Docling placeholder + caption replaced.
    for replaced in ("<!-- image -->", "Image: scan.jpeg"):
        assert replaced not in rendered
    assert "**Embedded image:** `scan.jpeg`" in rendered
    assert "Scan description." in rendered

View file

@ -0,0 +1,146 @@
"""Unit tests for the vision_llm parser helpers.
Two helpers exist:
- :func:`parse_with_vision_llm` -- single-shot for standalone image
uploads (.png/.jpg/etc). Returns combined markdown (description +
verbatim OCR mixed) since the image *is* the document.
- :func:`parse_image_for_description` -- per-image-in-PDF call. Returns
visual description only; OCR is the ETL service's job.
"""
from __future__ import annotations
from unittest.mock import AsyncMock, MagicMock
import pytest
# Module-level marker: pytest applies ``unit`` to every test in this file.
pytestmark = pytest.mark.unit
# ---------------------------------------------------------------------------
# parse_with_vision_llm: legacy single-shot path
# ---------------------------------------------------------------------------
async def test_parse_with_vision_llm_returns_combined_markdown(tmp_path):
    """Standalone image uploads still use the combined-markdown path."""
    from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm

    # Tiny file with a PNG signature followed by padding.
    image_path = tmp_path / "scan.png"
    image_path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 200)

    # The fake model answers every ``ainvoke`` with a fixed markdown reply.
    reply = MagicMock()
    reply.content = "# A scan of something."
    llm = AsyncMock()
    llm.ainvoke.return_value = reply

    markdown = await parse_with_vision_llm(str(image_path), "scan.png", llm)

    assert markdown == "# A scan of something."
    llm.ainvoke.assert_awaited_once()
async def test_parse_with_vision_llm_rejects_empty_response(tmp_path):
    """An empty model reply raises instead of silently returning blanks."""
    from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm

    image_path = tmp_path / "scan.png"
    image_path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 200)

    # The fake model answers with an empty string.
    reply = MagicMock()
    reply.content = ""
    llm = AsyncMock()
    llm.ainvoke.return_value = reply

    with pytest.raises(ValueError, match="empty content"):
        await parse_with_vision_llm(str(image_path), "scan.png", llm)
# ---------------------------------------------------------------------------
# parse_image_for_description: per-image-in-PDF, description only
# ---------------------------------------------------------------------------
async def test_parse_image_for_description_returns_description(tmp_path):
    """The description-only path hands back the model's markdown unchanged."""
    from app.etl_pipeline.parsers.vision_llm import parse_image_for_description

    image_path = tmp_path / "scan.png"
    image_path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 200)

    reply = MagicMock()
    reply.content = "Axial CT showing a large cystic mass."
    llm = AsyncMock()
    llm.ainvoke.return_value = reply

    description = await parse_image_for_description(
        str(image_path), "scan.png", llm
    )

    assert description == "Axial CT showing a large cystic mass."
async def test_parse_image_for_description_uses_description_only_prompt(tmp_path):
    """The prompt explicitly instructs the model NOT to transcribe text.

    That instruction is the contract that lets OCR be dropped from the
    response: the ETL pipeline already has the text from page-level OCR,
    so asking the vision LLM for it again would be redundant cost.
    """
    from app.etl_pipeline.parsers.vision_llm import parse_image_for_description

    image_path = tmp_path / "scan.png"
    image_path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 200)

    reply = MagicMock()
    reply.content = "A description"
    llm = AsyncMock()
    llm.ainvoke.return_value = reply

    await parse_image_for_description(str(image_path), "scan.png", llm)

    # The prompt is the first text part of the message we sent.
    messages = llm.ainvoke.call_args.args[0]
    first_part = messages[0].content[0]
    prompt = first_part["text"].lower()
    for required in (
        "describe what this image visually depicts",
        "do not transcribe text",
    ):
        assert required in prompt
async def test_parse_image_for_description_rejects_empty(tmp_path):
    """An empty reply becomes a ValueError so the caller can skip the image."""
    from app.etl_pipeline.parsers.vision_llm import parse_image_for_description

    image_path = tmp_path / "scan.png"
    image_path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 200)

    reply = MagicMock()
    reply.content = " "  # whitespace-only counts as empty
    llm = AsyncMock()
    llm.ainvoke.return_value = reply

    with pytest.raises(ValueError, match="empty content"):
        await parse_image_for_description(str(image_path), "scan.png", llm)
# ---------------------------------------------------------------------------
# Image size + extension validation (shared by both paths)
# ---------------------------------------------------------------------------
def test_image_to_data_url_rejects_oversized(tmp_path):
    """Images larger than 5 MB raise before any LLM call is made."""
    from app.etl_pipeline.parsers.vision_llm import _image_to_data_url

    # PNG-looking prefix followed by 6 MiB of padding.
    padding_size = 6 * 1024 * 1024
    oversized = tmp_path / "huge.png"
    oversized.write_bytes(b"\x89PNG" + b"\x00" * padding_size)

    with pytest.raises(ValueError, match="Image too large"):
        _image_to_data_url(str(oversized))
def test_image_to_data_url_rejects_unsupported_extension(tmp_path):
    """An unknown extension raises rather than having a MIME type guessed."""
    from app.etl_pipeline.parsers.vision_llm import _image_to_data_url

    # Small file whose ``.xyz`` suffix is not an image extension.
    unknown = tmp_path / "scan.xyz"
    unknown.write_bytes(b"\x00" * 100)

    with pytest.raises(ValueError, match="Unsupported image extension"):
        _image_to_data_url(str(unknown))