mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-17 18:35:19 +02:00
chore: evals
This commit is contained in:
parent
2402b730fa
commit
3737118050
122 changed files with 22598 additions and 13 deletions
|
|
@ -134,12 +134,92 @@ class EtlPipelineService:
|
|||
else:
|
||||
raise EtlServiceUnavailableError(f"Unknown ETL_SERVICE: {etl_service}")
|
||||
|
||||
# When the operator opts into vision-LLM at ingest, walk the
|
||||
# original file's embedded images and append a structured
|
||||
# "Image Content" section. The parser's own OCR (Docling
|
||||
# do_ocr=True, Azure DI prebuilt-read, etc.) handles text-in-
|
||||
# image; this side handles the *visual* description which the
|
||||
# parsers all drop today.
|
||||
content = await self._maybe_append_picture_descriptions(request, content)
|
||||
|
||||
return EtlResult(
|
||||
markdown_content=content,
|
||||
etl_service=etl_service,
|
||||
content_type="document",
|
||||
)
|
||||
|
||||
async def _maybe_append_picture_descriptions(
    self, request: EtlRequest, markdown: str
) -> str:
    """Enrich the parser's markdown with descriptions of embedded images.

    The parser output is returned untouched when no vision LLM is
    configured, when the description pass raises, or when it produces no
    descriptions. Otherwise the descriptions are merged into the markdown
    by ``merge_descriptions_into_markdown`` and the merged text is returned.

    Args:
        request: The extraction request; its ``file_path`` and ``filename``
            identify the original document to pull images from.
        markdown: Markdown already produced by the document parser.

    Returns:
        Either ``markdown`` unchanged or the merged markdown.
    """
    vision_llm = self._vision_llm
    if vision_llm is None:
        return markdown

    from app.etl_pipeline.picture_describer import (
        describe_pictures,
        merge_descriptions_into_markdown,
    )

    async def _run_image_ocr(image_path: str, image_name: str) -> str:
        # OCR callback handed to the describer. Each extracted image is
        # pushed back through a *fresh* pipeline as if it were a
        # standalone upload. That pipeline is built with vision_llm=None,
        # so it cannot re-enter the picture describer; the image is
        # handled by the configured document parser's OCR instead.
        try:
            ocr_service = EtlPipelineService(vision_llm=None)
            extracted = await ocr_service.extract(
                EtlRequest(file_path=image_path, filename=image_name)
            )
        except (EtlUnsupportedFileError, EtlServiceUnavailableError) as exc:
            # Expected when the configured ETL service cannot handle this
            # image format (or none is configured): quietly report "no
            # OCR text" rather than warning for every image.
            logging.debug("Skipping per-image OCR for %s: %s", image_name, exc)
            return ""
        else:
            return extracted.markdown_content

    try:
        outcome = await describe_pictures(
            request.file_path,
            request.filename,
            vision_llm,
            ocr_runner=_run_image_ocr,
        )
    except Exception:
        # The description pass is purely additive, so a failure here must
        # never turn a successful extraction into an error.
        logging.warning(
            "Picture description failed for %s, returning parser output unchanged",
            request.filename,
            exc_info=True,
        )
        return markdown

    if not outcome.descriptions:
        return markdown

    enriched = merge_descriptions_into_markdown(markdown, outcome)
    logging.info(
        "Vision LLM described %d image(s) in %s "
        "(skipped: %d small / %d large / %d duplicate, %d failed)",
        len(outcome.descriptions),
        request.filename,
        outcome.skipped_too_small,
        outcome.skipped_too_large,
        outcome.skipped_duplicate,
        outcome.failed,
    )
    return enriched
|
||||
|
||||
async def _extract_with_llamacloud(self, request: EtlRequest) -> str:
|
||||
"""Try Azure Document Intelligence first (when configured) then LlamaCloud.
|
||||
|
||||
|
|
|
|||
|
|
@ -4,12 +4,34 @@ import os
|
|||
|
||||
from langchain_core.messages import HumanMessage
|
||||
|
||||
# Single-shot prompt used by standalone image uploads (.png/.jpg/etc).
|
||||
# A standalone image IS the document, so we want everything: visual
|
||||
# content plus any text the model can read off it. The output is
|
||||
# combined markdown that the chunker treats as the full document body.
|
||||
_PROMPT = (
|
||||
"Describe this image in markdown. "
|
||||
"Transcribe any visible text verbatim. "
|
||||
"Be concise but complete — let the image content guide the level of detail."
|
||||
)
|
||||
|
||||
# Per-image-in-PDF prompt. Here the image is *inside* a larger
|
||||
# document, and the ETL service (Docling/Azure DI/LlamaCloud/...) is
|
||||
# already running OCR over the whole page — including text rendered
|
||||
# into images. So we explicitly tell the model NOT to transcribe text
|
||||
# and to focus only on visual interpretation. This avoids paying
|
||||
# output tokens for OCR content the ETL pipeline already captured.
|
||||
_DESCRIPTION_PROMPT = (
|
||||
"Describe what this image visually depicts in concise markdown. "
|
||||
"Focus on visual content — anatomy, structures, charts, diagrams, "
|
||||
"spatial relationships, colors, modality (e.g. axial CT, ECG strip, "
|
||||
"histology slide), and any clinically or structurally relevant "
|
||||
"findings.\n\n"
|
||||
"Do NOT transcribe text from the image. Any text in the image "
|
||||
"(axis labels, annotations, scale bars, lab values, etc.) is "
|
||||
"already extracted by a separate OCR pipeline; duplicating it "
|
||||
"here would be redundant. Stick to the visual interpretation."
|
||||
)
|
||||
|
||||
_MAX_IMAGE_BYTES = (
|
||||
5 * 1024 * 1024
|
||||
) # 5 MB (Anthropic Claude's limit, the most restrictive)
|
||||
|
|
@ -47,11 +69,10 @@ def _image_to_data_url(file_path: str) -> str:
|
|||
return f"data:{mime_type};base64,{encoded}"
|
||||
|
||||
|
||||
async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
|
||||
data_url = _image_to_data_url(file_path)
|
||||
async def _invoke_vision(llm, prompt: str, data_url: str, filename: str) -> str:
|
||||
message = HumanMessage(
|
||||
content=[
|
||||
{"type": "text", "text": _PROMPT},
|
||||
{"type": "text", "text": prompt},
|
||||
{"type": "image_url", "image_url": {"url": data_url}},
|
||||
]
|
||||
)
|
||||
|
|
@ -62,3 +83,36 @@ async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
|
|||
if not text or not text.strip():
|
||||
raise ValueError(f"Vision LLM returned empty content for {filename}")
|
||||
return text.strip()
|
||||
|
||||
|
||||
async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
    """Produce the full markdown body for a directly uploaded image.

    A standalone image (jpg/png/etc.) is the whole document, so the
    combined prompt is used: it asks for a visual description and a
    verbatim transcription of visible text in a single call.

    Args:
        file_path: Path of the image file on disk.
        filename: Name forwarded to the vision helper alongside the image.
        llm: The vision-capable chat model to invoke.

    Returns:
        The markdown text returned by the vision call.
    """
    encoded_image = _image_to_data_url(file_path)
    markdown = await _invoke_vision(llm, _PROMPT, encoded_image, filename)
    return markdown
|
||||
|
||||
|
||||
async def parse_image_for_description(
    file_path: str, filename: str, llm
) -> str:
    """Describe what an embedded image visually depicts, without OCR.

    Intended for images that live inside a larger document (the
    ``picture_describer`` use case). The description-only prompt tells the
    model not to transcribe text, because the ETL service's own OCR pass
    over the page already captures text rendered into images and repeating
    it here would duplicate content.

    Args:
        file_path: Path of the extracted image on disk.
        filename: Name forwarded to the vision helper alongside the image.
        llm: The vision-capable chat model to invoke.

    Returns:
        The markdown description returned by the vision call.
    """
    encoded_image = _image_to_data_url(file_path)
    visual_markdown = await _invoke_vision(
        llm, _DESCRIPTION_PROMPT, encoded_image, filename
    )
    return visual_markdown
|
||||
|
||||
|
||||
__all__ = [
|
||||
"parse_image_for_description",
|
||||
"parse_with_vision_llm",
|
||||
]
|
||||
|
|
|
|||
678
surfsense_backend/app/etl_pipeline/picture_describer.py
Normal file
678
surfsense_backend/app/etl_pipeline/picture_describer.py
Normal file
|
|
@ -0,0 +1,678 @@
|
|||
"""Extract embedded images from PDFs, describe them, and inject the
|
||||
descriptions inline into the parser's markdown.
|
||||
|
||||
When the operator passes ``use_vision_llm=True`` for a PDF, the document
|
||||
parsers (DOCLING / LLAMACLOUD / Azure DI / UNSTRUCTURED) extract text
|
||||
but mostly drop the actual image content -- a CT scan inside a clinical
|
||||
PDF becomes (at best) a ``<!-- image -->`` placeholder in the markdown,
|
||||
and the caption text below it.
|
||||
|
||||
This module fills that gap. After the document parser produces markdown
|
||||
text, we:
|
||||
|
||||
1. Walk the original PDF with :mod:`pypdf`, pulling out each embedded
|
||||
image (deduped by sha256, size-capped to match the vision LLM's own
|
||||
limits).
|
||||
2. Run the vision LLM on each unique image (visual description) and,
|
||||
in parallel when an OCR runner is provided, re-feed the same image
|
||||
through the ETL service for per-image OCR.
|
||||
3. **Inject** a horizontal-rule-delimited markdown section -- with
|
||||
named "OCR text" and "Visual description" sub-sections -- where the
|
||||
image actually appears in the parser's markdown. Two splice modes,
|
||||
chosen by which marker the parser emitted:
|
||||
|
||||
- **Replace** Docling-style ``<!-- image -->`` placeholders (and an
|
||||
optional ``Image: <filename>`` caption line). The placeholder
|
||||
carries no useful content of its own, so we substitute our block
|
||||
for it.
|
||||
- **Append after** layout-aware ``<figure>...</figure>`` blocks
|
||||
(Azure DI ``prebuilt-layout``, LlamaCloud premium). Those blocks
|
||||
already contain parser-extracted chart values / OCR'd labels /
|
||||
captions, which are themselves useful for retrieval -- so we
|
||||
PRESERVE the figure verbatim and add our vision-LLM block
|
||||
immediately after it. The chunk then contains both the parser's
|
||||
structured numbers AND the VLM's semantic interpretation.
|
||||
|
||||
Either way, the image content stays in context with the surrounding
|
||||
document body rather than getting orphaned at the end -- crucial for
|
||||
retrieval, where a single chunk should contain the question, the
|
||||
image content, and the answer options together.
|
||||
|
||||
If no placeholders, figures, or captions can be matched (e.g. an
|
||||
unusual parser output), we fall back to appending an
|
||||
``## Image Content`` section so no image content is silently lost.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import contextlib
|
||||
import hashlib
|
||||
import logging
|
||||
import re
|
||||
import tempfile
|
||||
from collections.abc import Awaitable, Callable
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
# Type alias for the OCR callback. Takes (file_path, filename), returns
|
||||
# the OCR'd markdown text -- or empty string if no text was found, or
|
||||
# raises if OCR failed unrecoverably (which the describer catches and
|
||||
# treats as "no OCR for this image" rather than failing the whole doc).
|
||||
OcrRunner = Callable[[str, str], Awaitable[str]]
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Bound how many vision LLM calls we make in parallel for a single
|
||||
# document. Vision models are typically rate-limited; 4 concurrent
|
||||
# calls is a safe default that respects most provider limits while
|
||||
# keeping wall-clock manageable for image-heavy PDFs.
|
||||
_VISION_CONCURRENCY = 4
|
||||
|
||||
# Match parse_with_vision_llm's per-image cap so we don't even attempt
|
||||
# images that the vision LLM would reject anyway (Anthropic's 5 MB
|
||||
# limit is the most restrictive among the major providers).
|
||||
_MAX_IMAGE_BYTES = 5 * 1024 * 1024
|
||||
|
||||
# Skip degenerate images: tracking pixels, very small decorative dots,
|
||||
# scanner-introduced artefacts. We can't cheaply check pixel dimensions
|
||||
# without decoding the image, so we approximate: anything under 1 KB is
|
||||
# almost certainly not informative content.
|
||||
_MIN_IMAGE_BYTES = 1024
|
||||
|
||||
|
||||
@dataclass
class PictureDescription:
    """One embedded image together with the text derived from it.

    The two content fields come from two different tools on purpose:

    - ``description`` is the vision LLM's interpretation of what the image
      shows (anatomy, charts, layout, ...).
    - ``ocr_text`` is text found *inside* the image, obtained by running
      the image through the configured ETL service as though it had been
      uploaded on its own. It stays ``None`` when OCR was not requested or
      found nothing.
    """

    # Page the image was found on; the first page is 1.
    page_number: int
    # Position of the image among the images of its page; the first is 0.
    ordinal_in_page: int
    # Identifier pypdf reported for the image (for example "Im0").
    name: str
    # Hex digest of the raw image bytes.
    sha256: str
    # Markdown describing the image visually.
    description: str
    # Text recovered from the image by the ETL service, when available.
    ocr_text: str | None = None
|
||||
|
||||
|
||||
@dataclass
class PictureExtractionResult:
    """Outcome of the picture pass over a single document.

    Holds the descriptions that were produced plus counters for every
    image that did not yield one.
    """

    # Successfully described images, in the order they were collected.
    descriptions: list[PictureDescription] = field(default_factory=list)
    # Images rejected for being below the minimum byte size.
    skipped_too_small: int = 0
    # Images rejected for exceeding the maximum byte size.
    skipped_too_large: int = 0
    # Images whose bytes matched an image already seen in the document.
    skipped_duplicate: int = 0
    # Images for which no description could be produced.
    failed: int = 0

    @property
    def has_content(self) -> bool:
        """Whether at least one description is present."""
        return True if self.descriptions else False
|
||||
|
||||
|
||||
def _is_pdf(filename: str) -> bool:
    """Tell whether ``filename`` carries a ``.pdf`` extension (any case)."""
    return filename.lower()[-4:] == ".pdf"
|
||||
|
||||
|
||||
def _pick_suffix(name: str) -> str:
    """Choose the temp-file suffix for an extracted image.

    The suffix is the first known image extension that ``name`` ends with
    (case-insensitively), with ``.jpg`` normalised to ``.jpeg``. Names
    without a known extension get ``.png``.
    """
    known_extensions = (
        ".jpg",
        ".jpeg",
        ".png",
        ".gif",
        ".bmp",
        ".tiff",
        ".tif",
        ".webp",
    )
    lowered = name.lower()
    matched = next(
        (ext for ext in known_extensions if lowered.endswith(ext)),
        ".png",
    )
    if matched == ".jpg":
        return ".jpeg"
    return matched
|
||||
|
||||
|
||||
def _extract_pdf_images(file_path: str) -> list[tuple[int, int, str, bytes]]:
    """Collect every embedded image pypdf can read from a PDF.

    Each entry is ``(page_number, ordinal_in_page, name, data)`` where
    ``page_number`` starts at 1 and ``ordinal_in_page`` starts at 0.
    Extraction is best-effort: a PDF that cannot be opened yields an empty
    list, and a page or image that cannot be read is logged and skipped so
    the rest of the document is still processed.
    """
    from pypdf import PdfReader

    collected: list[tuple[int, int, str, bytes]] = []

    try:
        reader = PdfReader(file_path)
    except Exception:
        logger.warning(
            "pypdf failed to open %s for image extraction",
            file_path,
            exc_info=True,
        )
        return collected

    for page_no, page in enumerate(reader.pages, start=1):
        try:
            page_images = list(page.images)
        except Exception:
            logger.warning(
                "pypdf failed to enumerate images on page %d of %s",
                page_no,
                file_path,
                exc_info=True,
            )
            continue

        for position, image in enumerate(page_images):
            try:
                # Fall back to a synthetic name when pypdf reports none.
                image_name = (
                    getattr(image, "name", None) or f"page{page_no}_img{position}"
                )
                image_bytes = image.data
            except Exception:
                logger.warning(
                    "pypdf failed to read image %d on page %d of %s",
                    position,
                    page_no,
                    file_path,
                    exc_info=True,
                )
                continue
            collected.append((page_no, position, image_name, image_bytes))

    return collected
|
||||
|
||||
|
||||
async def _describe_one(
    page_number: int,
    ordinal: int,
    name: str,
    sha256: str,
    data: bytes,
    vision_llm: Any,
    semaphore: asyncio.Semaphore,
    ocr_runner: OcrRunner | None,
) -> PictureDescription | None:
    """Describe a single extracted image and, optionally, OCR it.

    The image bytes are written to a temporary file that is handed to the
    vision-LLM helper and (when given) to ``ocr_runner``; both run
    concurrently while holding ``semaphore``.

    Args:
        page_number: 1-indexed page the image came from.
        ordinal: 0-indexed position of the image within its page.
        name: Image name; also used to choose the temp-file suffix.
        sha256: Hash of ``data``, recorded on the result.
        data: Raw image bytes.
        vision_llm: Model passed through to ``parse_image_for_description``.
        semaphore: Bounds how many images are processed at once.
        ocr_runner: Optional callback ``(path, name) -> text``.

    Returns:
        A ``PictureDescription``, or ``None`` when the vision call failed.
        An OCR failure only leaves ``ocr_text`` as ``None``.

    Raises:
        OSError: If the temporary file cannot be created or written.
    """
    from app.etl_pipeline.parsers.vision_llm import parse_image_for_description

    # delete=False because the vision helper and the OCR runner each open
    # the path themselves. The path is recorded *before* writing, and the
    # whole sequence sits under the finally below, so a failed write can no
    # longer leave an orphaned temp file behind.
    tmp_path: str | None = None
    try:
        with tempfile.NamedTemporaryFile(
            suffix=_pick_suffix(name), delete=False
        ) as tmp:
            tmp_path = tmp.name
            tmp.write(data)

        async with semaphore:
            tasks: list[Awaitable[Any]] = [
                parse_image_for_description(tmp_path, name, vision_llm),
            ]
            if ocr_runner is not None:
                tasks.append(ocr_runner(tmp_path, name))

            # return_exceptions=True so a failure in one branch (most
            # often OCR) doesn't poison the other.
            results = await asyncio.gather(*tasks, return_exceptions=True)

        description_result = results[0]
        if isinstance(description_result, BaseException):
            logger.warning(
                "Vision LLM failed for image %s on page %d, skipping",
                name,
                page_number,
                exc_info=description_result,
            )
            return None
        description = str(description_result)

        ocr_text: str | None = None
        if ocr_runner is not None and len(results) > 1:
            ocr_result = results[1]
            if isinstance(ocr_result, BaseException):
                logger.warning(
                    "Per-image OCR failed for image %s on page %d, "
                    "omitting OCR field for this image",
                    name,
                    page_number,
                    exc_info=ocr_result,
                )
            else:
                # Empty or whitespace-only OCR means no text was found;
                # store None so the rendered block omits the OCR section.
                ocr_text = str(ocr_result).strip() or None
    finally:
        if tmp_path is not None:
            with contextlib.suppress(OSError):
                Path(tmp_path).unlink()

    return PictureDescription(
        page_number=page_number,
        ordinal_in_page=ordinal,
        name=name,
        sha256=sha256,
        description=description,
        ocr_text=ocr_text,
    )
|
||||
|
||||
|
||||
async def describe_pictures(
    file_path: str,
    filename: str,
    vision_llm: Any,
    *,
    ocr_runner: OcrRunner | None = None,
) -> PictureExtractionResult:
    """Extract embedded images from a document and describe each via vision LLM.

    Images are filtered before any model call: those above the maximum or
    below the minimum byte size are skipped, and images whose bytes hash
    to an already-seen sha256 are skipped as duplicates. Each remaining
    image is described concurrently, bounded by ``_VISION_CONCURRENCY``.

    When ``ocr_runner`` is provided it is called for each image alongside
    the vision LLM and its text is stored in
    :attr:`PictureDescription.ocr_text`.

    Currently PDF-only: for other filenames, or when ``vision_llm`` is
    ``None``, an empty result is returned and the caller should leave the
    parser's markdown untouched.

    Args:
        file_path: Path of the original document on disk.
        filename: Document name; its extension decides whether it is a PDF.
        vision_llm: Vision-capable model, or ``None`` to disable the pass.
        ocr_runner: Optional per-image OCR callback ``(path, name) -> text``.

    Returns:
        A ``PictureExtractionResult`` holding the descriptions and the
        skip / failure counters.
    """
    result = PictureExtractionResult()
    if not _is_pdf(filename) or vision_llm is None:
        return result

    raw_images = _extract_pdf_images(file_path)
    if not raw_images:
        return result

    seen_hashes: set[str] = set()
    eligible: list[tuple[int, int, str, str, bytes]] = []
    for page_number, ordinal, name, data in raw_images:
        if len(data) > _MAX_IMAGE_BYTES:
            result.skipped_too_large += 1
            continue
        if len(data) < _MIN_IMAGE_BYTES:
            result.skipped_too_small += 1
            continue
        sha = hashlib.sha256(data).hexdigest()
        if sha in seen_hashes:
            result.skipped_duplicate += 1
            continue
        seen_hashes.add(sha)
        eligible.append((page_number, ordinal, name, sha, data))

    if not eligible:
        return result

    semaphore = asyncio.Semaphore(_VISION_CONCURRENCY)
    tasks = [
        _describe_one(p, o, n, sha, d, vision_llm, semaphore, ocr_runner)
        for (p, o, n, sha, d) in eligible
    ]
    # return_exceptions=True: an unexpected error while handling one image
    # (e.g. its temp file cannot be written) is recorded as a failure for
    # that image only. Without it the first exception would propagate,
    # throwing away every other image's description and leaving the
    # remaining tasks running with nobody awaiting them.
    outcomes = await asyncio.gather(*tasks, return_exceptions=True)
    for (page_number, _ordinal, name, _sha, _data), outcome in zip(
        eligible, outcomes
    ):
        if isinstance(outcome, BaseException):
            logger.warning(
                "Describing image %s on page %d raised unexpectedly, skipping",
                name,
                page_number,
                exc_info=outcome,
            )
            result.failed += 1
        elif outcome is None:
            result.failed += 1
        else:
            result.descriptions.append(outcome)
    return result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Rendering: build the per-image markdown block + inject inline.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _format_image_block(
    name: str,
    description: str,
    ocr_text: str | None = None,
) -> str:
    """Render the per-image block as a horizontal-rule-delimited section.

    Why no blockquote / no raw HTML / no XML?
    -----------------------------------------
    We tried each in turn and each failed in the document viewer:

    - **Raw HTML / XML** (``<image>...</image>``): unknown elements have
      no render rules in Streamdown or PlateJS, so the content survives in
      the markdown source but is invisible to humans.
    - **Blockquote with nested blocks**: any *block* element inside a
      ``>``-prefixed blockquote (fenced code, lists, tables) gets evicted
      by Streamdown / remark, dropping everything after it onto the
      document level. The vision LLM happily produces bulleted
      descriptions, so this hit the viewer in practice.

    A horizontal-rule-delimited section contains only standard top-level
    markdown, so the description's own markdown renders natively.

    Layout (OCR section omitted when ``ocr_text`` is None/blank)::

        ---

        **Embedded image:** `MM-130-a.jpeg`

        **OCR text:**
        Slice 24 / 60
        L
        R

        **Visual description:**

        - Axial contrast-enhanced CT showing a large cystic mass...
        - Mass effect on the adjacent stomach.

        ---

    Args:
        name: Image name shown in the ``**Embedded image:**`` line.
        description: Vision-LLM markdown; surrounding whitespace is removed.
        ocr_text: Optional OCR text; blank lines in it are dropped.

    Returns:
        The block, padded with blank lines and rules on both sides so it
        never merges with adjacent paragraphs after splicing.
    """
    # Two trailing spaces are markdown's hard line break (=> <br>). A single
    # trailing space is NOT a hard break, so the label and the OCR lines
    # would collapse into one paragraph.
    hard_break = "  "

    parts: list[str] = [f"**Embedded image:** `{name}`"]

    if ocr_text and ocr_text.strip():
        ocr_clean_lines = [
            ln.rstrip() for ln in ocr_text.strip().splitlines() if ln.strip()
        ]
        parts.append("")
        # Hard break after the label so the first OCR line sits directly
        # under it instead of starting a new paragraph.
        parts.append(f"**OCR text:**{hard_break}")
        last_index = len(ocr_clean_lines) - 1
        for i, raw in enumerate(ocr_clean_lines):
            # Every OCR line except the last ends in a hard break, so
            # multi-line OCR renders line-by-line without a fenced block.
            parts.append(raw if i == last_index else f"{raw}{hard_break}")

    parts.append("")
    parts.append("**Visual description:**")
    parts.append("")
    parts.append(description.strip())

    body = "\n".join(parts)
    # Blank lines + horizontal rules keep the block clearly delimited
    # wherever it is spliced into the markdown stream.
    return "\n\n---\n\n" + body + "\n\n---\n\n"
|
||||
|
||||
|
||||
# Patterns we'll try to splice into. Each pattern captures the
|
||||
# original-PDF filename when one is available (group 1).
|
||||
#
|
||||
# Replace-style markers (the matched span is substituted with our block
|
||||
# because it carries no useful content of its own):
|
||||
#
|
||||
# 1. Docling's image placeholder followed by an "Image: <filename>"
|
||||
# caption line. This is what our medxpertqa renderer produces:
|
||||
# reportlab places the JPEG, then a caption, and Docling outputs
|
||||
# the placeholder + caption.
|
||||
# 2. Docling's image placeholder alone (filename unknown -- we fall
|
||||
# back to pypdf's name).
|
||||
# 3. A bare "Image: <filename>" caption line with no preceding
|
||||
# placeholder. Rare in practice, but covers parsers that drop the
|
||||
# placeholder entirely.
|
||||
# Replace-style marker 1: an image placeholder comment followed on the next
# line by an "Image: <filename>" caption. Group 1 is the filename.
_PLACEHOLDER_WITH_CAPTION = re.compile(
    r"<!--\s*image\s*-->\s*\n\s*Image:\s*(\S+)\s*(?:\n|$)",
    flags=re.IGNORECASE,
)

# Replace-style marker 2: the image placeholder comment by itself. There
# is no capture group, so no filename is available from the match.
_PLACEHOLDER_ONLY = re.compile(
    r"<!--\s*image\s*-->",
    flags=re.IGNORECASE,
)

# Replace-style marker 3: an "Image: <filename>" caption occupying a line
# of its own, with no placeholder before it. Group 1 is the filename.
_CAPTION_ONLY = re.compile(
    r"^[ \t]*Image:\s*(\S+)\s*$",
    flags=re.IGNORECASE | re.MULTILINE,
)

# Append-after marker: a complete ``<figure>...</figure>`` element, as
# emitted by layout-aware parsers. The matched span is kept verbatim and
# the image block is inserted right after it.
#
# ``[^>]*`` in the opening tag tolerates attributes such as
# ``<figure id="...">``. ``re.DOTALL`` lets ``.`` cross the newlines inside
# the element, and the non-greedy ``.*?`` stops at the first closing tag.
_FIGURE_BLOCK = re.compile(
    r"<figure\b[^>]*>.*?</figure>",
    flags=re.DOTALL | re.IGNORECASE,
)
|
||||
|
||||
|
||||
def _replace_one_match(
    markdown: str,
    pattern: re.Pattern[str],
    descriptions: list[PictureDescription],
    desc_idx: int,
) -> tuple[str, int]:
    """Swap the first ``pattern`` hit for the image block at ``desc_idx``.

    When the pattern has a capture group and it captured a non-empty name,
    that name labels the block; otherwise the description's own name does.

    Returns:
        ``(markdown, desc_idx)`` unchanged when no description is left or
        the pattern does not occur; otherwise the rewritten markdown and
        ``desc_idx + 1``.
    """
    if desc_idx >= len(descriptions):
        return markdown, desc_idx

    found = pattern.search(markdown)
    if found is None:
        return markdown, desc_idx

    current = descriptions[desc_idx]
    captured = found.group(1) if found.groups() else None
    label = captured or current.name
    rendered = _format_image_block(label, current.description, current.ocr_text)

    head = markdown[: found.start()]
    tail = markdown[found.end():]
    return "".join((head, rendered, tail)), desc_idx + 1
|
||||
|
||||
|
||||
def _splice_after_figures(
    markdown: str,
    descriptions: list[PictureDescription],
    desc_idx: int,
) -> tuple[str, int]:
    """Insert an image block right after each ``<figure>...</figure>`` element.

    The figure element itself is kept verbatim, since it holds the
    parser's own extraction of the figure; the image block is added
    directly behind it. Figures and descriptions are paired in document
    order, starting at ``descriptions[desc_idx]``, until either runs out.

    The output is assembled in one left-to-right pass over the original
    string, so every match offset is used exactly as the regex reported it.

    Returns:
        The new markdown and the advanced description index; both are
        unchanged when there is nothing to pair.
    """
    remaining = len(descriptions) - desc_idx
    if remaining <= 0:
        return markdown, desc_idx

    figures = list(_FIGURE_BLOCK.finditer(markdown))
    if not figures:
        return markdown, desc_idx

    pair_count = min(len(figures), remaining)

    pieces: list[str] = []
    cursor = 0
    for offset, figure in enumerate(figures[:pair_count]):
        item = descriptions[desc_idx + offset]
        # Copy everything up to and including this figure, then the block.
        pieces.append(markdown[cursor : figure.end()])
        pieces.append(_format_image_block(item.name, item.description, item.ocr_text))
        cursor = figure.end()
    pieces.append(markdown[cursor:])

    return "".join(pieces), desc_idx + pair_count
|
||||
|
||||
|
||||
def inject_descriptions_inline(
    markdown: str,
    result: PictureExtractionResult,
) -> tuple[str, int]:
    """Place image blocks at the image positions found in the markdown.

    Descriptions are consumed strictly in order by two strategies:

    1. **Append-after**: every ``<figure>...</figure>`` element keeps its
       content and receives the next image block directly behind it.
    2. **Replace**: one marker at a time is substituted by the next image
       block. On each round the marker kinds are tried in priority order:
       placeholder followed by an ``Image: <filename>`` caption, then a
       placeholder alone, then a bare caption line. The round's first hit
       wins and the search restarts from the highest priority.

    Figures are always handled first when both marker styles occur in the
    same document.

    Returns:
        ``(new_markdown, n_inlined)``, where ``n_inlined`` is how many
        descriptions were placed. Leftover descriptions are the caller's
        responsibility.
    """
    pending = result.descriptions
    if not pending:
        return markdown, 0

    # Strategy 1 handles all figure elements in a single batch.
    text, cursor = _splice_after_figures(markdown, pending, 0)

    # Strategy 2 runs one replacement per round until nothing matches.
    marker_priority = (
        _PLACEHOLDER_WITH_CAPTION,
        _PLACEHOLDER_ONLY,
        _CAPTION_ONLY,
    )
    total = len(pending)
    while cursor < total:
        for marker in marker_priority:
            text, advanced = _replace_one_match(text, marker, pending, cursor)
            if advanced > cursor:
                cursor = advanced
                break
        else:
            # No marker kind matched: no more positions to splice into.
            break

    return text, cursor
|
||||
|
||||
|
||||
def render_appended_section(
    descriptions: list[PictureDescription],
    *,
    skip_notes: PictureExtractionResult | None = None,
    heading: str = "## Image Content (vision-LLM extracted)",
) -> str:
    """Build the fallback section for descriptions that were not inlined.

    The section starts with ``heading`` followed by one image block per
    description. When ``skip_notes`` is supplied, a closing note lists
    each of its non-zero skip / failure counters.

    Returns:
        The section text, or ``""`` when there are no descriptions and no
        ``skip_notes``.
    """
    if not descriptions and not skip_notes:
        return ""

    lines: list[str] = ["", heading, ""]
    for item in descriptions:
        lines.extend(
            (_format_image_block(item.name, item.description, item.ocr_text), "")
        )

    if skip_notes is not None:
        counters = (
            (skip_notes.skipped_too_large, "too large (> 5 MB)"),
            (skip_notes.skipped_too_small, "too small (< 1 KB)"),
            (skip_notes.skipped_duplicate, "duplicate"),
            (skip_notes.failed, "failed"),
        )
        # Only counters that are actually non-zero make it into the note.
        notes = [f"{count} {label}" for count, label in counters if count]
        if notes:
            lines.append(f"_Note: {', '.join(notes)} image(s) skipped._")

    return "\n".join(lines)
|
||||
|
||||
|
||||
def merge_descriptions_into_markdown(
    markdown: str,
    result: PictureExtractionResult,
) -> str:
    """Splice image blocks into ``markdown`` and append whatever is left.

    Descriptions are first handed to ``inject_descriptions_inline``; any
    description beyond the number it reports as inlined is rendered with
    ``render_appended_section`` and attached at the end, so a described
    image is not dropped just because no inline position was found for it.

    Args:
        markdown: Parser output to enrich.
        result: Extraction result carrying the picture descriptions.

    Returns:
        ``markdown`` unchanged when there are no descriptions; otherwise
        the spliced markdown, followed by an appended section when some
        descriptions could not be inlined.
    """
    if not result.descriptions:
        return markdown

    spliced, inlined_count = inject_descriptions_inline(markdown, result)
    remaining = result.descriptions[inlined_count:]
    if not remaining:
        return spliced

    # The heading tells the reader whether nothing at all was inlined
    # (no markers in the parser output) or only part of the set was.
    if inlined_count == 0:
        heading = "## Image Content (vision-LLM extracted)"
    else:
        heading = "## Image Content (additional, no inline marker found)"

    appended = render_appended_section(remaining, heading=heading)
    if not appended:
        return spliced

    body = spliced.rstrip()
    tail = appended.lstrip()
    return f"{body}\n\n{tail}\n"
|
||||
|
||||
|
||||
__all__ = [
|
||||
"PictureDescription",
|
||||
"PictureExtractionResult",
|
||||
"describe_pictures",
|
||||
"inject_descriptions_inline",
|
||||
"merge_descriptions_into_markdown",
|
||||
"render_appended_section",
|
||||
]
|
||||
|
|
@ -77,10 +77,16 @@ class DoclingService:
|
|||
# Create pipeline options with version-safe attribute checking
|
||||
pipeline_options = PdfPipelineOptions()
|
||||
|
||||
# Disable OCR (user request)
|
||||
# Enable OCR so text-in-image (chart axes, ECG annotations,
|
||||
# lab tables embedded as images, scanned pages, etc.) is
|
||||
# lifted into the main markdown stream. This pairs with the
|
||||
# vision-LLM picture-description pass downstream — OCR
|
||||
# captures literal text; vision LLM captures the visual
|
||||
# content. Together they give a faithful representation of
|
||||
# PDFs that mix text and images.
|
||||
if hasattr(pipeline_options, "do_ocr"):
|
||||
pipeline_options.do_ocr = False
|
||||
logger.info("⚠️ OCR disabled by user request")
|
||||
pipeline_options.do_ocr = True
|
||||
logger.info("✅ OCR enabled for embedded text-in-image extraction")
|
||||
else:
|
||||
logger.warning("⚠️ OCR attribute not available in this Docling version")
|
||||
|
||||
|
|
|
|||
|
|
@ -123,10 +123,6 @@ async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | No
|
|||
"""Extract content from a non-document file (plaintext/direct_convert/audio/image) via the unified ETL pipeline."""
|
||||
from app.etl_pipeline.etl_document import EtlRequest
|
||||
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
|
||||
from app.etl_pipeline.file_classifier import (
|
||||
FileCategory,
|
||||
classify_file as etl_classify,
|
||||
)
|
||||
|
||||
await _notify(ctx, "parsing", "Processing file")
|
||||
await ctx.task_logger.log_task_progress(
|
||||
|
|
@ -135,8 +131,12 @@ async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | No
|
|||
{"processing_stage": "extracting"},
|
||||
)
|
||||
|
||||
# Fetch the vision LLM whenever the operator opts in. The ETL
|
||||
# pipeline decides what to do with it: image files run through the
|
||||
# vision LLM directly; document files (PDFs) get per-image
|
||||
# descriptions appended via picture_describer.
|
||||
vision_llm = None
|
||||
if ctx.use_vision_llm and etl_classify(ctx.filename) == FileCategory.IMAGE:
|
||||
if ctx.use_vision_llm:
|
||||
from app.services.llm_service import get_vision_llm
|
||||
|
||||
vision_llm = await get_vision_llm(ctx.session, ctx.search_space_id)
|
||||
|
|
@ -230,7 +230,16 @@ async def _process_document_upload(ctx: _ProcessingContext) -> Document | None:
|
|||
|
||||
await _notify(ctx, "parsing", "Extracting content")
|
||||
|
||||
etl_result = await EtlPipelineService().extract(
|
||||
# Document files (PDF, docx, etc.) get vision LLM treatment too:
|
||||
# the ETL pipeline appends a per-image description section when
|
||||
# vision_llm is provided. See picture_describer.describe_pictures.
|
||||
vision_llm = None
|
||||
if ctx.use_vision_llm:
|
||||
from app.services.llm_service import get_vision_llm
|
||||
|
||||
vision_llm = await get_vision_llm(ctx.session, ctx.search_space_id)
|
||||
|
||||
etl_result = await EtlPipelineService(vision_llm=vision_llm).extract(
|
||||
EtlRequest(
|
||||
file_path=ctx.file_path,
|
||||
filename=ctx.filename,
|
||||
|
|
@ -418,8 +427,12 @@ async def _extract_file_content(
|
|||
billable_pages = estimated_pages * mode.page_multiplier
|
||||
await page_limit_service.check_page_limit(user_id, billable_pages)
|
||||
|
||||
# Vision LLM is provided to the ETL pipeline for any file category
|
||||
# when the operator opts in. Image files run through it directly;
|
||||
# document files (PDFs) get per-image descriptions appended via
|
||||
# picture_describer.
|
||||
vision_llm = None
|
||||
if use_vision_llm and category == FileCategory.IMAGE:
|
||||
if use_vision_llm:
|
||||
from app.services.llm_service import get_vision_llm
|
||||
|
||||
vision_llm = await get_vision_llm(session, search_space_id)
|
||||
|
|
|
|||
|
|
@ -741,6 +741,372 @@ async def test_extract_image_falls_back_to_document_without_vision_llm(
|
|||
assert result.content_type == "document"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Document path with vision LLM: per-image descriptions are appended
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _fake_extraction_result(*descriptions):
    """Build a ``PictureExtractionResult`` from plain dict specs.

    Each positional argument is a dict with required keys ``page``,
    ``name`` and ``desc``; ``ordinal`` defaults to ``0`` and ``sha`` to
    ``"deadbeef"``. Calling with no arguments yields a result with an
    empty description list.
    """
    from app.etl_pipeline.picture_describer import (
        PictureDescription,
        PictureExtractionResult,
    )

    built = []
    for spec in descriptions:
        built.append(
            PictureDescription(
                page_number=spec["page"],
                ordinal_in_page=spec.get("ordinal", 0),
                name=spec["name"],
                sha256=spec.get("sha", "deadbeef"),
                description=spec["desc"],
            )
        )
    return PictureExtractionResult(descriptions=built)
|
||||
|
||||
|
||||
async def test_extract_pdf_with_vision_llm_inlines_image_blocks(tmp_path, mocker):
    """A placeholder + caption pair in the parser output is replaced inline.

    The mocked parser markdown contains one ``<!-- image -->`` placeholder
    followed by an ``Image:`` caption, and the mocked ``describe_pictures``
    yields one description. The block must land in place of the marker,
    with no appended ``## Image Content`` section, so the image content
    stays next to the surrounding case text and answer options.
    """
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake content")

    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")

    parser_markdown = (
        "# MedXpertQA-MM MM-130\n\n"
        "## Clinical case\n\nA 44-year-old man...\n\n"
        "<!-- image -->\nImage: MM-130-a.jpeg\n\n"
        "## Answer choices\n\nA) ...\n"
    )
    docling_stub = mocker.AsyncMock()
    docling_stub.process_document.return_value = {"content": parser_markdown}
    mocker.patch(
        "app.services.docling_service.create_docling_service",
        return_value=docling_stub,
    )

    described = _fake_extraction_result(
        {
            "page": 1,
            "name": "Im0",
            "desc": "Axial CT showing a large cystic mass.",
        }
    )
    mocker.patch(
        "app.etl_pipeline.picture_describer.describe_pictures",
        new=mocker.AsyncMock(return_value=described),
    )

    service = EtlPipelineService(vision_llm=mocker.MagicMock())
    result = await service.extract(
        EtlRequest(file_path=str(source_pdf), filename="report.pdf")
    )
    md = result.markdown_content

    # Marker and caption are consumed; the image block takes their place
    # and is labelled with the captioned filename.
    assert "<!-- image -->" not in md
    assert "Image: MM-130-a.jpeg" not in md
    assert "**Embedded image:** `MM-130-a.jpeg`" in md
    assert "**Visual description:**" in md
    assert "Axial CT showing a large cystic mass." in md

    # The fixture carries no ocr_text, so no OCR section is expected.
    assert "**OCR text:**" not in md

    # Neither raw tags nor blockquote wrapping may leak into the output.
    assert "<image" not in md
    assert "> **Embedded image:**" not in md

    # Everything was inlined, so nothing is appended at the end.
    assert "## Image Content" not in md

    # The text around the image survives untouched.
    assert "A 44-year-old man..." in md
    assert "## Answer choices" in md
    assert "A) ..." in md
|
||||
|
||||
|
||||
async def test_extract_pdf_with_vision_llm_appends_when_no_marker(tmp_path, mocker):
    """Without image markers in the parser output, descriptions are appended.

    Covers the fallback path: the description still reaches the markdown,
    but under the labelled ``## Image Content`` heading rather than inline.
    """
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake content")

    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")

    docling_stub = mocker.AsyncMock()
    docling_stub.process_document.return_value = {
        "content": "# Parsed PDF text\n\nNo image markers anywhere.\n"
    }
    mocker.patch(
        "app.services.docling_service.create_docling_service",
        return_value=docling_stub,
    )

    described = _fake_extraction_result(
        {"page": 1, "name": "Im0", "desc": "An image description."}
    )
    mocker.patch(
        "app.etl_pipeline.picture_describer.describe_pictures",
        new=mocker.AsyncMock(return_value=described),
    )

    service = EtlPipelineService(vision_llm=mocker.MagicMock())
    result = await service.extract(
        EtlRequest(file_path=str(source_pdf), filename="report.pdf")
    )
    md = result.markdown_content

    expected_fragments = (
        "# Parsed PDF text",
        "## Image Content (vision-LLM extracted)",
        "**Embedded image:** `Im0`",
        "An image description.",
    )
    for fragment in expected_fragments:
        assert fragment in md
|
||||
|
||||
|
||||
async def test_extract_pdf_without_vision_llm_skips_picture_descriptions(
    tmp_path, mocker
):
    """With no vision LLM configured the parser markdown is returned as-is.

    ``describe_pictures`` is patched only so the test can verify that it
    is never invoked.
    """
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake content")

    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")

    docling_stub = mocker.AsyncMock()
    docling_stub.process_document.return_value = {"content": "# Parsed PDF text"}
    mocker.patch(
        "app.services.docling_service.create_docling_service",
        return_value=docling_stub,
    )

    describe_spy = mocker.patch(
        "app.etl_pipeline.picture_describer.describe_pictures",
        new=mocker.AsyncMock(),
    )

    service = EtlPipelineService()
    result = await service.extract(
        EtlRequest(file_path=str(source_pdf), filename="report.pdf")
    )

    assert result.markdown_content == "# Parsed PDF text"
    assert "<image" not in result.markdown_content
    describe_spy.assert_not_called()
|
||||
|
||||
|
||||
async def test_extract_pdf_with_vision_llm_swallows_describe_failure(
    tmp_path, mocker
):
    """An exception from ``describe_pictures`` must not fail the extraction.

    The mocked describer raises ``RuntimeError``; the result is expected
    to carry the plain parser markdown and the DOCLING service label.
    """
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake content")

    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")

    docling_stub = mocker.AsyncMock()
    docling_stub.process_document.return_value = {"content": "# Parsed PDF text"}
    mocker.patch(
        "app.services.docling_service.create_docling_service",
        return_value=docling_stub,
    )

    exploding_describer = mocker.AsyncMock(
        side_effect=RuntimeError("pypdf exploded")
    )
    mocker.patch(
        "app.etl_pipeline.picture_describer.describe_pictures",
        new=exploding_describer,
    )

    service = EtlPipelineService(vision_llm=mocker.MagicMock())
    result = await service.extract(
        EtlRequest(file_path=str(source_pdf), filename="report.pdf")
    )

    assert result.markdown_content == "# Parsed PDF text"
    assert result.etl_service == "DOCLING"
|
||||
|
||||
|
||||
async def test_extract_pdf_with_vision_llm_no_images_returns_parser_text(
    tmp_path, mocker
):
    """A vision-LLM-enabled extraction that finds no images changes nothing.

    ``describe_pictures`` is mocked to return an empty extraction result,
    so the parser markdown must come back verbatim.
    """
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake content")

    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")

    docling_stub = mocker.AsyncMock()
    docling_stub.process_document.return_value = {"content": "# Just text, no images"}
    mocker.patch(
        "app.services.docling_service.create_docling_service",
        return_value=docling_stub,
    )

    nothing_found = _fake_extraction_result()
    mocker.patch(
        "app.etl_pipeline.picture_describer.describe_pictures",
        new=mocker.AsyncMock(return_value=nothing_found),
    )

    service = EtlPipelineService(vision_llm=mocker.MagicMock())
    result = await service.extract(
        EtlRequest(file_path=str(source_pdf), filename="report.pdf")
    )

    assert result.markdown_content == "# Just text, no images"
    assert "<image" not in result.markdown_content
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Per-image OCR runner: wiring + behaviour
|
||||
#
|
||||
# When extracting a PDF with a vision LLM, the ETL service must ALSO
|
||||
# pass an ``ocr_runner`` to picture_describer. The runner is a closure
|
||||
# that re-feeds each extracted image through a vision-LLM-less
|
||||
# EtlPipelineService -- i.e. the same OCR engine that handles
|
||||
# standalone image uploads (Docling/Azure DI/LlamaCloud) gets a crack
|
||||
# at each embedded image, with the text attached to the inline block.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def test_extract_pdf_passes_ocr_runner_to_describe_pictures(
    tmp_path, mocker
):
    """``describe_pictures`` must be awaited once with a callable ``ocr_runner``."""
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake content")

    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")

    docling_stub = mocker.AsyncMock()
    docling_stub.process_document.return_value = {"content": "# Parsed PDF text"}
    mocker.patch(
        "app.services.docling_service.create_docling_service",
        return_value=docling_stub,
    )

    describe_spy = mocker.patch(
        "app.etl_pipeline.picture_describer.describe_pictures",
        new=mocker.AsyncMock(return_value=_fake_extraction_result()),
    )

    service = EtlPipelineService(vision_llm=mocker.MagicMock())
    await service.extract(
        EtlRequest(file_path=str(source_pdf), filename="report.pdf")
    )

    describe_spy.assert_awaited_once()
    # await_args unpacks to (positional args, keyword args).
    _, passed_kwargs = describe_spy.await_args
    assert "ocr_runner" in passed_kwargs
    assert callable(passed_kwargs["ocr_runner"])
|
||||
|
||||
|
||||
async def test_extract_pdf_ocr_runner_invokes_document_parser_on_image(
    tmp_path, mocker
):
    """The OCR runner closure re-extracts an image through the parser.

    The runner handed to ``describe_pictures`` is captured, called with a
    fake image path, and expected to return the mocked Docling content.
    Docling being awaited a second time shows the closure reaches the
    document parser rather than the vision LLM.
    """
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake content")
    embedded_image = tmp_path / "Im0.png"
    embedded_image.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 100)

    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")

    docling_stub = mocker.AsyncMock()
    docling_stub.process_document.return_value = {
        "content": "Slice 24 / 60 L R"
    }
    mocker.patch(
        "app.services.docling_service.create_docling_service",
        return_value=docling_stub,
    )

    seen: dict = {}

    async def grab_runner(*args, **kwargs):
        # Stand-in for describe_pictures: remember the runner, describe nothing.
        seen["runner"] = kwargs["ocr_runner"]
        return _fake_extraction_result()

    mocker.patch(
        "app.etl_pipeline.picture_describer.describe_pictures",
        new=grab_runner,
    )

    service = EtlPipelineService(vision_llm=mocker.MagicMock())
    await service.extract(
        EtlRequest(file_path=str(source_pdf), filename="report.pdf")
    )

    ocr_runner = seen["runner"]
    recovered_text = await ocr_runner(str(embedded_image), "Im0.png")

    assert recovered_text == "Slice 24 / 60 L R"
    # Two parser calls overall: the PDF itself, then the re-fed image.
    assert docling_stub.process_document.await_count == 2
|
||||
|
||||
|
||||
async def test_extract_pdf_ocr_runner_returns_empty_on_unsupported_image(
    tmp_path, mocker
):
    """An unsupported image format makes the runner return ``""``, not raise.

    One embedded image in a format the pipeline rejects (a ``.jp2`` file
    here) should not spoil the whole PDF extraction: the runner is
    expected to swallow the error and report no OCR text.
    """
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake content")
    # JPEG2000 payload; the format is expected to be unsupported.
    odd_image = tmp_path / "Im0.jp2"
    odd_image.write_bytes(b"\x00\x00\x00\x0CjP" + b"\x00" * 50)

    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")

    docling_stub = mocker.AsyncMock()
    docling_stub.process_document.return_value = {"content": "# Parsed PDF text"}
    mocker.patch(
        "app.services.docling_service.create_docling_service",
        return_value=docling_stub,
    )

    seen: dict = {}

    async def grab_runner(*args, **kwargs):
        # Stand-in for describe_pictures: remember the runner, describe nothing.
        seen["runner"] = kwargs["ocr_runner"]
        return _fake_extraction_result()

    mocker.patch(
        "app.etl_pipeline.picture_describer.describe_pictures",
        new=grab_runner,
    )

    service = EtlPipelineService(vision_llm=mocker.MagicMock())
    await service.extract(
        EtlRequest(file_path=str(source_pdf), filename="report.pdf")
    )

    ocr_runner = seen["runner"]
    recovered_text = await ocr_runner(str(odd_image), "Im0.jp2")

    assert recovered_text == ""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Processing Mode enum tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
|
|||
|
|
@ -0,0 +1,967 @@
|
|||
"""Unit tests for the picture_describer module.
|
||||
|
||||
Covers:
|
||||
|
||||
- :func:`describe_pictures` -- the PDF image walker + per-image vision
|
||||
LLM call (structured output split into ``ocr_text`` and
|
||||
``description``);
|
||||
- :func:`inject_descriptions_inline` -- in-place replacement of image
|
||||
placeholders / captions in the parser markdown;
|
||||
- :func:`merge_descriptions_into_markdown` -- the top-level helper
|
||||
that inlines what it can and appends what it can't;
|
||||
- :func:`render_appended_section` -- the appended-fallback renderer.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from app.etl_pipeline.picture_describer import (
|
||||
PictureDescription,
|
||||
PictureExtractionResult,
|
||||
describe_pictures,
|
||||
inject_descriptions_inline,
|
||||
merge_descriptions_into_markdown,
|
||||
render_appended_section,
|
||||
)
|
||||
|
||||
pytestmark = pytest.mark.unit
|
||||
|
||||
|
||||
def _make_image_obj(name: str, data: bytes):
    """Return a mock exposing the ``name`` and ``data`` attributes the tests read.

    Stands in for the image objects pypdf yields from ``page.images``.
    """
    stub = MagicMock(data=data)
    # ``name`` is assigned afterwards: as a constructor keyword it would
    # name the mock itself instead of setting an attribute.
    stub.name = name
    return stub
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# describe_pictures: short-circuits
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def test_describe_pictures_no_op_for_non_pdf(tmp_path):
    """A non-PDF input yields an empty result and never touches the LLM."""
    source_docx = tmp_path / "report.docx"
    source_docx.write_bytes(b"PK fake docx")

    llm = AsyncMock()
    outcome = await describe_pictures(str(source_docx), "report.docx", llm)

    assert outcome.descriptions == []
    assert outcome.skipped_too_large == 0
    llm.ainvoke.assert_not_called()
|
||||
|
||||
|
||||
async def test_describe_pictures_no_op_when_vision_llm_is_none(tmp_path):
    """Passing ``None`` as the vision LLM yields no descriptions, even for a PDF."""
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake")

    outcome = await describe_pictures(str(source_pdf), "report.pdf", None)

    assert outcome.descriptions == []
|
||||
|
||||
|
||||
async def test_describe_pictures_no_op_for_pdf_with_no_images(tmp_path, mocker):
    """A readable PDF whose pages hold no images produces an empty result."""
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake")

    # Two pages, neither carrying an image.
    reader_stub = MagicMock()
    reader_stub.pages = [MagicMock(images=[]), MagicMock(images=[])]
    mocker.patch("pypdf.PdfReader", return_value=reader_stub)

    llm = AsyncMock()
    outcome = await describe_pictures(str(source_pdf), "report.pdf", llm)

    assert outcome.descriptions == []
    llm.ainvoke.assert_not_called()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# describe_pictures: happy paths
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def test_describe_pictures_runs_vision_llm_per_image(tmp_path, mocker):
    """Each eligible image triggers exactly one description call.

    Two pages with one image each; the patched
    ``parse_image_for_description`` hands back a distinct description per
    call.
    """
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake")

    jpeg_image = _make_image_obj("Im0.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
    png_image = _make_image_obj("Im1.png", b"\x89PNG\r\n\x1a\n" + b"\xcd" * 2000)

    reader_stub = MagicMock()
    reader_stub.pages = [
        MagicMock(images=[jpeg_image]),
        MagicMock(images=[png_image]),
    ]
    mocker.patch("pypdf.PdfReader", return_value=reader_stub)

    describe_spy = mocker.patch(
        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
        new=AsyncMock(side_effect=["Description A", "Description B"]),
    )

    llm = MagicMock()
    outcome = await describe_pictures(str(source_pdf), "report.pdf", llm)

    assert len(outcome.descriptions) == 2
    description_by_name = {}
    for item in outcome.descriptions:
        description_by_name[item.name] = item.description
    assert description_by_name == {
        "Im0.jpeg": "Description A",
        "Im1.png": "Description B",
    }
    for item in outcome.descriptions:
        assert item.page_number in (1, 2)
    assert describe_spy.await_count == 2
|
||||
|
||||
|
||||
async def test_describe_pictures_dedups_by_hash(tmp_path, mocker):
    """Identical image bytes on three pages are described only once.

    The two repeats must be counted in ``skipped_duplicate``.
    """
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake")

    logo_bytes = b"\x89PNG\r\n\x1a\n" + b"\x42" * 2000

    # Three pages, each with its own image object carrying the same payload.
    reader_stub = MagicMock()
    reader_stub.pages = [
        MagicMock(images=[_make_image_obj("logo.png", logo_bytes)])
        for _ in range(3)
    ]
    mocker.patch("pypdf.PdfReader", return_value=reader_stub)

    describe_spy = mocker.patch(
        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
        new=AsyncMock(return_value="Logo desc"),
    )

    llm = MagicMock()
    outcome = await describe_pictures(str(source_pdf), "report.pdf", llm)

    assert len(outcome.descriptions) == 1
    assert outcome.skipped_duplicate == 2
    assert describe_spy.await_count == 1
|
||||
|
||||
|
||||
async def test_describe_pictures_skips_too_small_images(tmp_path, mocker):
    """A tiny image is skipped and counted; the larger one is described."""
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake")

    # An 8-byte payload next to one of roughly 3 KB.
    speck = _make_image_obj("dot.png", b"\x89PNG\r\n\x1a\n")
    scan = _make_image_obj("ct.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 3000)

    reader_stub = MagicMock()
    reader_stub.pages = [MagicMock(images=[speck, scan])]
    mocker.patch("pypdf.PdfReader", return_value=reader_stub)

    describe_spy = mocker.patch(
        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
        new=AsyncMock(return_value="CT scan"),
    )

    llm = MagicMock()
    outcome = await describe_pictures(str(source_pdf), "report.pdf", llm)

    assert len(outcome.descriptions) == 1
    assert outcome.descriptions[0].name == "ct.jpeg"
    assert outcome.skipped_too_small == 1
    assert describe_spy.await_count == 1
|
||||
|
||||
|
||||
async def test_describe_pictures_skips_too_large_images(tmp_path, mocker):
    """An oversized image is skipped and counted; the normal one is described."""
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake")

    # A 6 MiB payload next to one of roughly 2 KB.
    oversized = _make_image_obj("huge.jpeg", b"\xff" * (6 * 1024 * 1024))
    regular = _make_image_obj("ok.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)

    reader_stub = MagicMock()
    reader_stub.pages = [MagicMock(images=[oversized, regular])]
    mocker.patch("pypdf.PdfReader", return_value=reader_stub)

    describe_spy = mocker.patch(
        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
        new=AsyncMock(return_value="OK image"),
    )

    llm = MagicMock()
    outcome = await describe_pictures(str(source_pdf), "report.pdf", llm)

    assert len(outcome.descriptions) == 1
    assert outcome.descriptions[0].name == "ok.jpeg"
    assert outcome.skipped_too_large == 1
    assert describe_spy.await_count == 1
|
||||
|
||||
|
||||
async def test_describe_pictures_swallows_per_image_failure(tmp_path, mocker):
    """One failing description call is counted as failed, not propagated.

    The patched describer raises on its first call and succeeds on the
    second, so exactly one description must survive.
    """
    source_pdf = tmp_path / "report.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake")

    first = _make_image_obj("a.jpeg", b"\xff\xd8" + b"\xab" * 2000)
    second = _make_image_obj("b.jpeg", b"\xff\xd8" + b"\xcd" * 2000)

    reader_stub = MagicMock()
    reader_stub.pages = [MagicMock(images=[first, second])]
    mocker.patch("pypdf.PdfReader", return_value=reader_stub)

    call_outcomes = [RuntimeError("vision blew up"), "Success"]
    mocker.patch(
        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
        new=AsyncMock(side_effect=call_outcomes),
    )

    llm = MagicMock()
    outcome = await describe_pictures(str(source_pdf), "report.pdf", llm)

    assert len(outcome.descriptions) == 1
    assert outcome.descriptions[0].description == "Success"
    assert outcome.failed == 1
|
||||
|
||||
|
||||
async def test_describe_pictures_handles_pypdf_open_failure(tmp_path, mocker):
    """If ``pypdf.PdfReader`` raises on open, the result has no descriptions."""
    broken_pdf = tmp_path / "broken.pdf"
    broken_pdf.write_bytes(b"not a pdf")

    mocker.patch("pypdf.PdfReader", side_effect=ValueError("EOF marker not found"))

    llm = MagicMock()
    outcome = await describe_pictures(str(broken_pdf), "broken.pdf", llm)

    assert outcome.descriptions == []
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# inject_descriptions_inline: replacement patterns
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _desc(name="Im0", description="A CT scan."):
    """Return a page-1, ordinal-0 ``PictureDescription`` with a fixed hash.

    Only ``name`` and ``description`` vary between the tests that use it.
    """
    fields = {
        "page_number": 1,
        "ordinal_in_page": 0,
        "name": name,
        "sha256": "aa",
        "description": description,
    }
    return PictureDescription(**fields)
|
||||
|
||||
|
||||
def test_inject_no_op_when_no_descriptions():
    """An empty extraction result leaves the markdown untouched and inlines zero."""
    original = "# Title\n\nbody text\n"

    rewritten, inlined = inject_descriptions_inline(
        original, PictureExtractionResult()
    )

    assert rewritten == original
    assert inlined == 0
|
||||
|
||||
|
||||
def test_inject_replaces_placeholder_with_caption():
    """`<!-- image -->` plus an `Image: <name>` line collapse into one block.

    The caption's filename labels the block (rather than the pypdf name
    ``Im0``), and both the placeholder and the caption line disappear.
    """
    source_md = (
        "# Case\n\n"
        "Clinical text...\n\n"
        "<!-- image -->\nImage: MM-130-a.jpeg\n\n"
        "Answer choices: A) ...\n"
    )
    extraction = PictureExtractionResult(descriptions=[_desc(name="Im0")])

    out, inlined = inject_descriptions_inline(source_md, extraction)

    assert inlined == 1
    assert "<!-- image -->" not in out
    # The caption line is consumed along with the placeholder.
    assert "Image: MM-130-a.jpeg" not in out

    # Block layout: an "Embedded image:" anchor and a named
    # "Visual description:" section, without blockquote wrapping.
    assert "**Embedded image:** `MM-130-a.jpeg`" in out
    assert "**Visual description:**" in out
    assert "A CT scan." in out

    # Horizontal rules set the block apart from surrounding paragraphs.
    assert "\n---\n" in out

    # The fixture has no ocr_text, so no OCR section is expected.
    assert "**OCR text:**" not in out

    # Neither raw tags nor blockquote prefixes may leak.
    assert "<image" not in out
    assert "</image>" not in out
    assert "> **Embedded image:**" not in out

    # Text on either side of the image is preserved.
    assert "Clinical text..." in out
    assert "Answer choices: A) ..." in out
|
||||
|
||||
|
||||
def test_inject_uses_pypdf_name_when_no_caption():
    """A bare `<!-- image -->` placeholder is labelled with the description's name."""
    source_md = "# Case\n\n<!-- image -->\n\nMore text\n"
    extraction = PictureExtractionResult(descriptions=[_desc(name="Im0")])

    out, inlined = inject_descriptions_inline(source_md, extraction)

    assert inlined == 1
    assert "**Embedded image:** `Im0`" in out
|
||||
|
||||
|
||||
def test_inject_replaces_bare_caption():
    """An ``Image: <name>`` caption without a placeholder is still replaced."""
    source_md = "# Case\n\nText...\nImage: scan.jpeg\nMore text\n"
    extraction = PictureExtractionResult(descriptions=[_desc(name="Im0")])

    rendered, inlined = inject_descriptions_inline(source_md, extraction)

    assert inlined == 1
    # The caption's file name becomes the block label ...
    assert "**Embedded image:** `scan.jpeg`" in rendered
    # ... and the raw caption line itself is consumed.
    assert "Image: scan.jpeg" not in rendered
|
||||
|
||||
|
||||
def test_inject_handles_multiple_images_in_order():
    """Two placeholders and two descriptions are paired in document order."""
    source_md = "".join(
        [
            "Page 1\n\n<!-- image -->\nImage: a.jpeg\n\n",
            "Between\n\n<!-- image -->\nImage: b.jpeg\n\nEnd\n",
        ]
    )
    # (page_number, name, sha256, description) for each extracted picture.
    specs = [
        (1, "Im0", "aa", "Desc A"),
        (2, "Im1", "bb", "Desc B"),
    ]
    extraction = PictureExtractionResult(
        descriptions=[
            PictureDescription(
                page_number=page,
                ordinal_in_page=0,
                name=name,
                sha256=digest,
                description=text,
            )
            for page, name, digest, text in specs
        ]
    )

    rendered, inlined = inject_descriptions_inline(source_md, extraction)

    assert inlined == 2
    for caption in ("a.jpeg", "b.jpeg"):
        assert f"**Embedded image:** `{caption}`" in rendered
    assert rendered.index("a.jpeg") < rendered.index("b.jpeg")
    assert "Desc A" in rendered
    assert "Desc B" in rendered
|
||||
|
||||
|
||||
def test_inject_returns_remaining_count_when_more_descriptions_than_markers():
    """Three descriptions but one marker: only the first is inlined.

    The returned count is the number of descriptions actually placed.
    """
    source_md = "Just one <!-- image --> here.\n"
    labelled = [("Im0", "First"), ("Im1", "Second"), ("Im2", "Third")]
    extraction = PictureExtractionResult(
        descriptions=[_desc(name=name, description=text) for name, text in labelled]
    )

    rendered, inlined = inject_descriptions_inline(source_md, extraction)

    assert inlined == 1
    assert "**Embedded image:** `Im0`" in rendered
    assert "**Embedded image:** `Im1`" not in rendered
|
||||
|
||||
|
||||
def test_inject_returns_zero_when_no_markers_present():
    """Without any image marker the markdown comes back untouched."""
    source_md = "# Title\n\nJust text. No images mentioned at all.\n"
    extraction = PictureExtractionResult(descriptions=[_desc(name="Im0")])

    rendered, inlined = inject_descriptions_inline(source_md, extraction)

    assert rendered == source_md
    assert inlined == 0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# render_appended_section
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_render_appended_empty_when_nothing_passed():
    """An empty description list renders to the empty string."""
    section = render_appended_section([])
    assert section == ""
|
||||
|
||||
|
||||
def test_render_appended_renders_each_image_as_block():
    """Each description gets its own rule-delimited block under the heading."""
    images = {"MM-130-a.jpeg": "CT scan", "MM-130-b.jpeg": "Bar chart"}
    section = render_appended_section(
        [_desc(name=name, description=text) for name, text in images.items()]
    )

    assert "## Image Content (vision-LLM extracted)" in section
    for name, text in images.items():
        assert f"**Embedded image:** `{name}`" in section
        assert text in section
    # Horizontal rules separate the individual image blocks.
    assert section.count("\n---\n") >= 2
    # Neither raw HTML / XML tags, blockquote prefixes nor an OCR section
    # may show up for these OCR-less descriptions.
    for forbidden in ("<image", "> **Embedded image:**", "**OCR text:**"):
        assert forbidden not in section
|
||||
|
||||
|
||||
def test_render_appended_includes_skip_notes():
    """Skip / failure counters are summarised in a trailing note."""
    described = [_desc()]
    counters = PictureExtractionResult(
        descriptions=described,
        skipped_too_small=2,
        skipped_too_large=1,
        skipped_duplicate=3,
        failed=1,
    )

    section = render_appended_section(described, skip_notes=counters)

    expected_fragments = (
        "_Note:",
        "2 too small",
        "1 too large",
        "3 duplicate",
        "1 failed",
    )
    for fragment in expected_fragments:
        assert fragment in section
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# merge_descriptions_into_markdown: top-level
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_merge_inlines_when_marker_present():
    """A matching marker means the description is inlined, not appended."""
    source_md = "Text...\n\n<!-- image -->\nImage: scan.jpeg\n\nMore text\n"
    extraction = PictureExtractionResult(descriptions=[_desc(name="Im0")])

    merged = merge_descriptions_into_markdown(source_md, extraction)

    assert "**Embedded image:** `scan.jpeg`" in merged
    # Everything was placed inline, so the appended-section heading must
    # not appear anywhere in the output.
    assert "## Image Content" not in merged
|
||||
|
||||
|
||||
def test_merge_appends_when_no_marker_present():
    """With no markers at all, every description lands in an appended section."""
    source_md = "Pure text doc, no image markers.\n"
    only_description = _desc(name="Im0", description="An image desc.")
    extraction = PictureExtractionResult(descriptions=[only_description])

    merged = merge_descriptions_into_markdown(source_md, extraction)

    expected_fragments = (
        "Pure text doc",
        "## Image Content (vision-LLM extracted)",
        "**Embedded image:** `Im0`",
    )
    for fragment in expected_fragments:
        assert fragment in merged
|
||||
|
||||
|
||||
def test_merge_appends_leftovers_with_distinct_heading():
    """One marker, two descriptions: the surplus one is appended.

    The first description is inlined at the marker; the second goes under
    a heading that flags it as having had no inline marker.
    """
    source_md = "Text\n\n<!-- image -->\nImage: a.jpeg\n\nEnd\n"
    labelled = [("Im0", "First"), ("Im1", "Second")]
    extraction = PictureExtractionResult(
        descriptions=[_desc(name=name, description=text) for name, text in labelled]
    )

    merged = merge_descriptions_into_markdown(source_md, extraction)

    # Inlined at the marker, labelled with the caption's file name.
    assert "**Embedded image:** `a.jpeg`" in merged
    # Leftover section heading plus the leftover block itself.
    assert "## Image Content (additional, no inline marker found)" in merged
    assert "**Embedded image:** `Im1`" in merged
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# describe_pictures: ocr_runner integration
|
||||
#
|
||||
# These tests cover the per-image OCR side-channel: when the caller
|
||||
# supplies an ``ocr_runner`` callable, each extracted image is sent
|
||||
# both to the vision LLM (visual description) and to the OCR runner
|
||||
# (text-in-image), in parallel. The OCR text -- if any -- is recorded
|
||||
# on the PictureDescription and rendered in the inline block.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def test_describe_pictures_calls_ocr_runner_per_image(tmp_path, mocker):
    """A supplied ocr_runner is awaited once for every eligible image."""
    pdf_path = tmp_path / "report.pdf"
    pdf_path.write_bytes(b"%PDF-1.4 fake")

    # One fake page carrying a JPEG and a PNG.
    jpeg_obj = _make_image_obj("Im0.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
    png_obj = _make_image_obj("Im1.png", b"\x89PNG\r\n\x1a\n" + b"\xcd" * 2000)
    reader_stub = MagicMock()
    reader_stub.pages = [MagicMock(images=[jpeg_obj, png_obj])]
    mocker.patch("pypdf.PdfReader", return_value=reader_stub)

    vision_stub = AsyncMock(side_effect=["Visual A", "Visual B"])
    mocker.patch(
        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
        new=vision_stub,
    )
    ocr_runner = AsyncMock(side_effect=["OCR text A", "OCR text B"])

    llm_stub = MagicMock()
    outcome = await describe_pictures(
        str(pdf_path), "report.pdf", llm_stub, ocr_runner=ocr_runner
    )

    assert ocr_runner.await_count == 2
    ocr_by_image = {}
    for item in outcome.descriptions:
        ocr_by_image[item.name] = item.ocr_text
    assert ocr_by_image == {"Im0.jpeg": "OCR text A", "Im1.png": "OCR text B"}
|
||||
|
||||
|
||||
async def test_describe_pictures_runs_vision_and_ocr_in_parallel(
    tmp_path, mocker
):
    """Vision LLM and OCR run concurrently per image, not sequentially.

    Each fake records the moment it started and the moment it finished.
    The two calls ran in parallel exactly when those intervals overlap,
    i.e. the later of the two starts precedes the earlier of the two
    finishes. Comparing intervals instead of bounding the total wall
    time keeps the test deterministic on slow or heavily loaded CI
    machines, where fixed overhead can easily exceed a tight time budget.
    """
    import asyncio
    import time

    pdf_file = tmp_path / "report.pdf"
    pdf_file.write_bytes(b"%PDF-1.4 fake")

    img = _make_image_obj("Im0.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
    fake_reader = MagicMock()
    fake_reader.pages = [MagicMock(images=[img])]
    mocker.patch("pypdf.PdfReader", return_value=fake_reader)

    sleep_each = 0.05  # 50ms per call
    spans = {}  # call label -> (started_at, finished_at)

    async def slow_vision(*args, **kwargs):
        begun = time.perf_counter()
        await asyncio.sleep(sleep_each)
        spans["vision"] = (begun, time.perf_counter())
        return "Visual"

    async def slow_ocr(*args, **kwargs):
        begun = time.perf_counter()
        await asyncio.sleep(sleep_each)
        spans["ocr"] = (begun, time.perf_counter())
        return "OCR"

    mocker.patch(
        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
        new=slow_vision,
    )

    fake_llm = MagicMock()
    result = await describe_pictures(
        str(pdf_file), "report.pdf", fake_llm, ocr_runner=slow_ocr
    )

    assert len(result.descriptions) == 1
    assert result.descriptions[0].ocr_text == "OCR"
    # Both fakes must actually have been awaited.
    assert set(spans) == {"vision", "ocr"}
    # Sequential execution would finish one call before starting the
    # other; parallel execution starts both before either finishes.
    latest_start = max(started for started, _ in spans.values())
    earliest_finish = min(finished for _, finished in spans.values())
    assert latest_start < earliest_finish, (
        "vision+OCR appear to be sequential "
        f"(second call started {latest_start - earliest_finish:.3f}s "
        "after the first one finished)"
    )
|
||||
|
||||
|
||||
async def test_describe_pictures_treats_empty_ocr_as_none(tmp_path, mocker):
    """Whitespace-only OCR output is stored as ``None``.

    Images with no text in them (a clean radiograph, say) therefore do
    not end up with an empty "OCR text" section in the rendered block.
    """
    pdf_path = tmp_path / "report.pdf"
    pdf_path.write_bytes(b"%PDF-1.4 fake")

    image_obj = _make_image_obj("scan.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
    reader_stub = MagicMock()
    reader_stub.pages = [MagicMock(images=[image_obj])]
    mocker.patch("pypdf.PdfReader", return_value=reader_stub)

    vision_stub = AsyncMock(return_value="A radiograph.")
    mocker.patch(
        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
        new=vision_stub,
    )
    ocr_runner = AsyncMock(return_value=" \n \n")

    llm_stub = MagicMock()
    outcome = await describe_pictures(
        str(pdf_path), "report.pdf", llm_stub, ocr_runner=ocr_runner
    )

    assert len(outcome.descriptions) == 1
    (only,) = outcome.descriptions
    assert only.ocr_text is None
|
||||
|
||||
|
||||
async def test_describe_pictures_swallows_ocr_runner_failure(tmp_path, mocker):
    """A raising OCR runner does not cost the image its description.

    OCR is a supplement to the vision LLM's description. When it raises,
    the description is kept, ``ocr_text`` stays ``None`` and the image is
    not counted as failed.
    """
    pdf_path = tmp_path / "report.pdf"
    pdf_path.write_bytes(b"%PDF-1.4 fake")

    image_obj = _make_image_obj("scan.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
    reader_stub = MagicMock()
    reader_stub.pages = [MagicMock(images=[image_obj])]
    mocker.patch("pypdf.PdfReader", return_value=reader_stub)

    vision_stub = AsyncMock(return_value="A radiograph.")
    mocker.patch(
        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
        new=vision_stub,
    )
    ocr_runner = AsyncMock(side_effect=RuntimeError("OCR backend down"))

    llm_stub = MagicMock()
    outcome = await describe_pictures(
        str(pdf_path), "report.pdf", llm_stub, ocr_runner=ocr_runner
    )

    assert len(outcome.descriptions) == 1
    survivor = outcome.descriptions[0]
    assert survivor.description == "A radiograph."
    assert survivor.ocr_text is None
    # Only the OCR side-channel failed, not the image itself.
    assert outcome.failed == 0
|
||||
|
||||
|
||||
async def test_describe_pictures_vision_failure_with_ocr_runner_skips_image(
    tmp_path, mocker
):
    """A vision-LLM failure drops the image even when OCR succeeded.

    The visual description is the point of the inline block. An OCR-only
    block would wrongly suggest the vision pipeline had run, so a vision
    failure counts as an image failure whatever OCR returned.
    """
    pdf_path = tmp_path / "report.pdf"
    pdf_path.write_bytes(b"%PDF-1.4 fake")

    image_obj = _make_image_obj("scan.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
    reader_stub = MagicMock()
    reader_stub.pages = [MagicMock(images=[image_obj])]
    mocker.patch("pypdf.PdfReader", return_value=reader_stub)

    vision_stub = AsyncMock(side_effect=RuntimeError("vision blew up"))
    mocker.patch(
        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
        new=vision_stub,
    )
    ocr_runner = AsyncMock(return_value="OCR text")

    llm_stub = MagicMock()
    outcome = await describe_pictures(
        str(pdf_path), "report.pdf", llm_stub, ocr_runner=ocr_runner
    )

    assert outcome.failed == 1
    assert outcome.descriptions == []
|
||||
|
||||
|
||||
async def test_describe_pictures_no_ocr_runner_keeps_ocr_text_none(
    tmp_path, mocker
):
    """Backward compatibility: no ocr_runner means no OCR text on the result."""
    pdf_path = tmp_path / "report.pdf"
    pdf_path.write_bytes(b"%PDF-1.4 fake")

    image_obj = _make_image_obj("Im0.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
    reader_stub = MagicMock()
    reader_stub.pages = [MagicMock(images=[image_obj])]
    mocker.patch("pypdf.PdfReader", return_value=reader_stub)

    vision_stub = AsyncMock(return_value="Visual")
    mocker.patch(
        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
        new=vision_stub,
    )

    llm_stub = MagicMock()
    outcome = await describe_pictures(str(pdf_path), "report.pdf", llm_stub)

    assert len(outcome.descriptions) == 1
    (only,) = outcome.descriptions
    assert only.ocr_text is None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Rendering: "OCR text" section appears iff PictureDescription.ocr_text is set
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _desc_with_ocr(name="Im0", description="A CT scan.", ocr_text="L R 10mm"):
    """Build a page-1 ``PictureDescription`` fixture that carries OCR text."""
    fields = {
        "page_number": 1,
        "ordinal_in_page": 0,
        "name": name,
        "sha256": "aa",
        "description": description,
        "ocr_text": ocr_text,
    }
    return PictureDescription(**fields)
|
||||
|
||||
|
||||
def test_inject_renders_ocr_section_when_ocr_text_present():
    """A description with OCR text renders an "OCR text" section inline."""
    source_md = "Text\n\n<!-- image -->\nImage: scan.jpeg\n\nMore\n"
    with_ocr = _desc_with_ocr(name="Im0", ocr_text="L R 10mm")
    extraction = PictureExtractionResult(descriptions=[with_ocr])

    rendered, inlined = inject_descriptions_inline(source_md, extraction)

    assert inlined == 1
    for fragment in ("**Embedded image:** `scan.jpeg`", "**OCR text:**", "L R 10mm"):
        assert fragment in rendered
    # Literal text first, interpretation second: the OCR section must
    # precede the visual description.
    ocr_at = rendered.index("**OCR text:**")
    visual_at = rendered.index("**Visual description:**")
    assert ocr_at < visual_at
    # Earlier formats used nested-block constructs (fenced code,
    # blockquote) which broke in Streamdown / PlateJS by escaping their
    # container and dropping content; none of them may reappear.
    assert "```" not in rendered
    assert "> **" not in rendered
|
||||
|
||||
|
||||
def test_inject_renders_multiline_ocr_with_hard_breaks():
    """Multi-line OCR uses trailing-two-spaces hard breaks so each
    line renders on its own row, without needing a fragile fenced
    code block or blockquote wrapper."""
    markdown = "Text\n\n<!-- image -->\nImage: scan.jpeg\n\nMore\n"
    ocr_multi = "Slice 24 / 60\nL\nR\n10 mm"
    result = PictureExtractionResult(
        descriptions=[_desc_with_ocr(name="Im0", ocr_text=ocr_multi)]
    )

    out, _ = inject_descriptions_inline(markdown, result)

    # A Markdown hard line break is TWO trailing spaces before the
    # newline. Spelling it out once keeps the assertions below from
    # silently degrading to a single space, which would match neither a
    # real hard break nor catch a stray one on the last line.
    hard_break = "  \n"

    # Every OCR line is present.
    for line in ("Slice 24 / 60", "L", "R", "10 mm"):
        assert line in out
    # Non-last OCR lines get the trailing two-space hard break.
    assert "Slice 24 / 60" + hard_break in out
    assert "\nL" + hard_break in out
    assert "\nR" + hard_break in out
    # Last OCR line must NOT carry the two-space hard break (no stray <br>).
    assert "10 mm" + hard_break not in out
    assert "10 mm\n" in out
|
||||
|
||||
|
||||
def test_render_appended_renders_ocr_section_when_ocr_text_present():
    """The appended section also shows OCR text when the description has it."""
    with_ocr = _desc_with_ocr(
        name="MM-130-a.jpeg",
        description="Axial CT.",
        ocr_text="Slice 24 / 60",
    )

    section = render_appended_section([with_ocr])

    for fragment in ("**OCR text:**", "Slice 24 / 60", "Axial CT."):
        assert fragment in section
|
||||
|
||||
|
||||
def test_render_omits_ocr_section_when_ocr_text_is_none():
    """A description without OCR text renders no "OCR text" section."""
    plain = _desc(name="Im0", description="A clean radiograph.")

    section = render_appended_section([plain])

    assert "**Embedded image:** `Im0`" in section
    assert "**Visual description:**" in section
    assert "**OCR text:**" not in section
    # Neither raw HTML nor blockquote prefixes may leak into the output.
    for forbidden in ("<image", "> **"):
        assert forbidden not in section
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# inject_descriptions_inline: <figure> blocks (layout-aware parsers)
|
||||
#
|
||||
# Azure Document Intelligence's ``prebuilt-layout`` and LlamaCloud
|
||||
# premium both emit ``<figure>...</figure>`` blocks that already contain
|
||||
# the parser's own OCR of the figure (chart bar values, axis labels,
|
||||
# inline ``<figcaption>``, embedded ``<table>`` for tabular figures).
|
||||
# That parser-side content is useful for retrieval on its own, so we
|
||||
# PRESERVE the figure verbatim and append our vision-LLM block
|
||||
# immediately after rather than substituting for it.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_inject_appends_block_after_figure_preserving_parser_content():
    """The ``<figure>`` stays as-is and the vision-LLM block follows it."""
    source_md = "".join(
        [
            "Some narrative text.\n\n",
            "<figure>\n\n",
            "Republican\n68\nDemocrat\n30\n",
            "\n</figure>\n\n",
            "Following paragraph.\n",
        ]
    )
    chart = _desc(name="Im0", description="Bar chart of party ID.")
    extraction = PictureExtractionResult(descriptions=[chart])

    rendered, inlined = inject_descriptions_inline(source_md, extraction)

    assert inlined == 1
    # The parser's own figure content (including its OCR'd numbers) must
    # survive so it remains searchable.
    for kept in ("<figure>", "</figure>", "Republican", "68"):
        assert kept in rendered
    # The vision-LLM block is present ...
    assert "**Embedded image:** `Im0`" in rendered
    assert "Bar chart of party ID." in rendered
    # ... and positioned after the closing tag, not before or inside it.
    closing_at = rendered.index("</figure>")
    block_at = rendered.index("**Embedded image:** `Im0`")
    assert closing_at < block_at, "block must be appended AFTER </figure>"
    # Narrative on both sides of the figure is untouched.
    assert "Some narrative text." in rendered
    assert "Following paragraph." in rendered
|
||||
|
||||
|
||||
def test_inject_handles_multiple_figures_in_document_order():
    """N figures and N descriptions are paired up in document order."""
    source_md = "".join(
        [
            "Page 1\n\n<figure>\nChart A bars\n</figure>\n\n",
            "Between\n\n<figure>\nChart B bars\n</figure>\n\n",
            "End.\n",
        ]
    )
    # (page_number, name, sha256, description) for each extracted picture.
    specs = [
        (1, "Im0", "aa", "Description of chart A."),
        (2, "Im1", "bb", "Description of chart B."),
    ]
    extraction = PictureExtractionResult(
        descriptions=[
            PictureDescription(
                page_number=page,
                ordinal_in_page=0,
                name=name,
                sha256=digest,
                description=text,
            )
            for page, name, digest, text in specs
        ]
    )

    rendered, inlined = inject_descriptions_inline(source_md, extraction)

    assert inlined == 2
    # Both figures survive and both descriptions are present, in order.
    assert rendered.count("<figure>") == 2
    assert rendered.count("</figure>") == 2
    desc_a_at = rendered.index("Description of chart A.")
    desc_b_at = rendered.index("Description of chart B.")
    assert desc_a_at < desc_b_at
    # Each description sits after the closing tag of its own figure.
    first_close = rendered.index("</figure>")
    assert first_close < desc_a_at
    second_close = rendered.index("</figure>", first_close + 1)
    assert second_close < desc_b_at
|
||||
|
||||
|
||||
def test_inject_figures_with_attributes_and_nested_tags():
    """A ``<figure>`` with attributes and nested tags is matched and kept."""
    source_md = "".join(
        [
            '<figure id="fig-3" class="chart">\n',
            "<figcaption>Source: Pew Research</figcaption>\n",
            "<table><tr><td>Republican</td><td>57</td></tr></table>\n",
            "</figure>\n",
        ]
    )
    survey = _desc(name="Im0", description="Survey table.")
    extraction = PictureExtractionResult(descriptions=[survey])

    rendered, inlined = inject_descriptions_inline(source_md, extraction)

    assert inlined == 1
    # Attributes and nested HTML all survive (chunking will pick them up).
    kept_fragments = (
        'id="fig-3"',
        "<figcaption>Source: Pew Research</figcaption>",
        "<table>",
        "Republican",
        "57",
    )
    for kept in kept_fragments:
        assert kept in rendered
    # The vision-LLM block comes after the closing tag.
    closing_at = rendered.index("</figure>")
    block_at = rendered.index("**Embedded image:** `Im0`")
    assert closing_at < block_at
|
||||
|
||||
|
||||
def test_inject_figures_more_descriptions_than_figures_returns_remaining():
    """Three descriptions, one figure: one is inlined, two stay with the caller."""
    source_md = "Text.\n<figure>\nbar values\n</figure>\nMore.\n"
    labelled = [
        ("Im0", "First desc."),
        ("Im1", "Second desc."),
        ("Im2", "Third desc."),
    ]
    extraction = PictureExtractionResult(
        descriptions=[_desc(name=name, description=text) for name, text in labelled]
    )

    rendered, inlined = inject_descriptions_inline(source_md, extraction)

    assert inlined == 1
    assert "First desc." in rendered
    # inject_descriptions_inline does not append leftovers by itself;
    # handling them is up to the caller.
    for leftover in ("Second desc.", "Third desc."):
        assert leftover not in rendered
|
||||
|
||||
|
||||
def test_inject_figures_more_figures_than_descriptions_leaves_extras_untouched():
    """Two figures, one description: only the first figure is enriched."""
    source_md = "".join(
        [
            "<figure>\nfigure 1 content\n</figure>\n",
            "<figure>\nfigure 2 content\n</figure>\n",
        ]
    )
    lone = _desc(name="Im0", description="Only description.")
    extraction = PictureExtractionResult(descriptions=[lone])

    rendered, inlined = inject_descriptions_inline(source_md, extraction)

    assert inlined == 1
    # Both figures are still there and the single description was used.
    assert rendered.count("<figure>") == 2
    assert "Only description." in rendered
    # Nothing from the second figure's closing tag onwards may contain an
    # embedded-image block.
    first_open = rendered.index("<figure>")
    second_open = rendered.index("<figure>", first_open + 1)
    second_close = rendered.index("</figure>", second_open)
    tail = rendered[second_close:]
    assert "**Embedded image:**" not in tail
|
||||
|
||||
|
||||
def test_merge_inlines_at_figure_boundary():
    """The top-level merge inlines at a figure and adds no leftover section."""
    source_md = "Lead.\n<figure>\nbars\n</figure>\nTrailer.\n"
    chart = _desc(name="Im0", description="Bar chart.")
    extraction = PictureExtractionResult(descriptions=[chart])

    merged = merge_descriptions_into_markdown(source_md, extraction)

    # Inlining worked, so no appended-section heading is emitted.
    assert "## Image Content" not in merged
    assert "Bar chart." in merged
    assert "<figure>" in merged
    assert "</figure>" in merged
|
||||
|
||||
|
||||
def test_inject_figures_then_falls_through_to_docling_marker():
    """Mixed markers: the figure is used first, then the Docling placeholder.

    This is a defensive case. A document normally comes from a single
    parser, but if two parsers' markdown were ever stitched together,
    each description should still find a place.
    """
    source_md = "".join(
        [
            "<figure>\nChart bars: 50, 40, 30\n</figure>\n\n",
            "Later in the doc:\n\n",
            "<!-- image -->\nImage: scan.jpeg\n\n",
            "End.\n",
        ]
    )
    labelled = [("Im0", "Chart description."), ("Im1", "Scan description.")]
    extraction = PictureExtractionResult(
        descriptions=[_desc(name=name, description=text) for name, text in labelled]
    )

    rendered, inlined = inject_descriptions_inline(source_md, extraction)

    assert inlined == 2
    # The figure is kept and gains its description.
    assert "<figure>" in rendered
    assert "Chart bars: 50, 40, 30" in rendered
    assert "Chart description." in rendered
    # The Docling placeholder and its caption are replaced by a block.
    for consumed in ("<!-- image -->", "Image: scan.jpeg"):
        assert consumed not in rendered
    assert "**Embedded image:** `scan.jpeg`" in rendered
    assert "Scan description." in rendered
|
||||
146
surfsense_backend/tests/unit/etl_pipeline/test_vision_llm.py
Normal file
146
surfsense_backend/tests/unit/etl_pipeline/test_vision_llm.py
Normal file
|
|
@ -0,0 +1,146 @@
|
|||
"""Unit tests for the vision_llm parser helpers.
|
||||
|
||||
Two helpers exist:
|
||||
|
||||
- :func:`parse_with_vision_llm` -- single-shot for standalone image
|
||||
uploads (.png/.jpg/etc). Returns combined markdown (description +
|
||||
verbatim OCR mixed) since the image *is* the document.
|
||||
- :func:`parse_image_for_description` -- per-image-in-PDF call. Returns
|
||||
visual description only; OCR is the ETL service's job.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
pytestmark = pytest.mark.unit
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# parse_with_vision_llm: legacy single-shot path
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def test_parse_with_vision_llm_returns_combined_markdown(tmp_path):
    """A standalone image upload returns the model's markdown as-is."""
    from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm

    image_path = tmp_path / "scan.png"
    image_path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 200)

    reply = MagicMock()
    reply.content = "# A scan of something."
    llm_stub = AsyncMock()
    llm_stub.ainvoke.return_value = reply

    markdown = await parse_with_vision_llm(str(image_path), "scan.png", llm_stub)

    assert markdown == "# A scan of something."
    llm_stub.ainvoke.assert_awaited_once()
|
||||
|
||||
|
||||
async def test_parse_with_vision_llm_rejects_empty_response(tmp_path):
    """An empty model reply raises ``ValueError`` instead of returning blanks."""
    from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm

    image_path = tmp_path / "scan.png"
    image_path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 200)

    reply = MagicMock()
    reply.content = ""
    llm_stub = AsyncMock()
    llm_stub.ainvoke.return_value = reply

    with pytest.raises(ValueError, match="empty content"):
        await parse_with_vision_llm(str(image_path), "scan.png", llm_stub)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# parse_image_for_description: per-image-in-PDF, description only
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def test_parse_image_for_description_returns_description(tmp_path):
    """The description-only helper hands back the model's text unchanged."""
    from app.etl_pipeline.parsers.vision_llm import parse_image_for_description

    image_path = tmp_path / "scan.png"
    image_path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 200)

    reply = MagicMock()
    reply.content = "Axial CT showing a large cystic mass."
    llm_stub = AsyncMock()
    llm_stub.ainvoke.return_value = reply

    described = await parse_image_for_description(
        str(image_path), "scan.png", llm_stub
    )

    assert described == "Axial CT showing a large cystic mass."
|
||||
|
||||
|
||||
async def test_parse_image_for_description_uses_description_only_prompt(tmp_path):
    """The prompt tells the model to describe, and NOT to transcribe text.

    That instruction is what allows OCR to be left out of the response:
    page-level OCR in the ETL pipeline already supplies the text, so
    requesting it from the vision LLM too would only add cost.
    """
    from app.etl_pipeline.parsers.vision_llm import parse_image_for_description

    image_path = tmp_path / "scan.png"
    image_path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 200)

    reply = MagicMock()
    reply.content = "A description"
    llm_stub = AsyncMock()
    llm_stub.ainvoke.return_value = reply

    await parse_image_for_description(str(image_path), "scan.png", llm_stub)

    # The first positional argument to ainvoke is the message list; the
    # prompt is the first text part of the first message.
    messages = fake_messages = llm_stub.ainvoke.call_args.args[0]
    first_part = fake_messages[0].content[0]
    prompt = first_part["text"].lower()
    for required in (
        "describe what this image visually depicts",
        "do not transcribe text",
    ):
        assert required in prompt
|
||||
|
||||
|
||||
async def test_parse_image_for_description_rejects_empty(tmp_path):
    """A blank reply raises ``ValueError`` so the caller can skip the image."""
    from app.etl_pipeline.parsers.vision_llm import parse_image_for_description

    image_path = tmp_path / "scan.png"
    image_path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 200)

    reply = MagicMock()
    # Whitespace-only content is treated the same as empty content.
    reply.content = " "
    llm_stub = AsyncMock()
    llm_stub.ainvoke.return_value = reply

    with pytest.raises(ValueError, match="empty content"):
        await parse_image_for_description(str(image_path), "scan.png", llm_stub)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Image size + extension validation (shared by both paths)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_image_to_data_url_rejects_oversized(tmp_path):
    """An image above the 5 MB limit raises before any LLM call happens."""
    from app.etl_pipeline.parsers.vision_llm import _image_to_data_url

    oversized = tmp_path / "huge.png"
    # PNG magic prefix followed by 6 MiB of padding.
    payload = b"\x89PNG" + b"\x00" * (6 * 1024 * 1024)
    oversized.write_bytes(payload)

    with pytest.raises(ValueError, match="Image too large"):
        _image_to_data_url(str(oversized))
|
||||
|
||||
|
||||
def test_image_to_data_url_rejects_unsupported_extension(tmp_path):
    """An unknown file extension raises instead of guessing a MIME type."""
    from app.etl_pipeline.parsers.vision_llm import _image_to_data_url

    unknown = tmp_path / "scan.xyz"
    unknown.write_bytes(b"\x00" * 100)

    with pytest.raises(ValueError, match="Unsupported image extension"):
        _image_to_data_url(str(unknown))
|
||||
Loading…
Add table
Add a link
Reference in a new issue