mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-19 18:45:15 +02:00
chore: evals
This commit is contained in:
parent
2402b730fa
commit
3737118050
122 changed files with 22598 additions and 13 deletions
|
|
@ -134,12 +134,92 @@ class EtlPipelineService:
|
|||
else:
|
||||
raise EtlServiceUnavailableError(f"Unknown ETL_SERVICE: {etl_service}")
|
||||
|
||||
# When the operator opts into vision-LLM at ingest, walk the
|
||||
# original file's embedded images and append a structured
|
||||
# "Image Content" section. The parser's own OCR (Docling
|
||||
# do_ocr=True, Azure DI prebuilt-read, etc.) handles text-in-
|
||||
# image; this side handles the *visual* description which the
|
||||
# parsers all drop today.
|
||||
content = await self._maybe_append_picture_descriptions(request, content)
|
||||
|
||||
return EtlResult(
|
||||
markdown_content=content,
|
||||
etl_service=etl_service,
|
||||
content_type="document",
|
||||
)
|
||||
|
||||
async def _maybe_append_picture_descriptions(
    self, request: EtlRequest, markdown: str
) -> str:
    """Enrich parser markdown with vision-LLM descriptions of embedded images.

    Returns ``markdown`` untouched when no vision LLM is configured, when
    ``describe_pictures`` raises, or when it yields no descriptions.
    Otherwise returns the result of ``merge_descriptions_into_markdown``.
    """
    if self._vision_llm is None:
        return markdown

    from app.etl_pipeline.picture_describer import (
        describe_pictures,
        merge_descriptions_into_markdown,
    )

    async def _ocr_image(image_path: str, image_name: str) -> str:
        # Per-image OCR: push the extracted image back through a fresh
        # pipeline built with vision_llm=None, so this nested call never
        # re-enters the picture-description path.
        try:
            ocr_only_service = EtlPipelineService(vision_llm=None)
            image_request = EtlRequest(file_path=image_path, filename=image_name)
            extracted = await ocr_only_service.extract(image_request)
        except (EtlUnsupportedFileError, EtlServiceUnavailableError) as exc:
            # Expected when the configured backend cannot handle this
            # image; logged at debug level and treated as "no OCR text".
            logging.debug(
                "Skipping per-image OCR for %s: %s", image_name, exc
            )
            return ""
        else:
            return extracted.markdown_content

    try:
        result = await describe_pictures(
            request.file_path,
            request.filename,
            self._vision_llm,
            ocr_runner=_ocr_image,
        )
    except Exception:
        # Descriptions are a bonus on top of the parser output; a failure
        # here must not turn a successful extraction into an error.
        logging.warning(
            "Picture description failed for %s, returning parser output unchanged",
            request.filename,
            exc_info=True,
        )
        return markdown

    if result.descriptions:
        merged = merge_descriptions_into_markdown(markdown, result)
        logging.info(
            "Vision LLM described %d image(s) in %s "
            "(skipped: %d small / %d large / %d duplicate, %d failed)",
            len(result.descriptions),
            request.filename,
            result.skipped_too_small,
            result.skipped_too_large,
            result.skipped_duplicate,
            result.failed,
        )
        return merged

    return markdown
|
||||
|
||||
async def _extract_with_llamacloud(self, request: EtlRequest) -> str:
|
||||
"""Try Azure Document Intelligence first (when configured) then LlamaCloud.
|
||||
|
||||
|
|
|
|||
|
|
@ -4,12 +4,34 @@ import os
|
|||
|
||||
from langchain_core.messages import HumanMessage
|
||||
|
||||
# Single-shot prompt used by standalone image uploads (.png/.jpg/etc).
# A standalone image IS the document, so we want everything: visual
# content plus any text the model can read off it. The output is
# combined markdown that the chunker treats as the full document body.
_PROMPT = (
    "Describe this image in markdown. "
    "Transcribe any visible text verbatim. "
    "Be concise but complete — let the image content guide the level of detail."
)

# Per-image-in-PDF prompt. Here the image is *inside* a larger
# document, and the ETL service (Docling/Azure DI/LlamaCloud/...) is
# already running OCR over the whole page — including text rendered
# into images. So we explicitly tell the model NOT to transcribe text
# and to focus only on visual interpretation. This avoids paying
# output tokens for OCR content the ETL pipeline already captured.
#
# NOTE(review): the examples embedded in this prompt (anatomy, axial CT,
# ECG strip, histology slide, "clinically ... relevant findings") are
# medical-imaging specific -- confirm that bias is intended for
# non-clinical documents before reusing this prompt elsewhere.
_DESCRIPTION_PROMPT = (
    "Describe what this image visually depicts in concise markdown. "
    "Focus on visual content — anatomy, structures, charts, diagrams, "
    "spatial relationships, colors, modality (e.g. axial CT, ECG strip, "
    "histology slide), and any clinically or structurally relevant "
    "findings.\n\n"
    "Do NOT transcribe text from the image. Any text in the image "
    "(axis labels, annotations, scale bars, lab values, etc.) is "
    "already extracted by a separate OCR pipeline; duplicating it "
    "here would be redundant. Stick to the visual interpretation."
)

# Upper bound, in bytes, on an image handed to the vision LLM.
_MAX_IMAGE_BYTES = (
    5 * 1024 * 1024
)  # 5 MB (Anthropic Claude's limit, the most restrictive)
|
||||
|
|
@ -47,11 +69,10 @@ def _image_to_data_url(file_path: str) -> str:
|
|||
return f"data:{mime_type};base64,{encoded}"
|
||||
|
||||
|
||||
async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
|
||||
data_url = _image_to_data_url(file_path)
|
||||
async def _invoke_vision(llm, prompt: str, data_url: str, filename: str) -> str:
|
||||
message = HumanMessage(
|
||||
content=[
|
||||
{"type": "text", "text": _PROMPT},
|
||||
{"type": "text", "text": prompt},
|
||||
{"type": "image_url", "image_url": {"url": data_url}},
|
||||
]
|
||||
)
|
||||
|
|
@ -62,3 +83,36 @@ async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
|
|||
if not text or not text.strip():
|
||||
raise ValueError(f"Vision LLM returned empty content for {filename}")
|
||||
return text.strip()
|
||||
|
||||
|
||||
async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
    """Describe and transcribe a standalone image upload in one LLM call.

    The file at ``file_path`` is converted to a data URL and sent with
    ``_PROMPT``, which asks for a visual description plus a verbatim
    transcription of visible text, because here the image *is* the
    whole document. ``filename`` is forwarded to ``_invoke_vision``.
    """
    return await _invoke_vision(
        llm,
        _PROMPT,
        _image_to_data_url(file_path),
        filename,
    )
|
||||
|
||||
|
||||
async def parse_image_for_description(
    file_path: str, filename: str, llm
) -> str:
    """Return a visual-only markdown description of an embedded image.

    Intended for images that live inside a larger document (called from
    ``picture_describer``). The request uses ``_DESCRIPTION_PROMPT``,
    which tells the model not to transcribe text, since the ETL service
    already OCRs the surrounding page and a second transcription would
    only duplicate it.
    """
    return await _invoke_vision(
        llm,
        _DESCRIPTION_PROMPT,
        _image_to_data_url(file_path),
        filename,
    )
|
||||
|
||||
|
||||
# Public API of this module: the two prompt-specific entry points.
__all__ = [
    "parse_image_for_description",
    "parse_with_vision_llm",
]
|
||||
|
|
|
|||
678
surfsense_backend/app/etl_pipeline/picture_describer.py
Normal file
678
surfsense_backend/app/etl_pipeline/picture_describer.py
Normal file
|
|
@ -0,0 +1,678 @@
|
|||
"""Extract embedded images from PDFs, describe them, and inject the
|
||||
descriptions inline into the parser's markdown.
|
||||
|
||||
When the operator passes ``use_vision_llm=True`` for a PDF, the document
|
||||
parsers (DOCLING / LLAMACLOUD / Azure DI / UNSTRUCTURED) extract text
|
||||
but mostly drop the actual image content -- a CT scan inside a clinical
|
||||
PDF becomes (at best) a ``<!-- image -->`` placeholder in the markdown,
|
||||
and the caption text below it.
|
||||
|
||||
This module fills that gap. After the document parser produces markdown
|
||||
text, we:
|
||||
|
||||
1. Walk the original PDF with :mod:`pypdf`, pulling out each embedded
|
||||
image (deduped by sha256, size-capped to match the vision LLM's own
|
||||
limits).
|
||||
2. Run the vision LLM on each unique image (visual description) and,
|
||||
in parallel when an OCR runner is provided, re-feed the same image
|
||||
through the ETL service for per-image OCR.
|
||||
3. **Inject** a horizontal-rule-delimited markdown section -- with
|
||||
named "OCR text" and "Visual description" sub-sections -- where the
|
||||
image actually appears in the parser's markdown. Two splice modes,
|
||||
chosen by which marker the parser emitted:
|
||||
|
||||
- **Replace** Docling-style ``<!-- image -->`` placeholders (and an
|
||||
optional ``Image: <filename>`` caption line). The placeholder
|
||||
carries no useful content of its own, so we substitute our block
|
||||
for it.
|
||||
- **Append after** layout-aware ``<figure>...</figure>`` blocks
|
||||
(Azure DI ``prebuilt-layout``, LlamaCloud premium). Those blocks
|
||||
already contain parser-extracted chart values / OCR'd labels /
|
||||
captions, which are themselves useful for retrieval -- so we
|
||||
PRESERVE the figure verbatim and add our vision-LLM block
|
||||
immediately after it. The chunk then contains both the parser's
|
||||
structured numbers AND the VLM's semantic interpretation.
|
||||
|
||||
Either way, the image content stays in context with the surrounding
|
||||
document body rather than getting orphaned at the end -- crucial for
|
||||
retrieval, where a single chunk should contain the question, the
|
||||
image content, and the answer options together.
|
||||
|
||||
If no placeholders, figures, or captions can be matched (e.g. an
|
||||
unusual parser output), we fall back to appending an
|
||||
``## Image Content`` section so no image content is silently lost.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import contextlib
|
||||
import hashlib
|
||||
import logging
|
||||
import re
|
||||
import tempfile
|
||||
from collections.abc import Awaitable, Callable
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
# Type alias for the OCR callback. Takes (file_path, filename), returns
# the OCR'd markdown text -- or empty string if no text was found, or
# raises if OCR failed unrecoverably (which the describer catches and
# treats as "no OCR for this image" rather than failing the whole doc).
OcrRunner = Callable[[str, str], Awaitable[str]]

logger = logging.getLogger(__name__)


# Bound how many vision LLM calls we make in parallel for a single
# document. Vision models are typically rate-limited; 4 concurrent
# calls is a safe default that respects most provider limits while
# keeping wall-clock manageable for image-heavy PDFs.
# One semaphore slot covers an image's vision call AND its optional OCR
# call, since both are awaited together while the slot is held.
_VISION_CONCURRENCY = 4

# Match parse_with_vision_llm's per-image cap so we don't even attempt
# images that the vision LLM would reject anyway (Anthropic's 5 MB
# limit is the most restrictive among the major providers).
# NOTE(review): this is compared against the raw extracted byte length;
# if the provider limit applies to the base64-encoded payload, the
# effective raw cap is lower -- confirm.
_MAX_IMAGE_BYTES = 5 * 1024 * 1024

# Skip degenerate images: tracking pixels, very small decorative dots,
# scanner-introduced artefacts. We can't cheaply check pixel dimensions
# without decoding the image, so we approximate: anything under 1 KB is
# almost certainly not informative content.
_MIN_IMAGE_BYTES = 1024
|
||||
|
||||
|
||||
@dataclass
class PictureDescription:
    """One embedded image together with what was learned about it.

    The two content fields come from two different tools on purpose:

    - ``description`` is the vision LLM's reading of what the image
      shows (anatomy, charts, layout, ...), i.e. the semantic content.
    - ``ocr_text`` is text found inside the image by sending it through
      the configured ETL service (Docling/Azure DI/LlamaCloud) as if it
      were a standalone upload. It stays ``None`` when OCR was not
      requested or produced no text.
    """

    # Where the image sits in the source document.
    page_number: int  # 1-indexed
    ordinal_in_page: int  # 0-indexed within the page

    # How the image is identified.
    name: str  # name pypdf assigned (e.g. "Im0")
    sha256: str  # hash of the raw image bytes

    # What was extracted from it.
    description: str  # visual description (markdown)
    ocr_text: str | None = None  # OCR text from the ETL service, if any
|
||||
|
||||
|
||||
@dataclass
class PictureExtractionResult:
    """Everything one extraction pass over a document produced.

    ``descriptions`` holds the successfully described images; the four
    counters record how many images were left out and why.
    """

    descriptions: list[PictureDescription] = field(default_factory=list)
    skipped_too_small: int = 0
    skipped_too_large: int = 0
    skipped_duplicate: int = 0
    failed: int = 0

    @property
    def has_content(self) -> bool:
        """True when at least one image was described."""
        return len(self.descriptions) > 0
|
||||
|
||||
|
||||
def _is_pdf(filename: str) -> bool:
    """Return True when ``filename`` ends in ``.pdf``, ignoring case."""
    return filename.lower()[-4:] == ".pdf"
|
||||
|
||||
|
||||
def _pick_suffix(name: str) -> str:
    """Choose a temp-file suffix for an image from its name.

    A recognised image extension is returned lower-cased, with ``.jpg``
    normalised to ``.jpeg``. Anything else falls back to ``.png``.
    """
    known_extensions = (
        ".jpg",
        ".jpeg",
        ".png",
        ".gif",
        ".bmp",
        ".tiff",
        ".tif",
        ".webp",
    )
    lowered = name.lower()
    matched = next(
        (ext for ext in known_extensions if lowered.endswith(ext)),
        ".png",
    )
    if matched == ".jpg":
        return ".jpeg"
    return matched
|
||||
|
||||
|
||||
def _extract_pdf_images(file_path: str) -> list[tuple[int, int, str, bytes]]:
    """Collect the embedded images of a PDF, page by page.

    Each entry is ``(page_number_1_indexed, ordinal_in_page, name, bytes)``.
    Extraction is best-effort: a PDF that cannot be opened yields an
    empty list, and a page or image that cannot be read is logged and
    skipped so the remaining images are still returned.
    """
    from pypdf import PdfReader

    extracted: list[tuple[int, int, str, bytes]] = []

    try:
        reader = PdfReader(file_path)
    except Exception:
        logger.warning(
            "pypdf failed to open %s for image extraction",
            file_path,
            exc_info=True,
        )
        return extracted

    for page_number, page in enumerate(reader.pages, start=1):
        try:
            page_images = list(page.images)
        except Exception:
            logger.warning(
                "pypdf failed to enumerate images on page %d of %s",
                page_number,
                file_path,
                exc_info=True,
            )
            continue

        for ordinal, image in enumerate(page_images):
            try:
                # Fall back to a synthetic name when pypdf gives none.
                image_name = (
                    getattr(image, "name", None)
                    or f"page{page_number}_img{ordinal}"
                )
                payload = image.data
            except Exception:
                logger.warning(
                    "pypdf failed to read image %d on page %d of %s",
                    ordinal,
                    page_number,
                    file_path,
                    exc_info=True,
                )
                continue
            extracted.append((page_number, ordinal, image_name, payload))

    return extracted
|
||||
|
||||
|
||||
async def _describe_one(
    page_number: int,
    ordinal: int,
    name: str,
    sha256: str,
    data: bytes,
    vision_llm: Any,
    semaphore: asyncio.Semaphore,
    ocr_runner: OcrRunner | None,
) -> PictureDescription | None:
    """Describe a single image (and optionally OCR it) via a temp file.

    The image bytes are written to a temporary file that both the
    vision-LLM helper and the OCR runner open by path. The two calls
    run concurrently while one ``semaphore`` slot is held.

    Returns:
        A ``PictureDescription``, or ``None`` when the vision LLM call
        failed. An OCR failure only leaves ``ocr_text`` as ``None``.
    """
    from app.etl_pipeline.parsers.vision_llm import parse_image_for_description

    # The temp file is created INSIDE the try so the finally below also
    # removes it when writing the bytes fails (e.g. disk full).
    # delete=False is required because the helpers reopen it by path.
    tmp_path: str | None = None
    try:
        with tempfile.NamedTemporaryFile(
            suffix=_pick_suffix(name), delete=False
        ) as tmp:
            tmp_path = tmp.name
            tmp.write(data)

        async with semaphore:
            tasks: list[Awaitable[Any]] = [
                parse_image_for_description(tmp_path, name, vision_llm),
            ]
            if ocr_runner is not None:
                tasks.append(ocr_runner(tmp_path, name))

            # return_exceptions=True so a failure in one branch (most
            # often OCR) doesn't poison the other.
            results = await asyncio.gather(*tasks, return_exceptions=True)

        description_result = results[0]
        if isinstance(description_result, BaseException):
            logger.warning(
                "Vision LLM failed for image %s on page %d, skipping",
                name,
                page_number,
                exc_info=description_result,
            )
            return None
        description = str(description_result)

        ocr_text: str | None = None
        if ocr_runner is not None and len(results) > 1:
            ocr_result = results[1]
            if isinstance(ocr_result, BaseException):
                logger.warning(
                    "Per-image OCR failed for image %s on page %d, "
                    "omitting OCR field for this image",
                    name,
                    page_number,
                    exc_info=ocr_result,
                )
            else:
                # Whitespace-only OCR output means "no text found";
                # store None so no empty OCR section is rendered.
                ocr_text = str(ocr_result).strip() or None
    finally:
        if tmp_path is not None:
            with contextlib.suppress(OSError):
                Path(tmp_path).unlink()

    return PictureDescription(
        page_number=page_number,
        ordinal_in_page=ordinal,
        name=name,
        sha256=sha256,
        description=description,
        ocr_text=ocr_text,
    )
|
||||
|
||||
|
||||
async def describe_pictures(
    file_path: str,
    filename: str,
    vision_llm: Any,
    *,
    ocr_runner: OcrRunner | None = None,
) -> PictureExtractionResult:
    """Extract embedded images from a document and describe each via vision LLM.

    Images larger than ``_MAX_IMAGE_BYTES``, smaller than
    ``_MIN_IMAGE_BYTES``, or byte-identical (by sha256) to one already
    seen are skipped and counted. The rest are described concurrently,
    bounded by ``_VISION_CONCURRENCY``.

    When ``ocr_runner`` is provided, each image is also passed to it and
    the returned text is stored in :attr:`PictureDescription.ocr_text`.

    Currently PDF-only. For a non-PDF filename, or when ``vision_llm``
    is ``None``, an empty result is returned.

    Returns:
        A ``PictureExtractionResult`` with the descriptions plus the
        skipped and failed counters. One image failing never discards
        the descriptions of the others.
    """
    result = PictureExtractionResult()
    if not _is_pdf(filename) or vision_llm is None:
        return result

    raw_images = _extract_pdf_images(file_path)
    if not raw_images:
        return result

    seen_hashes: set[str] = set()
    eligible: list[tuple[int, int, str, str, bytes]] = []
    for page_number, ordinal, name, data in raw_images:
        if len(data) > _MAX_IMAGE_BYTES:
            result.skipped_too_large += 1
            continue
        if len(data) < _MIN_IMAGE_BYTES:
            result.skipped_too_small += 1
            continue
        sha = hashlib.sha256(data).hexdigest()
        if sha in seen_hashes:
            result.skipped_duplicate += 1
            continue
        seen_hashes.add(sha)
        eligible.append((page_number, ordinal, name, sha, data))

    if not eligible:
        return result

    semaphore = asyncio.Semaphore(_VISION_CONCURRENCY)
    # return_exceptions=True: an unexpected error for one image (e.g. the
    # temp file cannot be written) must not throw away the descriptions
    # of the others or leave sibling tasks running un-awaited.
    outcomes = await asyncio.gather(
        *(
            _describe_one(p, o, n, sha, d, vision_llm, semaphore, ocr_runner)
            for (p, o, n, sha, d) in eligible
        ),
        return_exceptions=True,
    )

    for (page_number, _ordinal, name, _sha, _data), outcome in zip(
        eligible, outcomes
    ):
        if isinstance(outcome, Exception):
            logger.warning(
                "Unexpected error describing image %s on page %d, skipping",
                name,
                page_number,
                exc_info=outcome,
            )
            result.failed += 1
        elif isinstance(outcome, BaseException):
            # Cancellation / interpreter shutdown must still propagate.
            raise outcome
        elif outcome is None:
            result.failed += 1
        else:
            result.descriptions.append(outcome)
    return result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Rendering: build the per-image markdown block + inject inline.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
# Two trailing spaces are Markdown's hard line break. Kept as a named
# constant so the significant whitespace cannot be silently collapsed.
_MD_HARD_BREAK = "  "


def _format_image_block(
    name: str,
    description: str,
    ocr_text: str | None = None,
) -> str:
    """Render the per-image block as a horizontal-rule-delimited section.

    Why no blockquote / no raw HTML / no XML?
    -----------------------------------------
    Each was tried and failed in the document viewer:

    - **Raw HTML / XML** (``<image>...</image>``): unknown elements
      have no render rules in Streamdown or PlateJS, so the content
      survives in the markdown source but is invisible to humans.
    - **Blockquote with nested blocks**: any *block* element inside a
      ``>``-prefixed blockquote gets evicted by Streamdown / remark,
      and vision LLMs routinely produce bulleted descriptions.

    A rule-delimited section contains only standard top-level markdown,
    so the description's own markdown renders natively.

    Layout (OCR section omitted when ``ocr_text`` is None/blank)::

        ---

        **Embedded image:** `MM-130-a.jpeg`

        **OCR text:**<hard break>
        Slice 24 / 60<hard break>
        L

        **Visual description:**

        - Axial contrast-enhanced CT showing a large cystic mass...

        ---

    The label line and every OCR line except the last end with two
    trailing spaces, so multi-line OCR renders line by line without a
    fenced code block. Blank OCR lines are dropped.

    The ``**Embedded image:** `<filename>``` prefix is regex-able
    (``^\\*\\*Embedded image:\\*\\* `(.+?)`$``).

    Returns:
        The block with leading and trailing blank-line padding, so the
        rules never merge with adjacent paragraphs after splicing.
    """
    parts: list[str] = [f"**Embedded image:** `{name}`"]

    if ocr_text and ocr_text.strip():
        # Non-empty after strip(), so at least one line survives.
        ocr_clean_lines = [
            ln.rstrip() for ln in ocr_text.strip().splitlines() if ln.strip()
        ]
        parts.append("")
        parts.append("**OCR text:**" + _MD_HARD_BREAK)
        parts.extend(ln + _MD_HARD_BREAK for ln in ocr_clean_lines[:-1])
        parts.append(ocr_clean_lines[-1])

    parts.append("")
    parts.append("**Visual description:**")
    parts.append("")
    parts.append(description.strip())

    body = "\n".join(parts)
    return "\n\n---\n\n" + body + "\n\n---\n\n"


# Patterns we'll try to splice into. Each pattern captures the
# original-PDF filename when one is available (group 1).
#
# Replace-style markers (the matched span is substituted with our block
# because it carries no useful content of its own):
#
#   1. Docling's image placeholder followed by an "Image: <filename>"
#      caption line. This is what our medxpertqa renderer produces:
#      reportlab places the JPEG, then a caption, and Docling outputs
#      the placeholder + caption.
#   2. Docling's image placeholder alone (filename unknown -- we fall
#      back to pypdf's name).
#   3. A bare "Image: <filename>" caption line with no preceding
#      placeholder. Rare in practice, but covers parsers that drop the
#      placeholder entirely.
_PLACEHOLDER_WITH_CAPTION = re.compile(
    r"<!--\s*image\s*-->\s*\n\s*Image:\s*(\S+)\s*(?:\n|$)",
    re.IGNORECASE,
)
_PLACEHOLDER_ONLY = re.compile(
    r"<!--\s*image\s*-->",
    re.IGNORECASE,
)
_CAPTION_ONLY = re.compile(
    r"^[ \t]*Image:\s*(\S+)\s*$",
    re.IGNORECASE | re.MULTILINE,
)

# Append-after marker (the matched span is preserved verbatim and our
# block is inserted immediately after it):
#
#   4. ``<figure>...</figure>`` as emitted by layout-aware parsers (Azure
#      Document Intelligence ``prebuilt-layout``, LlamaCloud premium).
#      The figure's own contents are themselves specialist OCR output,
#      so we keep them and add our vision-LLM block alongside. ``[^>]*``
#      in the open tag tolerates optional attributes like
#      ``<figure id="...">``; ``re.DOTALL`` lets ``.`` cross the
#      newlines inside the block.
_FIGURE_BLOCK = re.compile(
    r"<figure\b[^>]*>.*?</figure>",
    re.DOTALL | re.IGNORECASE,
)

# Replace-style patterns in priority order. When two of them match at
# the same offset, the earlier entry wins.
_REPLACE_PATTERNS = (_PLACEHOLDER_WITH_CAPTION, _PLACEHOLDER_ONLY, _CAPTION_ONLY)


def _marker_name(match: re.Match[str], desc: PictureDescription) -> str:
    """Prefer the filename captured by the marker, else pypdf's name."""
    captured = match.group(1) if match.re.groups else None
    return captured or desc.name


def _next_marker(markdown: str, start: int) -> re.Match[str] | None:
    """Return the earliest replace-style marker at or after ``start``."""
    found = [
        m
        for m in (p.search(markdown, start) for p in _REPLACE_PATTERNS)
        if m is not None
    ]
    # min() keeps the first of equal keys, which preserves the priority
    # order of _REPLACE_PATTERNS for markers starting at the same offset.
    return min(found, key=lambda m: m.start(), default=None)


def _plan_figure_splices(
    markdown: str,
    descriptions: list[PictureDescription],
    desc_idx: int,
) -> list[tuple[int, int, str]]:
    """Pair figures with descriptions in document order as insertions."""
    return [
        (
            match.end(),
            match.end(),
            _format_image_block(desc.name, desc.description, desc.ocr_text),
        )
        for match, desc in zip(
            _FIGURE_BLOCK.finditer(markdown), descriptions[desc_idx:]
        )
    ]


def _apply_splices(markdown: str, splices: list[tuple[int, int, str]]) -> str:
    """Apply ``(start, end, replacement)`` edits in one left-to-right pass.

    All offsets refer to the unmodified ``markdown``. An insertion
    (``start == end``) sorts before a replacement starting at the same
    offset, and overlapping spans are clamped so text is never duplicated.
    """
    pieces: list[str] = []
    cursor = 0
    for start, end, replacement in sorted(splices, key=lambda s: (s[0], s[1])):
        start = max(start, cursor)
        pieces.append(markdown[cursor:start])
        pieces.append(replacement)
        cursor = max(cursor, end)
    pieces.append(markdown[cursor:])
    return "".join(pieces)


def _replace_one_match(
    markdown: str,
    pattern: re.Pattern[str],
    descriptions: list[PictureDescription],
    desc_idx: int,
) -> tuple[str, int]:
    """Replace the first occurrence of ``pattern`` with the next image block.

    Returns the new markdown and the new ``desc_idx`` (advanced if a
    replacement happened, unchanged otherwise).

    Kept as a single-marker helper. ``inject_descriptions_inline`` does
    not call it repeatedly, because a second search over the mutated
    text can hit markers inside a block that was just inserted.
    """
    if desc_idx >= len(descriptions):
        return markdown, desc_idx

    match = pattern.search(markdown)
    if not match:
        return markdown, desc_idx

    desc = descriptions[desc_idx]
    block = _format_image_block(
        _marker_name(match, desc), desc.description, desc.ocr_text
    )
    new_markdown = markdown[: match.start()] + block + markdown[match.end():]
    return new_markdown, desc_idx + 1


def _splice_after_figures(
    markdown: str,
    descriptions: list[PictureDescription],
    desc_idx: int,
) -> tuple[str, int]:
    """Append vision-LLM blocks immediately after each ``<figure>...</figure>``.

    The figure block is kept verbatim, since it carries the parser's own
    OCR of the figure, and our block is added right after it.

    Descriptions are matched to figures in document order, starting at
    ``desc_idx``, until either side runs out.

    Returns:
        ``(new_markdown, advanced_desc_idx)``.
    """
    if desc_idx >= len(descriptions):
        return markdown, desc_idx

    splices = _plan_figure_splices(markdown, descriptions, desc_idx)
    if not splices:
        return markdown, desc_idx
    return _apply_splices(markdown, splices), desc_idx + len(splices)


def inject_descriptions_inline(
    markdown: str,
    result: PictureExtractionResult,
) -> tuple[str, int]:
    """Splice per-image markdown blocks into the document at image positions.

    Descriptions are consumed in order by two strategies:

    1. **Append-after** for ``<figure>...</figure>`` blocks. The figure
       is preserved and our block is added right after it. Figures are
       consumed first.
    2. **Replace** for Docling-style markers, scanned left to right:
       ``<!-- image -->`` followed by an ``Image: <filename>`` caption,
       a placeholder alone, or a bare caption line. At the same offset
       the more specific pattern wins.

    Every splice position is computed against the ORIGINAL ``markdown``
    and applied in one pass. So a marker-looking string inside an
    inserted block (for example OCR text that itself contains
    ``<!-- image -->``) is never mistaken for a splice point, and mixed
    marker styles are filled in document order.

    Returns:
        ``(new_markdown, n_inlined)``. The caller decides what to do
        with leftover descriptions (typically: append them at the end).
    """
    descriptions = result.descriptions
    if not descriptions:
        return markdown, 0

    # Step 1: layout-aware figures, paired in document order.
    splices = _plan_figure_splices(markdown, descriptions, 0)
    desc_idx = len(splices)

    # Step 2: Docling-style markers. The cursor only moves forward over
    # the pristine text, so each marker is used at most once.
    cursor = 0
    while desc_idx < len(descriptions):
        match = _next_marker(markdown, cursor)
        if match is None:
            # No more positions to splice into.
            break
        desc = descriptions[desc_idx]
        block = _format_image_block(
            _marker_name(match, desc), desc.description, desc.ocr_text
        )
        splices.append((match.start(), match.end(), block))
        cursor = match.end()
        desc_idx += 1

    return _apply_splices(markdown, splices), desc_idx
|
||||
|
||||
|
||||
def render_appended_section(
    descriptions: list[PictureDescription],
    *,
    skip_notes: PictureExtractionResult | None = None,
    heading: str = "## Image Content (vision-LLM extracted)",
) -> str:
    """Build the fallback section for descriptions that were not inlined.

    The section is ``heading`` followed by one image block per
    description. When ``skip_notes`` is given and any of its counters is
    non-zero, a closing italic note summarises them. Returns ``""`` when
    there are no descriptions and no ``skip_notes``.
    """
    if not descriptions and not skip_notes:
        return ""

    lines: list[str] = ["", heading, ""]
    for item in descriptions:
        lines += [
            _format_image_block(item.name, item.description, item.ocr_text),
            "",
        ]

    if skip_notes is not None:
        counters = (
            (skip_notes.skipped_too_large, "too large (> 5 MB)"),
            (skip_notes.skipped_too_small, "too small (< 1 KB)"),
            (skip_notes.skipped_duplicate, "duplicate"),
            (skip_notes.failed, "failed"),
        )
        # Zero counters contribute nothing to the note.
        notes = [f"{count} {label}" for count, label in counters if count]
        if notes:
            lines.append(f"_Note: {', '.join(notes)} image(s) skipped._")

    return "\n".join(lines)
|
||||
|
||||
|
||||
def merge_descriptions_into_markdown(
    markdown: str,
    result: PictureExtractionResult,
) -> str:
    """Combine parser markdown with the extracted picture descriptions.

    Entry point used by the ETL pipeline. Descriptions are first spliced
    inline via ``inject_descriptions_inline``; whatever could not be
    placed is rendered with ``render_appended_section`` and attached at
    the end, so a successfully described image is never silently lost.

    Args:
        markdown: Markdown produced by the document parser.
        result: Extraction result; its ``descriptions`` are merged in.

    Returns:
        ``markdown`` unchanged when there are no descriptions, the
        inlined markdown when everything was placed, otherwise the
        inlined markdown followed by an appended section and a single
        trailing newline.
    """
    if not result.descriptions:
        return markdown

    spliced, inlined_count = inject_descriptions_inline(markdown, result)
    remaining = result.descriptions[inlined_count:]
    if not remaining:
        return spliced

    # The heading tells the reader whether nothing could be inlined
    # (the parser produced no markers) or only part of the set could
    # (more images than markers).
    if inlined_count == 0:
        section_heading = "## Image Content (vision-LLM extracted)"
    else:
        section_heading = "## Image Content (additional, no inline marker found)"

    appended = render_appended_section(remaining, heading=section_heading)
    if not appended:
        return spliced

    # Normalise the seam: exactly one blank line between the body and
    # the appended section, and one newline at the very end.
    body = spliced.rstrip()
    tail = appended.lstrip()
    return f"{body}\n\n{tail}\n"
|
||||
|
||||
|
||||
# Names exported by ``from <this module> import *``: the two result
# types, the describe entry point, and the three markdown-merging helpers.
__all__ = [
    "PictureDescription",
    "PictureExtractionResult",
    "describe_pictures",
    "inject_descriptions_inline",
    "merge_descriptions_into_markdown",
    "render_appended_section",
]
|
||||
|
|
@ -77,10 +77,16 @@ class DoclingService:
|
|||
# Create pipeline options with version-safe attribute checking
|
||||
pipeline_options = PdfPipelineOptions()
|
||||
|
||||
# Disable OCR (user request)
|
||||
# Enable OCR so text-in-image (chart axes, ECG annotations,
|
||||
# lab tables embedded as images, scanned pages, etc.) is
|
||||
# lifted into the main markdown stream. This pairs with the
|
||||
# vision-LLM picture-description pass downstream — OCR
|
||||
# captures literal text; vision LLM captures the visual
|
||||
# content. Together they give a faithful representation of
|
||||
# PDFs that mix text and images.
|
||||
if hasattr(pipeline_options, "do_ocr"):
|
||||
pipeline_options.do_ocr = False
|
||||
logger.info("⚠️ OCR disabled by user request")
|
||||
pipeline_options.do_ocr = True
|
||||
logger.info("✅ OCR enabled for embedded text-in-image extraction")
|
||||
else:
|
||||
logger.warning("⚠️ OCR attribute not available in this Docling version")
|
||||
|
||||
|
|
|
|||
|
|
@ -123,10 +123,6 @@ async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | No
|
|||
"""Extract content from a non-document file (plaintext/direct_convert/audio/image) via the unified ETL pipeline."""
|
||||
from app.etl_pipeline.etl_document import EtlRequest
|
||||
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
|
||||
from app.etl_pipeline.file_classifier import (
|
||||
FileCategory,
|
||||
classify_file as etl_classify,
|
||||
)
|
||||
|
||||
await _notify(ctx, "parsing", "Processing file")
|
||||
await ctx.task_logger.log_task_progress(
|
||||
|
|
@ -135,8 +131,12 @@ async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | No
|
|||
{"processing_stage": "extracting"},
|
||||
)
|
||||
|
||||
# Fetch the vision LLM whenever the operator opts in. The ETL
|
||||
# pipeline decides what to do with it: image files run through the
|
||||
# vision LLM directly; document files (PDFs) get per-image
|
||||
# descriptions appended via picture_describer.
|
||||
vision_llm = None
|
||||
if ctx.use_vision_llm and etl_classify(ctx.filename) == FileCategory.IMAGE:
|
||||
if ctx.use_vision_llm:
|
||||
from app.services.llm_service import get_vision_llm
|
||||
|
||||
vision_llm = await get_vision_llm(ctx.session, ctx.search_space_id)
|
||||
|
|
@ -230,7 +230,16 @@ async def _process_document_upload(ctx: _ProcessingContext) -> Document | None:
|
|||
|
||||
await _notify(ctx, "parsing", "Extracting content")
|
||||
|
||||
etl_result = await EtlPipelineService().extract(
|
||||
# Document files (PDF, docx, etc.) get vision LLM treatment too:
|
||||
# the ETL pipeline appends a per-image description section when
|
||||
# vision_llm is provided. See picture_describer.describe_pictures.
|
||||
vision_llm = None
|
||||
if ctx.use_vision_llm:
|
||||
from app.services.llm_service import get_vision_llm
|
||||
|
||||
vision_llm = await get_vision_llm(ctx.session, ctx.search_space_id)
|
||||
|
||||
etl_result = await EtlPipelineService(vision_llm=vision_llm).extract(
|
||||
EtlRequest(
|
||||
file_path=ctx.file_path,
|
||||
filename=ctx.filename,
|
||||
|
|
@ -418,8 +427,12 @@ async def _extract_file_content(
|
|||
billable_pages = estimated_pages * mode.page_multiplier
|
||||
await page_limit_service.check_page_limit(user_id, billable_pages)
|
||||
|
||||
# Vision LLM is provided to the ETL pipeline for any file category
|
||||
# when the operator opts in. Image files run through it directly;
|
||||
# document files (PDFs) get per-image descriptions appended via
|
||||
# picture_describer.
|
||||
vision_llm = None
|
||||
if use_vision_llm and category == FileCategory.IMAGE:
|
||||
if use_vision_llm:
|
||||
from app.services.llm_service import get_vision_llm
|
||||
|
||||
vision_llm = await get_vision_llm(session, search_space_id)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue