Route uploaded images to vision LLM with document-parser fallback

This commit is contained in:
CREDO23 2026-04-09 14:33:33 +02:00
parent 78fa2d926a
commit 7e90a8ed3c
7 changed files with 199 additions and 5 deletions

View file

@ -15,6 +15,9 @@ from app.etl_pipeline.parsers.plaintext import read_plaintext
class EtlPipelineService:
"""Single pipeline for extracting markdown from files. All callers use this."""
def __init__(self, *, vision_llm=None):
self._vision_llm = vision_llm
async def extract(self, request: EtlRequest) -> EtlResult:
category = classify_file(request.filename)
@ -47,6 +50,28 @@ class EtlPipelineService:
content_type="audio",
)
if category == FileCategory.IMAGE:
return await self._extract_image(request)
return await self._extract_document(request)
async def _extract_image(self, request: EtlRequest) -> EtlResult:
    """Extract markdown from an image, preferring the vision LLM.

    When a vision LLM was supplied to the service, the image is described
    by it and the result is tagged ``VISION_LLM`` / ``image``. If no
    vision LLM is configured, or the vision call fails, the request is
    handed to the regular document parser instead.

    Args:
        request: The extraction request (``file_path`` and ``filename``
            are read here).

    Returns:
        The vision-LLM result, or whatever ``_extract_document`` returns
        on fallback.
    """
    if not self._vision_llm:
        logging.info(
            "No vision LLM provided, falling back to document parser for %s",
            request.filename,
        )
        return await self._extract_document(request)

    from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm

    try:
        content = await parse_with_vision_llm(
            request.file_path, request.filename, self._vision_llm
        )
    except Exception:
        # A provider error or an empty answer should not fail the whole
        # upload while a document parser is available. Task cancellation
        # derives from BaseException, so it is not swallowed here.
        logging.warning(
            "Vision LLM failed for %s, falling back to document parser",
            request.filename,
            exc_info=True,
        )
        return await self._extract_document(request)

    return EtlResult(
        markdown_content=content,
        etl_service="VISION_LLM",
        content_type="image",
    )
async def _extract_document(self, request: EtlRequest) -> EtlResult:

View file

@ -3,6 +3,7 @@ from pathlib import PurePosixPath
from app.utils.file_extensions import (
DOCUMENT_EXTENSIONS,
IMAGE_EXTENSIONS,
get_document_extensions_for_service,
)
@ -105,6 +106,7 @@ class FileCategory(Enum):
PLAINTEXT = "plaintext"
AUDIO = "audio"
DIRECT_CONVERT = "direct_convert"
IMAGE = "image"
UNSUPPORTED = "unsupported"
DOCUMENT = "document"
@ -117,6 +119,8 @@ def classify_file(filename: str) -> FileCategory:
return FileCategory.AUDIO
if suffix in DIRECT_CONVERT_EXTENSIONS:
return FileCategory.DIRECT_CONVERT
if suffix in IMAGE_EXTENSIONS:
return FileCategory.IMAGE
if suffix in DOCUMENT_EXTENSIONS:
return FileCategory.DOCUMENT
return FileCategory.UNSUPPORTED
@ -126,12 +130,14 @@ def should_skip_for_service(filename: str, etl_service: str | None) -> bool:
"""Return True if *filename* cannot be processed by *etl_service*.
Plaintext, audio, and direct-convert files are parser-agnostic and never
skipped. Document files are checked against the per-parser extension set.
skipped. Image and document files are checked against the per-parser
extension set (images fall back to the document parser when no vision LLM
is available, so the same service constraint applies).
"""
category = classify_file(filename)
if category == FileCategory.UNSUPPORTED:
return True
if category == FileCategory.DOCUMENT:
if category in (FileCategory.DOCUMENT, FileCategory.IMAGE):
suffix = PurePosixPath(filename).suffix.lower()
return suffix not in get_document_extensions_for_service(etl_service)
return False

View file

@ -0,0 +1,37 @@
import base64
import mimetypes
from langchain_core.messages import HumanMessage
# Instruction text sent to the vision LLM together with the image; it asks for
# a markdown description (verbatim text, visual structures, key subjects).
_PROMPT = (
"Analyze this image thoroughly and produce a detailed markdown description.\n\n"
"Include:\n"
"- All visible text, transcribed verbatim\n"
"- Description of diagrams, charts, tables, or visual structures\n"
"- Key subjects, objects, or scenes depicted\n\n"
"Output only the markdown content, no preamble."
)
# Image suffixes whose MIME type the stdlib ``mimetypes`` table may not know on
# every Python version / platform. Without these, such files would be labelled
# with the generic ``image/png`` fallback below.
_IMAGE_MIME_OVERRIDES = {
    ".webp": "image/webp",
    ".heic": "image/heic",
    ".heif": "image/heif",
}


def _image_to_data_url(file_path: str) -> str:
    """Return the file at *file_path* encoded as a base64 ``data:`` URL.

    The MIME type is guessed from the path. If the guess is missing or is
    not an ``image/*`` type, a small table of known image suffixes is
    consulted, and ``image/png`` is used as the last resort.

    Args:
        file_path: Path of the image file to read.

    Returns:
        A string of the form ``data:<mime>;base64,<payload>``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    mime_type, _ = mimetypes.guess_type(file_path)
    if not mime_type or not mime_type.startswith("image/"):
        lowered = file_path.lower()
        mime_type = next(
            (
                mime
                for ext, mime in _IMAGE_MIME_OVERRIDES.items()
                if lowered.endswith(ext)
            ),
            "image/png",
        )
    with open(file_path, "rb") as f:
        encoded = base64.b64encode(f.read()).decode("ascii")
    return f"data:{mime_type};base64,{encoded}"
def _response_text(response) -> str:
    """Flatten an LLM response into plain text.

    ``content`` may be a plain string or a list of content blocks (bare
    strings and ``{"type": "text", "text": ...}`` dicts); non-text blocks
    are ignored. Objects without a ``content`` attribute are stringified.
    """
    content = response.content if hasattr(response, "content") else str(response)
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        parts = []
        for block in content:
            if isinstance(block, str):
                parts.append(block)
            elif isinstance(block, dict) and block.get("type") == "text":
                parts.append(str(block.get("text") or ""))
        return "".join(parts)
    return str(content)


async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
    """Ask a vision LLM to describe an image and return the markdown text.

    The image is embedded as a base64 data URL next to the fixed prompt in
    a single human message, which is sent through ``llm.ainvoke``.

    Args:
        file_path: Path of the image file to read.
        filename: Name used only in the error message.
        llm: Object exposing an awaitable ``ainvoke(messages)``.

    Returns:
        The response text with surrounding whitespace stripped.

    Raises:
        ValueError: If the response contains no text.
    """
    data_url = _image_to_data_url(file_path)
    message = HumanMessage(
        content=[
            {"type": "text", "text": _PROMPT},
            {"type": "image_url", "image_url": {"url": data_url}},
        ]
    )
    response = await llm.ainvoke([message])
    # Content can be a list of blocks rather than a str; normalise before
    # calling str methods on it.
    text = _response_text(response)
    if not text or not text.strip():
        raise ValueError(f"Vision LLM returned empty content for {filename}")
    return text.strip()

View file

@ -333,6 +333,7 @@ async def process_file_in_background(
async def _extract_file_content(
file_path: str,
filename: str,
search_space_id: int,
session: AsyncSession,
user_id: str,
task_logger: TaskLoggingService,
@ -360,6 +361,7 @@ async def _extract_file_content(
FileCategory.PLAINTEXT: "Reading file",
FileCategory.DIRECT_CONVERT: "Converting file",
FileCategory.AUDIO: "Transcribing audio",
FileCategory.IMAGE: "Analyzing image",
FileCategory.UNSUPPORTED: "Unsupported file type",
FileCategory.DOCUMENT: "Extracting content",
}
@ -383,7 +385,13 @@ async def _extract_file_content(
estimated_pages = _estimate_pages_safe(page_limit_service, file_path)
await page_limit_service.check_page_limit(user_id, estimated_pages)
result = await EtlPipelineService().extract(
vision_llm = None
if category == FileCategory.IMAGE:
from app.services.llm_service import get_vision_llm
vision_llm = await get_vision_llm(session, search_space_id)
result = await EtlPipelineService(vision_llm=vision_llm).extract(
EtlRequest(
file_path=file_path,
filename=filename,
@ -439,6 +447,7 @@ async def process_file_in_background_with_document(
markdown_content, etl_service = await _extract_file_content(
file_path,
filename,
search_space_id,
session,
user_id,
task_logger,

View file

@ -7,10 +7,33 @@ Extensions already covered by PLAINTEXT_EXTENSIONS, AUDIO_EXTENSIONS, or
DIRECT_CONVERT_EXTENSIONS in file_classifier are NOT repeated here -- these
sets are exclusively for the "document" ETL path (Docling / LlamaParse /
Unstructured).
Image extensions intentionally remain in the per-parser sets for fallback
compatibility. IMAGE_EXTENSIONS is used only for routing classification.
"""
from pathlib import PurePosixPath
# ---------------------------------------------------------------------------
# Image extensions (used by file_classifier for routing to vision LLM)
# ---------------------------------------------------------------------------
# Lower-case suffixes (with leading dot) that classify a file as an image.
IMAGE_EXTENSIONS: frozenset[str] = frozenset(
    (
        # Common raster formats
        ".png",
        ".jpg",
        ".jpeg",
        ".gif",
        ".bmp",
        ".webp",
        # TIFF, both spellings
        ".tiff",
        ".tif",
        # Vector
        ".svg",
        # HEIF family
        ".heic",
        ".heif",
    )
)
# ---------------------------------------------------------------------------
# Per-parser document extension sets (from official documentation)
# ---------------------------------------------------------------------------

View file

@ -549,8 +549,11 @@ def test_unsupported_extensions_classified_correctly(filename):
("doc.docx", "document"),
("slides.pptx", "document"),
("sheet.xlsx", "document"),
("photo.png", "document"),
("photo.jpg", "document"),
("photo.png", "image"),
("photo.jpg", "image"),
("photo.webp", "image"),
("photo.gif", "image"),
("photo.heic", "image"),
("book.epub", "document"),
("letter.odt", "document"),
("readme.md", "plaintext"),
@ -680,3 +683,57 @@ async def test_extract_eml_with_docling_raises_unsupported(tmp_path, mocker):
await EtlPipelineService().extract(
EtlRequest(file_path=str(eml_file), filename="mail.eml")
)
# ---------------------------------------------------------------------------
# Image extraction via vision LLM
# ---------------------------------------------------------------------------
async def test_extract_image_with_vision_llm(tmp_path):
    """When a vision LLM is supplied, its answer becomes the image's markdown."""
    from unittest.mock import AsyncMock, MagicMock

    # Minimal PNG-looking payload: signature followed by padding.
    image_path = tmp_path / "photo.png"
    image_path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 50)

    llm_reply = MagicMock()
    llm_reply.content = "# A photo of a sunset over the ocean"
    vision_llm = AsyncMock()
    vision_llm.ainvoke.return_value = llm_reply

    request = EtlRequest(file_path=str(image_path), filename="photo.png")
    pipeline = EtlPipelineService(vision_llm=vision_llm)
    result = await pipeline.extract(request)

    assert result.markdown_content == "# A photo of a sunset over the ocean"
    assert result.etl_service == "VISION_LLM"
    assert result.content_type == "image"
    vision_llm.ainvoke.assert_called_once()
async def test_extract_image_falls_back_to_document_without_vision_llm(
    tmp_path, mocker
):
    """An image goes through the document parser when no vision LLM is given."""
    # Select Docling as the ETL service and stub out the real Docling service.
    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
    docling_stub = mocker.AsyncMock()
    docling_stub.process_document.return_value = {
        "content": "# OCR text from image"
    }
    mocker.patch(
        "app.services.docling_service.create_docling_service",
        return_value=docling_stub,
    )

    image_path = tmp_path / "scan.png"
    image_path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 50)

    request = EtlRequest(file_path=str(image_path), filename="scan.png")
    pipeline = EtlPipelineService()
    result = await pipeline.extract(request)

    assert result.markdown_content == "# OCR text from image"
    assert result.etl_service == "DOCLING"
    assert result.content_type == "document"

View file

@ -154,3 +154,40 @@ def test_get_extensions_for_none_returns_union():
)
assert get_document_extensions_for_service(None) == DOCUMENT_EXTENSIONS
# ---------------------------------------------------------------------------
# IMAGE_EXTENSIONS
# ---------------------------------------------------------------------------
@pytest.mark.parametrize(
    "ext",
    (
        ".png",
        ".jpg",
        ".jpeg",
        ".gif",
        ".bmp",
        ".tiff",
        ".tif",
        ".webp",
        ".svg",
        ".heic",
        ".heif",
    ),
)
def test_image_extensions_contains_expected(ext):
    """Each listed suffix is a member of IMAGE_EXTENSIONS."""
    from app.utils import file_extensions

    assert ext in file_extensions.IMAGE_EXTENSIONS
def test_image_extensions_are_subset_of_document_extensions():
    """Every routing image suffix must also appear in DOCUMENT_EXTENSIONS.

    The document parser is the fallback for images, so a suffix missing
    from the document sets would have no fallback path.
    """
    from app.utils.file_extensions import DOCUMENT_EXTENSIONS, IMAGE_EXTENSIONS

    missing = IMAGE_EXTENSIONS.difference(DOCUMENT_EXTENSIONS)
    failure_message = (
        f"Image extensions missing from document sets (breaks fallback): {missing}"
    )
    assert not missing, failure_message