Route uploaded images to vision LLM with document-parser fallback

This commit is contained in:
CREDO23 2026-04-09 14:33:33 +02:00
parent 78fa2d926a
commit 7e90a8ed3c
7 changed files with 199 additions and 5 deletions

View file

@ -15,6 +15,9 @@ from app.etl_pipeline.parsers.plaintext import read_plaintext
class EtlPipelineService:
"""Single pipeline for extracting markdown from files. All callers use this."""
def __init__(self, *, vision_llm=None):
self._vision_llm = vision_llm
async def extract(self, request: EtlRequest) -> EtlResult:
category = classify_file(request.filename)
@ -47,6 +50,28 @@ class EtlPipelineService:
content_type="audio",
)
if category == FileCategory.IMAGE:
return await self._extract_image(request)
return await self._extract_document(request)
async def _extract_image(self, request: EtlRequest) -> EtlResult:
    """Extract markdown from an image request.

    When the service was constructed with a vision LLM, the image is passed
    to ``parse_with_vision_llm`` and the reply is returned as a
    ``VISION_LLM`` result with content type ``image``. Otherwise the request
    is handed to ``_extract_document`` unchanged.
    """
    vision_llm = self._vision_llm
    if not vision_llm:
        logging.info(
            "No vision LLM provided, falling back to document parser for %s",
            request.filename,
        )
        return await self._extract_document(request)

    # Imported lazily: only needed on the vision path.
    from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm

    markdown = await parse_with_vision_llm(
        request.file_path, request.filename, vision_llm
    )
    return EtlResult(
        markdown_content=markdown,
        etl_service="VISION_LLM",
        content_type="image",
    )
async def _extract_document(self, request: EtlRequest) -> EtlResult:

View file

@ -3,6 +3,7 @@ from pathlib import PurePosixPath
from app.utils.file_extensions import (
DOCUMENT_EXTENSIONS,
IMAGE_EXTENSIONS,
get_document_extensions_for_service,
)
@ -105,6 +106,7 @@ class FileCategory(Enum):
PLAINTEXT = "plaintext"
AUDIO = "audio"
DIRECT_CONVERT = "direct_convert"
IMAGE = "image"
UNSUPPORTED = "unsupported"
DOCUMENT = "document"
@ -117,6 +119,8 @@ def classify_file(filename: str) -> FileCategory:
return FileCategory.AUDIO
if suffix in DIRECT_CONVERT_EXTENSIONS:
return FileCategory.DIRECT_CONVERT
if suffix in IMAGE_EXTENSIONS:
return FileCategory.IMAGE
if suffix in DOCUMENT_EXTENSIONS:
return FileCategory.DOCUMENT
return FileCategory.UNSUPPORTED
@ -126,12 +130,14 @@ def should_skip_for_service(filename: str, etl_service: str | None) -> bool:
"""Return True if *filename* cannot be processed by *etl_service*.
Plaintext, audio, and direct-convert files are parser-agnostic and never
skipped. Document files are checked against the per-parser extension set.
skipped. Image and document files are checked against the per-parser
extension set (images fall back to the document parser when no vision LLM
is available, so the same service constraint applies).
"""
category = classify_file(filename)
if category == FileCategory.UNSUPPORTED:
return True
if category == FileCategory.DOCUMENT:
if category in (FileCategory.DOCUMENT, FileCategory.IMAGE):
suffix = PurePosixPath(filename).suffix.lower()
return suffix not in get_document_extensions_for_service(etl_service)
return False

View file

@ -0,0 +1,37 @@
import base64
import mimetypes
from langchain_core.messages import HumanMessage
# Instruction text sent to the vision model together with the image. It asks
# for a markdown description (verbatim text, visual structures, key subjects)
# and for the output to contain only that markdown.
_PROMPT = (
    "Analyze this image thoroughly and produce a detailed markdown description.\n\n"
    "Include:\n"
    "- All visible text, transcribed verbatim\n"
    "- Description of diagrams, charts, tables, or visual structures\n"
    "- Key subjects, objects, or scenes depicted\n\n"
    "Output only the markdown content, no preamble."
)
# MIME types for image suffixes that ``mimetypes`` may not recognise, depending
# on the Python version and the host's MIME database.
_FALLBACK_IMAGE_MIME_TYPES = {
    ".webp": "image/webp",
    ".heic": "image/heic",
    ".heif": "image/heif",
}


def _image_to_data_url(file_path: str) -> str:
    """Return the file at *file_path* encoded as a base64 ``data:`` URL.

    The MIME type is taken from ``mimetypes.guess_type``. When that does not
    yield an ``image/*`` type, the suffix is looked up in
    ``_FALLBACK_IMAGE_MIME_TYPES``; ``image/png`` is used only as a last
    resort, so known formats are not mislabelled as PNG.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    import os

    mime_type, _ = mimetypes.guess_type(file_path)
    if not mime_type or not mime_type.startswith("image/"):
        suffix = os.path.splitext(file_path)[1].lower()
        mime_type = _FALLBACK_IMAGE_MIME_TYPES.get(suffix, "image/png")
    with open(file_path, "rb") as f:
        encoded = base64.b64encode(f.read()).decode("ascii")
    return f"data:{mime_type};base64,{encoded}"
async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
    """Describe an image as markdown using a vision-capable chat model.

    Args:
        file_path: Path of the image file to send to the model.
        filename: Name used in the error message when the reply is empty.
        llm: Object exposing an async ``ainvoke`` that accepts a list of
            messages.

    Returns:
        The model's reply text with surrounding whitespace stripped.

    Raises:
        ValueError: If the model returns no text.
    """
    import asyncio

    # Reading and base64-encoding the file is blocking work; run it in a
    # worker thread so the event loop stays responsive.
    data_url = await asyncio.to_thread(_image_to_data_url, file_path)
    message = HumanMessage(
        content=[
            {"type": "text", "text": _PROMPT},
            {"type": "image_url", "image_url": {"url": data_url}},
        ]
    )
    response = await llm.ainvoke([message])
    content = response.content if hasattr(response, "content") else str(response)

    if isinstance(content, list):
        # Message content may be a list of blocks (plain strings or dicts
        # carrying a "text" entry) rather than a single string; calling
        # ``.strip()`` on the list would raise AttributeError.
        parts = []
        for part in content:
            if isinstance(part, str):
                parts.append(part)
            elif isinstance(part, dict) and isinstance(part.get("text"), str):
                parts.append(part["text"])
        text = "".join(parts)
    elif isinstance(content, str):
        text = content
    else:
        # Falsy values (e.g. None) count as empty; anything else is stringified.
        text = str(content) if content else ""

    if not text or not text.strip():
        raise ValueError(f"Vision LLM returned empty content for {filename}")
    return text.strip()

View file

@ -333,6 +333,7 @@ async def process_file_in_background(
async def _extract_file_content(
file_path: str,
filename: str,
search_space_id: int,
session: AsyncSession,
user_id: str,
task_logger: TaskLoggingService,
@ -360,6 +361,7 @@ async def _extract_file_content(
FileCategory.PLAINTEXT: "Reading file",
FileCategory.DIRECT_CONVERT: "Converting file",
FileCategory.AUDIO: "Transcribing audio",
FileCategory.IMAGE: "Analyzing image",
FileCategory.UNSUPPORTED: "Unsupported file type",
FileCategory.DOCUMENT: "Extracting content",
}
@ -383,7 +385,13 @@ async def _extract_file_content(
estimated_pages = _estimate_pages_safe(page_limit_service, file_path)
await page_limit_service.check_page_limit(user_id, estimated_pages)
result = await EtlPipelineService().extract(
vision_llm = None
if category == FileCategory.IMAGE:
from app.services.llm_service import get_vision_llm
vision_llm = await get_vision_llm(session, search_space_id)
result = await EtlPipelineService(vision_llm=vision_llm).extract(
EtlRequest(
file_path=file_path,
filename=filename,
@ -439,6 +447,7 @@ async def process_file_in_background_with_document(
markdown_content, etl_service = await _extract_file_content(
file_path,
filename,
search_space_id,
session,
user_id,
task_logger,

View file

@ -7,10 +7,33 @@ Extensions already covered by PLAINTEXT_EXTENSIONS, AUDIO_EXTENSIONS, or
DIRECT_CONVERT_EXTENSIONS in file_classifier are NOT repeated here -- these
sets are exclusively for the "document" ETL path (Docling / LlamaParse /
Unstructured).
Image extensions intentionally remain in the per-parser sets for fallback
compatibility. IMAGE_EXTENSIONS is used only for routing classification.
"""
from pathlib import PurePosixPath
# ---------------------------------------------------------------------------
# Image extensions (used by file_classifier for routing to vision LLM)
# ---------------------------------------------------------------------------
# Immutable set of image file suffixes. Every entry is lowercase and includes
# the leading dot.
IMAGE_EXTENSIONS: frozenset[str] = frozenset(
    {
        ".png",
        ".jpg",
        ".jpeg",
        ".gif",
        ".bmp",
        ".tiff",
        ".tif",
        ".webp",
        ".svg",
        ".heic",
        ".heif",
    }
)
# ---------------------------------------------------------------------------
# Per-parser document extension sets (from official documentation)
# ---------------------------------------------------------------------------