refactor: remove allowed_formats from DocumentConverter initialization in DoclingService so that Docling accepts all of its supported formats

This commit is contained in:
Anish Sarkar 2026-04-06 19:31:42 +05:30
parent caca491774
commit 47f4be08d9
2 changed files with 13 additions and 15 deletions

View file

@ -111,9 +111,7 @@ class DoclingService:
pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
)
# Initialize DocumentConverter with PDF and IMAGE support
self.converter = DocumentConverter(
allowed_formats=[InputFormat.PDF, InputFormat.IMAGE],
format_options={InputFormat.PDF: pdf_format_option},
)

View file

@ -1,4 +1,5 @@
"""Test that DoclingService registers InputFormat.IMAGE for image processing."""
"""Test that DoclingService does NOT restrict allowed_formats, letting Docling
accept all its supported formats (PDF, DOCX, PPTX, XLSX, IMAGE, etc.)."""
from enum import Enum
from unittest.mock import MagicMock, patch
@ -11,11 +12,14 @@ pytestmark = pytest.mark.unit
class _FakeInputFormat(Enum):
PDF = "pdf"
IMAGE = "image"
DOCX = "docx"
PPTX = "pptx"
XLSX = "xlsx"
def test_docling_service_registers_image_format():
"""DoclingService should initialise DocumentConverter with InputFormat.IMAGE
in allowed_formats so that image files (jpg, png, bmp, tiff) are accepted."""
def test_docling_service_does_not_restrict_allowed_formats():
"""DoclingService should NOT pass allowed_formats to DocumentConverter,
so Docling defaults to accepting every InputFormat it supports."""
mock_converter_cls = MagicMock()
mock_backend = MagicMock()
@ -54,14 +58,10 @@ def test_docling_service_registers_image_format():
assert call_kwargs is not None, "DocumentConverter was never called"
_, kwargs = call_kwargs
allowed = kwargs.get("allowed_formats")
format_opts = kwargs.get("format_options", {})
image_registered = (
(allowed is not None and _FakeInputFormat.IMAGE in allowed)
or _FakeInputFormat.IMAGE in format_opts
assert "allowed_formats" not in kwargs, (
f"allowed_formats should not be passed — let Docling accept all formats. "
f"Got: {kwargs.get('allowed_formats')}"
)
assert image_registered, (
f"InputFormat.IMAGE not registered. "
f"allowed_formats={allowed}, format_options keys={list(format_opts.keys())}"
assert _FakeInputFormat.PDF in kwargs.get("format_options", {}), (
"format_options should still configure PDF pipeline options"
)