refactor: remove allowed_formats from DocumentConverter initialization in DoclingService to allow acceptance of all supported formats

This commit is contained in:
Anish Sarkar 2026-04-06 19:31:42 +05:30
parent caca491774
commit 47f4be08d9
2 changed files with 13 additions and 15 deletions

View file

@@ -111,9 +111,7 @@ class DoclingService:
     pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
 )
-# Initialize DocumentConverter with PDF and IMAGE support
 self.converter = DocumentConverter(
-    allowed_formats=[InputFormat.PDF, InputFormat.IMAGE],
     format_options={InputFormat.PDF: pdf_format_option},
 )

View file

@@ -1,4 +1,5 @@
-"""Test that DoclingService registers InputFormat.IMAGE for image processing."""
+"""Test that DoclingService does NOT restrict allowed_formats, letting Docling
+accept all its supported formats (PDF, DOCX, PPTX, XLSX, IMAGE, etc.)."""
 from enum import Enum
 from unittest.mock import MagicMock, patch
@@ -11,11 +12,14 @@ pytestmark = pytest.mark.unit
 class _FakeInputFormat(Enum):
     PDF = "pdf"
     IMAGE = "image"
+    DOCX = "docx"
+    PPTX = "pptx"
+    XLSX = "xlsx"
-def test_docling_service_registers_image_format():
-    """DoclingService should initialise DocumentConverter with InputFormat.IMAGE
-    in allowed_formats so that image files (jpg, png, bmp, tiff) are accepted."""
+def test_docling_service_does_not_restrict_allowed_formats():
+    """DoclingService should NOT pass allowed_formats to DocumentConverter,
+    so Docling defaults to accepting every InputFormat it supports."""
     mock_converter_cls = MagicMock()
     mock_backend = MagicMock()
@@ -54,14 +58,10 @@ def test_docling_service_registers_image_format():
     assert call_kwargs is not None, "DocumentConverter was never called"
     _, kwargs = call_kwargs
-    allowed = kwargs.get("allowed_formats")
-    format_opts = kwargs.get("format_options", {})
-    image_registered = (
-        (allowed is not None and _FakeInputFormat.IMAGE in allowed)
-        or _FakeInputFormat.IMAGE in format_opts
-    )
-    assert image_registered, (
-        f"InputFormat.IMAGE not registered. "
-        f"allowed_formats={allowed}, format_options keys={list(format_opts.keys())}"
-    )
+    assert "allowed_formats" not in kwargs, (
+        f"allowed_formats should not be passed — let Docling accept all formats. "
+        f"Got: {kwargs.get('allowed_formats')}"
+    )
+    assert _FakeInputFormat.PDF in kwargs.get("format_options", {}), (
+        "format_options should still configure PDF pipeline options"
+    )