mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-02 12:22:40 +02:00
refactor: remove allowed_formats from DocumentConverter initialization in DoclingService to allow acceptance of all supported formats
This commit is contained in:
parent
caca491774
commit
47f4be08d9
2 changed files with 13 additions and 15 deletions
|
|
@@ -111,9 +111,7 @@ class DoclingService:
|
||||||
pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
|
pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
|
||||||
)
|
)
|
||||||
|
|
||||||
# Initialize DocumentConverter with PDF and IMAGE support
|
|
||||||
self.converter = DocumentConverter(
|
self.converter = DocumentConverter(
|
||||||
allowed_formats=[InputFormat.PDF, InputFormat.IMAGE],
|
|
||||||
format_options={InputFormat.PDF: pdf_format_option},
|
format_options={InputFormat.PDF: pdf_format_option},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@@ -1,4 +1,5 @@
|
||||||
"""Test that DoclingService registers InputFormat.IMAGE for image processing."""
|
"""Test that DoclingService does NOT restrict allowed_formats, letting Docling
|
||||||
|
accept all its supported formats (PDF, DOCX, PPTX, XLSX, IMAGE, etc.)."""
|
||||||
|
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from unittest.mock import MagicMock, patch
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
@@ -11,11 +12,14 @@ pytestmark = pytest.mark.unit
|
||||||
class _FakeInputFormat(Enum):
|
class _FakeInputFormat(Enum):
|
||||||
PDF = "pdf"
|
PDF = "pdf"
|
||||||
IMAGE = "image"
|
IMAGE = "image"
|
||||||
|
DOCX = "docx"
|
||||||
|
PPTX = "pptx"
|
||||||
|
XLSX = "xlsx"
|
||||||
|
|
||||||
|
|
||||||
def test_docling_service_registers_image_format():
|
def test_docling_service_does_not_restrict_allowed_formats():
|
||||||
"""DoclingService should initialise DocumentConverter with InputFormat.IMAGE
|
"""DoclingService should NOT pass allowed_formats to DocumentConverter,
|
||||||
in allowed_formats so that image files (jpg, png, bmp, tiff) are accepted."""
|
so Docling defaults to accepting every InputFormat it supports."""
|
||||||
|
|
||||||
mock_converter_cls = MagicMock()
|
mock_converter_cls = MagicMock()
|
||||||
mock_backend = MagicMock()
|
mock_backend = MagicMock()
|
||||||
|
|
@@ -54,14 +58,10 @@ def test_docling_service_registers_image_format():
|
||||||
assert call_kwargs is not None, "DocumentConverter was never called"
|
assert call_kwargs is not None, "DocumentConverter was never called"
|
||||||
|
|
||||||
_, kwargs = call_kwargs
|
_, kwargs = call_kwargs
|
||||||
allowed = kwargs.get("allowed_formats")
|
assert "allowed_formats" not in kwargs, (
|
||||||
format_opts = kwargs.get("format_options", {})
|
f"allowed_formats should not be passed — let Docling accept all formats. "
|
||||||
|
f"Got: {kwargs.get('allowed_formats')}"
|
||||||
image_registered = (
|
|
||||||
(allowed is not None and _FakeInputFormat.IMAGE in allowed)
|
|
||||||
or _FakeInputFormat.IMAGE in format_opts
|
|
||||||
)
|
)
|
||||||
assert image_registered, (
|
assert _FakeInputFormat.PDF in kwargs.get("format_options", {}), (
|
||||||
f"InputFormat.IMAGE not registered. "
|
"format_options should still configure PDF pipeline options"
|
||||||
f"allowed_formats={allowed}, format_options keys={list(format_opts.keys())}"
|
|
||||||
)
|
)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue