Add universal document decoder with multi-format support (#705)

Add a universal document decoder with multi-format support, built on the 'unstructured' library. The new universal decoder service handles DOCX, XLSX, PPTX, HTML, Markdown, CSV, RTF, ODT, EPUB and more through a single service. Tables are preserved as HTML markup for better downstream extraction. Images are stored in the librarian but excluded from the text pipeline. Section grouping strategies (whole-document, heading, element-type, count, size) are configurable for non-page formats; page-based formats (PDF, PPTX, XLSX) are automatically grouped by page. All four decoders (PDF, Mistral OCR, Tesseract OCR, universal) now share the "document-decoder" ident so they are interchangeable. The PDF-only decoders fetch document metadata to check the MIME type and gracefully skip unsupported formats. Librarian changes: removed the MIME type whitelist validation so any document format can be ingested; simplified routing so text/plain goes to text-load and everything else goes to document-load; removed the dual inline/streaming data paths — documents always use document_id for content retrieval. New provenance entity types (tg:Section, tg:Image) and metadata predicates (tg:elementTypes, tg:tableCount, tg:imageCount) provide richer explainability. The universal decoder ships in its own package (trustgraph-unstructured) and container image (trustgraph-unstructured).
2026-06-10 15:25:14 +02:00 · 2026-03-23 12:56:35 +00:00 · 2026-03-23 12:56:35 +00:00 · 5c6fe90fe2
commit 5c6fe90fe2
parent 4609424afe
25 changed files with 2247 additions and 79 deletions
--- a/tests/unit/test_decoding/test_mistral_ocr_processor.py
+++ b/tests/unit/test_decoding/test_mistral_ocr_processor.py
@ -240,7 +240,7 @@ class TestMistralOcrProcessor(IsolatedAsyncioTestCase):

        mock_launch.assert_called_once()
        args = mock_launch.call_args[0]
-        assert args[0] == "pdf-decoder"
+        assert args[0] == "document-decoder"
        assert "Mistral OCR decoder" in args[1]


--- a/tests/unit/test_decoding/test_pdf_decoder.py
+++ b/tests/unit/test_decoding/test_pdf_decoder.py
@ -187,7 +187,7 @@ class TestPdfDecoderProcessor(IsolatedAsyncioTestCase):
        """Test run function"""
        from trustgraph.decoding.pdf.pdf_decoder import run
        run()
-        mock_launch.assert_called_once_with("pdf-decoder",
+        mock_launch.assert_called_once_with("document-decoder",
            "\nSimple decoder, accepts PDF documents on input, outputs pages from the\nPDF document as text as separate output objects.\n\nSupports both inline document data and fetching from librarian via Pulsar\nfor large documents.\n")


--- a/tests/unit/test_decoding/test_universal_processor.py
+++ b/tests/unit/test_decoding/test_universal_processor.py
@ -0,0 +1,412 @@
+"""
+Unit tests for trustgraph.decoding.universal.processor
+"""
+
+import pytest
+import base64
+from unittest.mock import AsyncMock, MagicMock, patch
+from unittest import IsolatedAsyncioTestCase
+
+from trustgraph.decoding.universal.processor import (
+    Processor, assemble_section_text, MIME_EXTENSIONS, PAGE_BASED_FORMATS,
+)
+from trustgraph.schema import Document, TextDocument, Metadata, Triples
+
+
+class MockAsyncProcessor:
+    """Lightweight replacement for AsyncProcessor, patched in by the tests."""
+
+    def __init__(self, **params):
+        # Identity and task group are taken from the keyword arguments,
+        # falling back to a fixed id and a fresh mock respectively.
+        self.id = params.get('id', 'test-service')
+        self.taskgroup = params.get('taskgroup', MagicMock())
+        # The remaining attributes start out empty or mocked.
+        self.pubsub = MagicMock()
+        self.config_handlers = []
+        self.specifications = []
+
+
+def make_element(category="NarrativeText", text="Some text",
+                 page_number=None, text_as_html=None, image_base64=None):
+    """Build a MagicMock standing in for an unstructured element."""
+    # Prepare the metadata mock first, then hang it off the element.
+    metadata = MagicMock()
+    metadata.page_number = page_number
+    metadata.text_as_html = text_as_html
+    metadata.image_base64 = image_base64
+
+    element = MagicMock()
+    element.category = category
+    element.text = text
+    element.metadata = metadata
+    return element
+
+
+class TestAssembleSectionText:
+    """Tests for assemble_section_text: text, element types, table and image counts."""
+
+    def test_narrative_text(self):
+        # Two narrative paragraphs come back separated by a blank line.
+        paragraphs = [
+            make_element("NarrativeText", body)
+            for body in ("Paragraph one.", "Paragraph two.")
+        ]
+        text, types, tables, images = assemble_section_text(paragraphs)
+        assert text == "Paragraph one.\n\nParagraph two."
+        assert "NarrativeText" in types
+        assert (tables, images) == (0, 0)
+
+    def test_table_with_html(self):
+        # A table that has an HTML rendering contributes its markup.
+        table_html = "<table><tr><td>Col1</td><td>Col2</td></tr></table>"
+        intro = make_element("NarrativeText", "Before table.")
+        table = make_element("Table", "Col1 Col2", text_as_html=table_html)
+        text, types, tables, images = assemble_section_text([intro, table])
+        assert "<table>" in text
+        assert "Before table." in text
+        assert tables == 1
+        assert "Table" in types
+
+    def test_table_without_html_fallback(self):
+        # Without an HTML rendering the table's plain text is used.
+        table = make_element("Table", "plain table text")
+        table.metadata.text_as_html = None
+        text, types, tables, images = assemble_section_text([table])
+        assert text == "plain table text"
+        assert tables == 1
+
+    def test_images_skipped(self):
+        # Image text is left out of the output, but the image is counted.
+        prose = make_element("NarrativeText", "Text content")
+        picture = make_element("Image", "OCR text from image")
+        text, types, tables, images = assemble_section_text([prose, picture])
+        assert "OCR text" not in text
+        assert "Text content" in text
+        assert images == 1
+        assert "Image" in types
+
+    def test_empty_elements(self):
+        # No elements: empty text, no types, zero counts.
+        text, types, tables, images = assemble_section_text([])
+        assert text == ""
+        assert len(types) == 0
+        assert (tables, images) == (0, 0)
+
+    def test_mixed_elements(self):
+        # A section mixing a title, narrative text, a table, an image
+        # and a list item.
+        heading = make_element("Title", "Section Heading")
+        body = make_element("NarrativeText", "Body text.")
+        table = make_element(
+            "Table", "data",
+            text_as_html="<table><tr><td>data</td></tr></table>",
+        )
+        picture = make_element("Image", "img text")
+        bullet = make_element("ListItem", "- item one")
+
+        text, types, tables, images = assemble_section_text(
+            [heading, body, table, picture, bullet]
+        )
+
+        for expected in (
+            "Section Heading", "Body text.", "<table>", "- item one",
+        ):
+            assert expected in text
+        assert "img text" not in text
+        assert (tables, images) == (1, 1)
+        assert types == {"Title", "NarrativeText", "Table", "Image", "ListItem"}
+
+
+class TestMimeExtensions:
+    """Spot checks on the MIME_EXTENSIONS mapping."""
+
+    def test_pdf_extension(self):
+        extension = MIME_EXTENSIONS["application/pdf"]
+        assert extension == ".pdf"
+
+    def test_docx_extension(self):
+        docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+        extension = MIME_EXTENSIONS[docx_mime]
+        assert extension == ".docx"
+
+    def test_html_extension(self):
+        extension = MIME_EXTENSIONS["text/html"]
+        assert extension == ".html"
+
+
+class TestPageBasedFormats:
+    """Membership checks against PAGE_BASED_FORMATS."""
+
+    def test_pdf_is_page_based(self):
+        pdf_mime = "application/pdf"
+        assert pdf_mime in PAGE_BASED_FORMATS
+
+    def test_html_is_not_page_based(self):
+        html_mime = "text/html"
+        assert html_mime not in PAGE_BASED_FORMATS
+
+    def test_pptx_is_page_based(self):
+        pptx_mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
+        assert pptx_mime in PAGE_BASED_FORMATS
+
+
+class TestUniversalProcessor(IsolatedAsyncioTestCase):
+    """Test universal decoder processor.
+
+    Covers construction defaults and overrides, page grouping, the
+    on_message flow for non-page and page-based input, image handling,
+    command-line argument registration and the run() entry point.
+
+    AsyncProcessor is patched with an explicit replacement object
+    (MockAsyncProcessor), so that patch injects no argument; the other
+    @patch decorators pass their mocks to the test bottom-up.
+    """
+
+    @patch('trustgraph.decoding.universal.processor.Consumer')
+    @patch('trustgraph.decoding.universal.processor.Producer')
+    @patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
+    async def test_processor_initialization(
+        self, mock_producer, mock_consumer
+    ):
+        """Test processor initialization with defaults."""
+        config = {
+            'id': 'test-universal',
+            'taskgroup': AsyncMock(),
+        }
+
+        processor = Processor(**config)
+
+        # Defaults expected when no strategy options are supplied
+        assert processor.partition_strategy == "auto"
+        assert processor.section_strategy_name == "whole-document"
+        assert processor.section_element_count == 20
+        assert processor.section_max_size == 4000
+
+        # Check the input consumer spec, identified by having a 'handler'
+        # attribute; producer specs are not asserted here
+        consumer_specs = [
+            s for s in processor.specifications if hasattr(s, 'handler')
+        ]
+        assert len(consumer_specs) >= 1
+        assert consumer_specs[0].name == "input"
+        assert consumer_specs[0].schema == Document
+
+    @patch('trustgraph.decoding.universal.processor.Consumer')
+    @patch('trustgraph.decoding.universal.processor.Producer')
+    @patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
+    async def test_processor_custom_strategy(
+        self, mock_producer, mock_consumer
+    ):
+        """Test processor initialization with custom section strategy."""
+        # 'strategy' is read back as partition_strategy and
+        # 'section_strategy' as section_strategy_name
+        config = {
+            'id': 'test-universal',
+            'taskgroup': AsyncMock(),
+            'section_strategy': 'heading',
+            'strategy': 'hi_res',
+        }
+
+        processor = Processor(**config)
+
+        assert processor.partition_strategy == "hi_res"
+        assert processor.section_strategy_name == "heading"
+
+    @patch('trustgraph.decoding.universal.processor.Consumer')
+    @patch('trustgraph.decoding.universal.processor.Producer')
+    @patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
+    async def test_group_by_page(self, mock_producer, mock_consumer):
+        """Test page grouping of elements."""
+        config = {
+            'id': 'test-universal',
+            'taskgroup': AsyncMock(),
+        }
+
+        processor = Processor(**config)
+
+        elements = [
+            make_element("NarrativeText", "Page 1 text", page_number=1),
+            make_element("NarrativeText", "More page 1", page_number=1),
+            make_element("NarrativeText", "Page 2 text", page_number=2),
+        ]
+
+        result = processor.group_by_page(elements)
+
+        # Result is indexed as a sequence of (page_number, elements) pairs
+        assert len(result) == 2
+        assert result[0][0] == 1  # page number
+        assert len(result[0][1]) == 2  # 2 elements on page 1
+        assert result[1][0] == 2
+        assert len(result[1][1]) == 1
+
+    @patch('trustgraph.decoding.universal.processor.Consumer')
+    @patch('trustgraph.decoding.universal.processor.Producer')
+    @patch('trustgraph.decoding.universal.processor.partition')
+    @patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
+    async def test_on_message_inline_non_page(
+        self, mock_partition, mock_producer, mock_consumer
+    ):
+        """Test processing an inline non-page document."""
+        config = {
+            'id': 'test-universal',
+            'taskgroup': AsyncMock(),
+        }
+
+        processor = Processor(**config)
+
+        # Mock partition to return elements without page numbers
+        mock_partition.return_value = [
+            make_element("Title", "Document Title"),
+            make_element("NarrativeText", "Body text content."),
+        ]
+
+        # Mock message with inline data
+        content = b"# Document Title\nBody text content."
+        mock_metadata = Metadata(id="test-doc", user="testuser",
+                                 collection="default")
+        mock_document = Document(
+            metadata=mock_metadata,
+            data=base64.b64encode(content).decode('utf-8'),
+        )
+        mock_msg = MagicMock()
+        mock_msg.value.return_value = mock_document
+
+        # Mock flow: calling flow(name) returns the matching sender mock
+        # (None for any other name)
+        mock_output_flow = AsyncMock()
+        mock_triples_flow = AsyncMock()
+        mock_flow = MagicMock(side_effect=lambda name: {
+            "output": mock_output_flow,
+            "triples": mock_triples_flow,
+        }.get(name))
+
+        # Mock save_child_document and magic
+        processor.save_child_document = AsyncMock(return_value="mock-id")
+
+        with patch('trustgraph.decoding.universal.processor.magic') as mock_magic:
+            mock_magic.from_buffer.return_value = "text/markdown"
+            await processor.on_message(mock_msg, None, mock_flow)
+
+        # Should emit one section (whole-document strategy)
+        assert mock_output_flow.send.call_count == 1
+        assert mock_triples_flow.send.call_count == 1
+
+        # Check output
+        call_args = mock_output_flow.send.call_args[0][0]
+        assert isinstance(call_args, TextDocument)
+        assert call_args.document_id.startswith("urn:section:")
+        # The emitted message carries no inline text.
+        # NOTE(review): presumably content is retrieved via document_id
+        # downstream - confirm against the processor.
+        assert call_args.text == b""
+
+    @patch('trustgraph.decoding.universal.processor.Consumer')
+    @patch('trustgraph.decoding.universal.processor.Producer')
+    @patch('trustgraph.decoding.universal.processor.partition')
+    @patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
+    async def test_on_message_page_based(
+        self, mock_partition, mock_producer, mock_consumer
+    ):
+        """Test processing a page-based document."""
+        config = {
+            'id': 'test-universal',
+            'taskgroup': AsyncMock(),
+        }
+
+        processor = Processor(**config)
+
+        # Mock partition to return elements with page numbers
+        mock_partition.return_value = [
+            make_element("NarrativeText", "Page 1 content", page_number=1),
+            make_element("NarrativeText", "Page 2 content", page_number=2),
+        ]
+
+        # Mock message
+        content = b"fake pdf"
+        mock_metadata = Metadata(id="test-doc", user="testuser",
+                                 collection="default")
+        mock_document = Document(
+            metadata=mock_metadata,
+            data=base64.b64encode(content).decode('utf-8'),
+        )
+        mock_msg = MagicMock()
+        mock_msg.value.return_value = mock_document
+
+        # Mock flow: calling flow(name) returns the matching sender mock
+        mock_output_flow = AsyncMock()
+        mock_triples_flow = AsyncMock()
+        mock_flow = MagicMock(side_effect=lambda name: {
+            "output": mock_output_flow,
+            "triples": mock_triples_flow,
+        }.get(name))
+
+        processor.save_child_document = AsyncMock(return_value="mock-id")
+
+        # The detected MIME type is forced to PDF regardless of the payload
+        with patch('trustgraph.decoding.universal.processor.magic') as mock_magic:
+            mock_magic.from_buffer.return_value = "application/pdf"
+            await processor.on_message(mock_msg, None, mock_flow)
+
+        # Should emit two pages
+        assert mock_output_flow.send.call_count == 2
+
+        # Check first output uses page URI
+        call_args = mock_output_flow.send.call_args_list[0][0][0]
+        assert call_args.document_id.startswith("urn:page:")
+
+    @patch('trustgraph.decoding.universal.processor.Consumer')
+    @patch('trustgraph.decoding.universal.processor.Producer')
+    @patch('trustgraph.decoding.universal.processor.partition')
+    @patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
+    async def test_images_stored_not_emitted(
+        self, mock_partition, mock_producer, mock_consumer
+    ):
+        """Test that images are stored but not sent to text pipeline."""
+        config = {
+            'id': 'test-universal',
+            'taskgroup': AsyncMock(),
+        }
+
+        processor = Processor(**config)
+
+        # One text element and one image element on the same page; the
+        # image payload is the base64 encoding of b"imagedata"
+        mock_partition.return_value = [
+            make_element("NarrativeText", "Some text", page_number=1),
+            make_element("Image", "img ocr", page_number=1,
+                         image_base64="aW1hZ2VkYXRh"),
+        ]
+
+        content = b"fake pdf"
+        mock_metadata = Metadata(id="test-doc", user="testuser",
+                                 collection="default")
+        mock_document = Document(
+            metadata=mock_metadata,
+            data=base64.b64encode(content).decode('utf-8'),
+        )
+        mock_msg = MagicMock()
+        mock_msg.value.return_value = mock_document
+
+        # Mock flow: calling flow(name) returns the matching sender mock
+        mock_output_flow = AsyncMock()
+        mock_triples_flow = AsyncMock()
+        mock_flow = MagicMock(side_effect=lambda name: {
+            "output": mock_output_flow,
+            "triples": mock_triples_flow,
+        }.get(name))
+
+        processor.save_child_document = AsyncMock(return_value="mock-id")
+
+        with patch('trustgraph.decoding.universal.processor.magic') as mock_magic:
+            mock_magic.from_buffer.return_value = "application/pdf"
+            await processor.on_message(mock_msg, None, mock_flow)
+
+        # Only 1 TextDocument output (the page text, not the image)
+        assert mock_output_flow.send.call_count == 1
+
+        # But 2 triples outputs (page provenance + image provenance)
+        assert mock_triples_flow.send.call_count == 2
+
+        # save_child_document called twice (page + image)
+        assert processor.save_child_document.call_count == 2
+
+    @patch('trustgraph.base.flow_processor.FlowProcessor.add_args')
+    def test_add_args(self, mock_parent_add_args):
+        """Test add_args registers all expected arguments."""
+        mock_parser = MagicMock()
+
+        Processor.add_args(mock_parser)
+
+        # The parent class must be given the same parser
+        mock_parent_add_args.assert_called_once_with(mock_parser)
+
+        # Check key arguments are registered; each entry is the tuple of
+        # positional arguments passed to one add_argument call
+        arg_names = [
+            c[0] for c in mock_parser.add_argument.call_args_list
+        ]
+        assert ('--strategy',) in arg_names
+        assert ('--languages',) in arg_names
+        assert ('--section-strategy',) in arg_names
+        assert ('--section-element-count',) in arg_names
+        assert ('--section-max-size',) in arg_names
+        assert ('--section-within-pages',) in arg_names
+
+    @patch('trustgraph.decoding.universal.processor.Processor.launch')
+    def test_run(self, mock_launch):
+        """Test run function."""
+        from trustgraph.decoding.universal.processor import run
+        run()
+
+        # launch is called once, with the ident as the first positional
+        # argument and a description as the second
+        mock_launch.assert_called_once()
+        args = mock_launch.call_args[0]
+        assert args[0] == "document-decoder"
+        assert "Universal document decoder" in args[1]
+
+
+# Allow running this test module directly as a script.
+if __name__ == '__main__':
+    pytest.main([__file__])
--- a/tests/unit/test_decoding/test_universal_strategies.py
+++ b/tests/unit/test_decoding/test_universal_strategies.py
@ -0,0 +1,204 @@
+"""
+Unit tests for universal decoder section grouping strategies.
+"""
+
+import pytest
+from unittest.mock import MagicMock
+
+
+from trustgraph.decoding.universal.strategies import (
+    group_whole_document,
+    group_by_heading,
+    group_by_element_type,
+    group_by_count,
+    group_by_size,
+    get_strategy,
+    STRATEGIES,
+)
+
+
+def make_element(category="NarrativeText", text="Some text"):
+    """Build a MagicMock exposing the category and text of an element."""
+    element = MagicMock()
+    element.configure_mock(category=category, text=text)
+    return element
+
+
+class TestGroupWholeDocument:
+    """Tests for group_whole_document."""
+
+    def test_empty_input(self):
+        groups = group_whole_document([])
+        assert groups == []
+
+    def test_returns_single_group(self):
+        # Five elements come back as one group of five.
+        groups = group_whole_document([make_element() for _ in range(5)])
+        assert [len(group) for group in groups] == [5]
+
+    def test_preserves_all_elements(self):
+        # The single group equals the input list.
+        originals = [make_element(text=f"text-{i}") for i in range(3)]
+        groups = group_whole_document(originals)
+        assert groups[0] == originals
+
+
+class TestGroupByHeading:
+    """Tests for group_by_heading."""
+
+    def test_empty_input(self):
+        groups = group_by_heading([])
+        assert groups == []
+
+    def test_no_headings_falls_back(self):
+        # Without any Title element everything lands in one group.
+        paragraphs = [make_element("NarrativeText") for _ in range(3)]
+        groups = group_by_heading(paragraphs)
+        assert [len(group) for group in groups] == [3]
+
+    def test_splits_at_headings(self):
+        # Heading 1 + 2 paragraphs, then Heading 2 + 1 paragraph.
+        layout = (
+            ("Title", "Heading 1"),
+            ("NarrativeText", "Paragraph 1"),
+            ("NarrativeText", "Paragraph 2"),
+            ("Title", "Heading 2"),
+            ("NarrativeText", "Paragraph 3"),
+        )
+        groups = group_by_heading(
+            [make_element(category, text) for category, text in layout]
+        )
+        assert [len(group) for group in groups] == [3, 2]
+
+    def test_leading_content_before_first_heading(self):
+        # The preamble forms its own group ahead of the heading group.
+        layout = (
+            ("NarrativeText", "Preamble"),
+            ("Title", "Heading 1"),
+            ("NarrativeText", "Content"),
+        )
+        groups = group_by_heading(
+            [make_element(category, text) for category, text in layout]
+        )
+        assert [len(group) for group in groups] == [1, 2]
+
+    def test_consecutive_headings(self):
+        # Only the number of groups is checked here.
+        layout = (
+            ("Title", "H1"),
+            ("Title", "H2"),
+            ("NarrativeText", "Content"),
+        )
+        groups = group_by_heading(
+            [make_element(category, text) for category, text in layout]
+        )
+        assert len(groups) == 2
+
+
+class TestGroupByElementType:
+    """Tests for group_by_element_type."""
+
+    def test_empty_input(self):
+        groups = group_by_element_type([])
+        assert groups == []
+
+    def test_all_same_type(self):
+        # Only the number of groups is checked here.
+        paragraphs = [make_element("NarrativeText") for _ in range(3)]
+        groups = group_by_element_type(paragraphs)
+        assert len(groups) == 1
+
+    def test_splits_at_table_boundary(self):
+        # Two narrative elements, one table, one narrative element.
+        layout = (
+            ("NarrativeText", "Intro"),
+            ("NarrativeText", "More text"),
+            ("Table", "Table data"),
+            ("NarrativeText", "After table"),
+        )
+        groups = group_by_element_type(
+            [make_element(category, text) for category, text in layout]
+        )
+        assert [len(group) for group in groups] == [2, 1, 1]
+
+    def test_consecutive_tables_stay_grouped(self):
+        # Adjacent tables share a single group.
+        tables = [
+            make_element("Table", label) for label in ("Table 1", "Table 2")
+        ]
+        groups = group_by_element_type(tables)
+        assert [len(group) for group in groups] == [2]
+
+
+class TestGroupByCount:
+    """Tests for group_by_count."""
+
+    def test_empty_input(self):
+        groups = group_by_count([])
+        assert groups == []
+
+    def test_exact_multiple(self):
+        # Six elements in threes: two full groups.
+        batch = [make_element() for _ in range(6)]
+        groups = group_by_count(batch, element_count=3)
+        assert [len(group) for group in groups] == [3, 3]
+
+    def test_remainder_group(self):
+        # Seven elements in threes: two full groups and a remainder of one.
+        batch = [make_element() for _ in range(7)]
+        groups = group_by_count(batch, element_count=3)
+        assert [len(group) for group in groups] == [3, 3, 1]
+
+    def test_fewer_than_count(self):
+        # Fewer elements than the count: a single short group.
+        batch = [make_element() for _ in range(2)]
+        groups = group_by_count(batch, element_count=10)
+        assert [len(group) for group in groups] == [2]
+
+
+class TestGroupBySize:
+    """Tests for group_by_size."""
+
+    def test_empty_input(self):
+        groups = group_by_size([])
+        assert groups == []
+
+    def test_small_elements_grouped(self):
+        # Five two-character elements fit well under the limit.
+        tiny = [make_element(text="Hi") for _ in range(5)]
+        groups = group_by_size(tiny, max_size=100)
+        assert len(groups) == 1
+
+    def test_splits_at_size_limit(self):
+        # 100-char elements under a 250 limit: two per group (200 chars),
+        # leaving one element over.
+        blocks = [make_element(text="x" * 100) for _ in range(5)]
+        groups = group_by_size(blocks, max_size=250)
+        assert [len(group) for group in groups] == [2, 2, 1]
+
+    def test_large_element_own_group(self):
+        # The middle element exceeds max_size; only the group count is checked.
+        texts = ("small", "x" * 5000, "small")
+        groups = group_by_size(
+            [make_element(text=body) for body in texts], max_size=100
+        )
+        assert len(groups) == 3
+
+    def test_respects_element_boundaries(self):
+        # Each element is 50 chars and the limit is 120, so groups hold
+        # two whole elements rather than splitting one mid-way.
+        blocks = [make_element(text="x" * 50) for _ in range(5)]
+        groups = group_by_size(blocks, max_size=120)
+        assert [len(group) for group in groups] == [2, 2, 1]
+
+
+class TestGetStrategy:
+    """Tests for get_strategy and the STRATEGIES registry."""
+
+    def test_all_strategies_accessible(self):
+        # Every registered name resolves to something callable.
+        assert all(callable(get_strategy(name)) for name in STRATEGIES)
+
+    def test_unknown_strategy_raises(self):
+        # An unregistered name is rejected with a ValueError.
+        with pytest.raises(ValueError, match="Unknown section strategy"):
+            get_strategy("nonexistent")
+
+    def test_returns_correct_function(self):
+        # Each public name maps to its grouping function.
+        expected = {
+            "whole-document": group_whole_document,
+            "heading": group_by_heading,
+            "element-type": group_by_element_type,
+            "count": group_by_count,
+            "size": group_by_size,
+        }
+        for name, strategy in expected.items():
+            assert get_strategy(name) is strategy