Add universal document decoder with multi-format support (#705)

Add universal document decoder with multi-format support
using 'unstructured'.

New universal decoder service powered by the unstructured
library, handling DOCX, XLSX, PPTX, HTML, Markdown, CSV, RTF,
ODT, EPUB and more through a single service. Tables are preserved
as HTML markup for better downstream extraction. Images are
stored in the librarian but excluded from the text
pipeline. Configurable section grouping strategies
(whole-document, heading, element-type, count, size) for non-page
formats. Page-based formats (PDF, PPTX, XLSX) are automatically
grouped by page.

All four decoders (PDF, Mistral OCR, Tesseract OCR, universal)
now share the "document-decoder" ident so they are
interchangeable. PDF-only decoders fetch document metadata to
check MIME type and gracefully skip unsupported formats.

Librarian changes: removed MIME type whitelist validation so any
document format can be ingested. Simplified routing so text/plain
goes to text-load and everything else goes to document-load.
Removed dual inline/streaming data paths — documents always use
document_id for content retrieval.

New provenance entity types (tg:Section, tg:Image) and metadata
predicates (tg:elementTypes, tg:tableCount, tg:imageCount) for
richer explainability.

Universal decoder is in its own package (trustgraph-unstructured)
and container image (trustgraph-unstructured).
This commit is contained in:
cybermaggedon 2026-03-23 12:56:35 +00:00 committed by GitHub
parent 4609424afe
commit 5c6fe90fe2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
25 changed files with 2247 additions and 79 deletions

View file

@ -240,7 +240,7 @@ class TestMistralOcrProcessor(IsolatedAsyncioTestCase):
mock_launch.assert_called_once()
args = mock_launch.call_args[0]
assert args[0] == "pdf-decoder"
assert args[0] == "document-decoder"
assert "Mistral OCR decoder" in args[1]

View file

@ -187,7 +187,7 @@ class TestPdfDecoderProcessor(IsolatedAsyncioTestCase):
"""Test run function"""
from trustgraph.decoding.pdf.pdf_decoder import run
run()
mock_launch.assert_called_once_with("pdf-decoder",
mock_launch.assert_called_once_with("document-decoder",
"\nSimple decoder, accepts PDF documents on input, outputs pages from the\nPDF document as text as separate output objects.\n\nSupports both inline document data and fetching from librarian via Pulsar\nfor large documents.\n")

View file

@ -0,0 +1,412 @@
"""
Unit tests for trustgraph.decoding.universal.processor
"""
import pytest
import base64
from unittest.mock import AsyncMock, MagicMock, patch
from unittest import IsolatedAsyncioTestCase
from trustgraph.decoding.universal.processor import (
Processor, assemble_section_text, MIME_EXTENSIONS, PAGE_BASED_FORMATS,
)
from trustgraph.schema import Document, TextDocument, Metadata, Triples
class MockAsyncProcessor:
    """Minimal stand-in used to patch out ``AsyncProcessor`` in these tests.

    Only stores attributes; ``id`` and ``taskgroup`` can be supplied via
    keyword parameters, everything else gets a fixed default.
    """

    def __init__(self, **params):
        # Evaluated eagerly, used only when no 'taskgroup' was passed in.
        fallback_taskgroup = MagicMock()

        self.config_handlers = []
        self.specifications = []
        self.pubsub = MagicMock()
        self.id = params.get('id', 'test-service')
        self.taskgroup = params.get('taskgroup', fallback_taskgroup)
def make_element(category="NarrativeText", text="Some text",
                 page_number=None, text_as_html=None, image_base64=None):
    """Create a mock unstructured element.

    The returned mock exposes ``category``, ``text`` and a ``metadata``
    mock carrying ``page_number``, ``text_as_html`` and ``image_base64``.
    """
    # Keyword arguments to MagicMock are set as attributes on the mock.
    metadata = MagicMock(
        page_number=page_number,
        text_as_html=text_as_html,
        image_base64=image_base64,
    )
    return MagicMock(category=category, text=text, metadata=metadata)
class TestAssembleSectionText:
    """Test the text assembly function.

    Each test unpacks the four values returned by
    ``assemble_section_text``: text, element types, table count and
    image count.
    """

    def test_narrative_text(self):
        """Two narrative paragraphs are joined by a blank line."""
        paragraphs = [
            make_element("NarrativeText", body)
            for body in ("Paragraph one.", "Paragraph two.")
        ]

        text, types, tables, images = assemble_section_text(paragraphs)

        assert text == "Paragraph one.\n\nParagraph two."
        assert "NarrativeText" in types
        assert tables == 0
        assert images == 0

    def test_table_with_html(self):
        """A table carrying text_as_html contributes its HTML markup."""
        intro = make_element("NarrativeText", "Before table.")
        table = make_element(
            "Table", "Col1 Col2",
            text_as_html="<table><tr><td>Col1</td><td>Col2</td></tr></table>"
        )

        text, types, tables, images = assemble_section_text([intro, table])

        assert "<table>" in text
        assert "Before table." in text
        assert tables == 1
        assert "Table" in types

    def test_table_without_html_fallback(self):
        """A table without text_as_html falls back to its plain text."""
        table = make_element("Table", "plain table text")
        table.metadata.text_as_html = None

        text, types, tables, images = assemble_section_text([table])

        assert text == "plain table text"
        assert tables == 1

    def test_images_skipped(self):
        """Image text is left out of the output but the image is counted."""
        narrative = make_element("NarrativeText", "Text content")
        picture = make_element("Image", "OCR text from image")

        text, types, tables, images = assemble_section_text(
            [narrative, picture]
        )

        assert "OCR text" not in text
        assert "Text content" in text
        assert images == 1
        assert "Image" in types

    def test_empty_elements(self):
        """No elements yields empty text, no types and zero counts."""
        text, types, tables, images = assemble_section_text([])

        assert text == ""
        assert len(types) == 0
        assert tables == 0
        assert images == 0

    def test_mixed_elements(self):
        """A mix of element kinds is assembled and counted together."""
        mixture = [
            make_element("Title", "Section Heading"),
            make_element("NarrativeText", "Body text."),
            make_element(
                "Table", "data",
                text_as_html="<table><tr><td>data</td></tr></table>"
            ),
            make_element("Image", "img text"),
            make_element("ListItem", "- item one"),
        ]

        text, types, tables, images = assemble_section_text(mixture)

        # Everything except the image text should appear in the output
        assert "Section Heading" in text
        assert "Body text." in text
        assert "<table>" in text
        assert "img text" not in text
        assert "- item one" in text

        assert tables == 1
        assert images == 1
        assert {"Title", "NarrativeText", "Table", "Image", "ListItem"} == types
class TestMimeExtensions:
    """Test the mime type to extension mapping."""

    def test_pdf_extension(self):
        """application/pdf maps to .pdf."""
        extension = MIME_EXTENSIONS["application/pdf"]
        assert extension == ".pdf"

    def test_docx_extension(self):
        """The Word (OOXML) MIME type maps to .docx."""
        docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        extension = MIME_EXTENSIONS[docx_mime]
        assert extension == ".docx"

    def test_html_extension(self):
        """text/html maps to .html."""
        extension = MIME_EXTENSIONS["text/html"]
        assert extension == ".html"
class TestPageBasedFormats:
    """Test page-based format detection."""

    def test_pdf_is_page_based(self):
        """PDF is listed as a page-based format."""
        pdf_mime = "application/pdf"
        assert pdf_mime in PAGE_BASED_FORMATS

    def test_html_is_not_page_based(self):
        """HTML is not listed as a page-based format."""
        html_mime = "text/html"
        assert html_mime not in PAGE_BASED_FORMATS

    def test_pptx_is_page_based(self):
        """PowerPoint (OOXML) is listed as a page-based format."""
        pptx_mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
        assert pptx_mime in PAGE_BASED_FORMATS
class TestUniversalProcessor(IsolatedAsyncioTestCase):
    """Test universal decoder processor.

    The processor tests patch ``Consumer`` and ``Producer`` in the
    processor module and replace
    ``trustgraph.base.async_processor.AsyncProcessor`` with
    ``MockAsyncProcessor``; the message tests additionally patch
    ``partition`` and ``magic``.
    """

    # -- shared fixtures ---------------------------------------------------

    @staticmethod
    def _make_processor(**extra):
        """Build a Processor from the common test config plus *extra*."""
        settings = {
            'id': 'test-universal',
            'taskgroup': AsyncMock(),
        }
        settings.update(extra)
        return Processor(**settings)

    @staticmethod
    def _make_message(raw):
        """Return a mock message whose value is a Document with inline data.

        *raw* is base64-encoded and stored in the Document's ``data``.
        """
        document = Document(
            metadata=Metadata(id="test-doc", user="testuser",
                              collection="default"),
            data=base64.b64encode(raw).decode('utf-8'),
        )
        message = MagicMock()
        message.value.return_value = document
        return message

    @staticmethod
    def _make_flow():
        """Return ``(flow, output_flow, triples_flow)`` mocks.

        Calling ``flow(name)`` looks the name up among "output" and
        "triples"; any other name gives None.
        """
        output_flow = AsyncMock()
        triples_flow = AsyncMock()
        flow = MagicMock(side_effect=lambda name: {
            "output": output_flow,
            "triples": triples_flow,
        }.get(name))
        return flow, output_flow, triples_flow

    @staticmethod
    async def _deliver(processor, message, flow, detected_mime):
        """Run ``on_message`` with ``magic.from_buffer`` reporting a type."""
        target = 'trustgraph.decoding.universal.processor.magic'
        with patch(target) as fake_magic:
            fake_magic.from_buffer.return_value = detected_mime
            await processor.on_message(message, None, flow)

    # -- construction ------------------------------------------------------

    @patch('trustgraph.decoding.universal.processor.Consumer')
    @patch('trustgraph.decoding.universal.processor.Producer')
    @patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
    async def test_processor_initialization(
        self, mock_producer, mock_consumer
    ):
        """Test processor initialization with defaults."""
        processor = self._make_processor()

        assert processor.partition_strategy == "auto"
        assert processor.section_strategy_name == "whole-document"
        assert processor.section_element_count == 20
        assert processor.section_max_size == 4000

        # Only specs exposing a ``handler`` attribute are inspected here;
        # the first of them must be the Document input.
        with_handler = [
            spec for spec in processor.specifications
            if hasattr(spec, 'handler')
        ]
        assert len(with_handler) >= 1
        first_spec = with_handler[0]
        assert first_spec.name == "input"
        assert first_spec.schema == Document

    @patch('trustgraph.decoding.universal.processor.Consumer')
    @patch('trustgraph.decoding.universal.processor.Producer')
    @patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
    async def test_processor_custom_strategy(
        self, mock_producer, mock_consumer
    ):
        """Test processor initialization with custom section strategy."""
        processor = self._make_processor(
            section_strategy='heading',
            strategy='hi_res',
        )

        assert processor.partition_strategy == "hi_res"
        assert processor.section_strategy_name == "heading"

    # -- grouping ----------------------------------------------------------

    @patch('trustgraph.decoding.universal.processor.Consumer')
    @patch('trustgraph.decoding.universal.processor.Producer')
    @patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
    async def test_group_by_page(self, mock_producer, mock_consumer):
        """Test page grouping of elements."""
        processor = self._make_processor()

        pages = processor.group_by_page([
            make_element("NarrativeText", "Page 1 text", page_number=1),
            make_element("NarrativeText", "More page 1", page_number=1),
            make_element("NarrativeText", "Page 2 text", page_number=2),
        ])

        assert len(pages) == 2

        # Each entry is indexed as (page number, elements on that page)
        assert pages[0][0] == 1
        assert len(pages[0][1]) == 2
        assert pages[1][0] == 2
        assert len(pages[1][1]) == 1

    # -- message handling --------------------------------------------------

    @patch('trustgraph.decoding.universal.processor.Consumer')
    @patch('trustgraph.decoding.universal.processor.Producer')
    @patch('trustgraph.decoding.universal.processor.partition')
    @patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
    async def test_on_message_inline_non_page(
        self, mock_partition, mock_producer, mock_consumer
    ):
        """Test processing an inline non-page document."""
        processor = self._make_processor()

        # Elements without page numbers
        mock_partition.return_value = [
            make_element("Title", "Document Title"),
            make_element("NarrativeText", "Body text content."),
        ]

        message = self._make_message(b"# Document Title\nBody text content.")
        flow, output_flow, triples_flow = self._make_flow()
        processor.save_child_document = AsyncMock(return_value="mock-id")

        await self._deliver(processor, message, flow, "text/markdown")

        # Should emit one section (whole-document strategy)
        assert output_flow.send.call_count == 1
        assert triples_flow.send.call_count == 1

        # The emitted TextDocument carries a section URI and empty text
        sent = output_flow.send.call_args[0][0]
        assert isinstance(sent, TextDocument)
        assert sent.document_id.startswith("urn:section:")
        assert sent.text == b""

    @patch('trustgraph.decoding.universal.processor.Consumer')
    @patch('trustgraph.decoding.universal.processor.Producer')
    @patch('trustgraph.decoding.universal.processor.partition')
    @patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
    async def test_on_message_page_based(
        self, mock_partition, mock_producer, mock_consumer
    ):
        """Test processing a page-based document."""
        processor = self._make_processor()

        # Elements with page numbers, one per page
        mock_partition.return_value = [
            make_element("NarrativeText", "Page 1 content", page_number=1),
            make_element("NarrativeText", "Page 2 content", page_number=2),
        ]

        message = self._make_message(b"fake pdf")
        flow, output_flow, triples_flow = self._make_flow()
        processor.save_child_document = AsyncMock(return_value="mock-id")

        await self._deliver(processor, message, flow, "application/pdf")

        # Should emit two pages
        assert output_flow.send.call_count == 2

        # Check first output uses page URI
        first_sent = output_flow.send.call_args_list[0][0][0]
        assert first_sent.document_id.startswith("urn:page:")

    @patch('trustgraph.decoding.universal.processor.Consumer')
    @patch('trustgraph.decoding.universal.processor.Producer')
    @patch('trustgraph.decoding.universal.processor.partition')
    @patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
    async def test_images_stored_not_emitted(
        self, mock_partition, mock_producer, mock_consumer
    ):
        """Test that images are stored but not sent to text pipeline."""
        processor = self._make_processor()

        mock_partition.return_value = [
            make_element("NarrativeText", "Some text", page_number=1),
            make_element("Image", "img ocr", page_number=1,
                         image_base64="aW1hZ2VkYXRh"),
        ]

        message = self._make_message(b"fake pdf")
        flow, output_flow, triples_flow = self._make_flow()
        processor.save_child_document = AsyncMock(return_value="mock-id")

        await self._deliver(processor, message, flow, "application/pdf")

        # Only 1 TextDocument output (the page text, not the image)
        assert output_flow.send.call_count == 1
        # But 2 triples outputs (page provenance + image provenance)
        assert triples_flow.send.call_count == 2
        # save_child_document called twice (page + image)
        assert processor.save_child_document.call_count == 2

    # -- command line ------------------------------------------------------

    @patch('trustgraph.base.flow_processor.FlowProcessor.add_args')
    def test_add_args(self, mock_parent_add_args):
        """Test add_args registers all expected arguments."""
        parser = MagicMock()

        Processor.add_args(parser)

        mock_parent_add_args.assert_called_once_with(parser)

        # Positional-argument tuples of every add_argument call
        registered = [
            recorded[0] for recorded in parser.add_argument.call_args_list
        ]
        assert ('--strategy',) in registered
        assert ('--languages',) in registered
        assert ('--section-strategy',) in registered
        assert ('--section-element-count',) in registered
        assert ('--section-max-size',) in registered
        assert ('--section-within-pages',) in registered

    @patch('trustgraph.decoding.universal.processor.Processor.launch')
    def test_run(self, mock_launch):
        """Test run function."""
        from trustgraph.decoding.universal.processor import run

        run()

        mock_launch.assert_called_once()
        positional = mock_launch.call_args[0]
        assert positional[0] == "document-decoder"
        assert "Universal document decoder" in positional[1]
if __name__ == '__main__':
    # pytest.main() returns the run's exit code; propagate it so that
    # executing this file directly reports failures to the shell instead
    # of always exiting with status 0.
    raise SystemExit(pytest.main([__file__]))

View file

@ -0,0 +1,204 @@
"""
Unit tests for universal decoder section grouping strategies.
"""
import pytest
from unittest.mock import MagicMock
from trustgraph.decoding.universal.strategies import (
group_whole_document,
group_by_heading,
group_by_element_type,
group_by_count,
group_by_size,
get_strategy,
STRATEGIES,
)
def make_element(category="NarrativeText", text="Some text"):
    """Create a mock unstructured element.

    The mock exposes the given ``category`` and ``text`` attributes.
    """
    # Keyword arguments to MagicMock are set as attributes on the mock.
    return MagicMock(category=category, text=text)
class TestGroupWholeDocument:
    """Tests for the whole-document grouping strategy."""

    def test_empty_input(self):
        """No elements produce no groups."""
        groups = group_whole_document([])
        assert groups == []

    def test_returns_single_group(self):
        """Five elements end up together in one group."""
        five = [make_element() for _ in range(5)]

        groups = group_whole_document(five)

        assert len(groups) == 1
        assert len(groups[0]) == 5

    def test_preserves_all_elements(self):
        """The single group equals the input list."""
        originals = [make_element(text=f"text-{i}") for i in range(3)]

        groups = group_whole_document(originals)

        assert groups[0] == originals
class TestGroupByHeading:
    """Tests for heading-based grouping, using "Title" elements as headings."""

    def test_empty_input(self):
        """No elements produce no groups."""
        groups = group_by_heading([])
        assert groups == []

    def test_no_headings_falls_back(self):
        """Without any heading, all elements land in one group."""
        body_only = [make_element("NarrativeText") for _ in range(3)]

        groups = group_by_heading(body_only)

        assert len(groups) == 1
        assert len(groups[0]) == 3

    def test_splits_at_headings(self):
        """Each heading opens a new group that includes its paragraphs."""
        layout = (
            ("Title", "Heading 1"),
            ("NarrativeText", "Paragraph 1"),
            ("NarrativeText", "Paragraph 2"),
            ("Title", "Heading 2"),
            ("NarrativeText", "Paragraph 3"),
        )
        document = [make_element(kind, body) for kind, body in layout]

        groups = group_by_heading(document)

        assert len(groups) == 2
        assert len(groups[0]) == 3  # Heading 1 + 2 paragraphs
        assert len(groups[1]) == 2  # Heading 2 + 1 paragraph

    def test_leading_content_before_first_heading(self):
        """Content ahead of the first heading forms its own group."""
        layout = (
            ("NarrativeText", "Preamble"),
            ("Title", "Heading 1"),
            ("NarrativeText", "Content"),
        )
        document = [make_element(kind, body) for kind, body in layout]

        groups = group_by_heading(document)

        assert len(groups) == 2
        assert len(groups[0]) == 1  # Preamble
        assert len(groups[1]) == 2  # Heading + content

    def test_consecutive_headings(self):
        """Two headings in a row still yield two groups."""
        layout = (
            ("Title", "H1"),
            ("Title", "H2"),
            ("NarrativeText", "Content"),
        )
        document = [make_element(kind, body) for kind, body in layout]

        groups = group_by_heading(document)

        assert len(groups) == 2
class TestGroupByElementType:
    """Tests for grouping by element type."""

    def test_empty_input(self):
        """No elements produce no groups."""
        groups = group_by_element_type([])
        assert groups == []

    def test_all_same_type(self):
        """Elements of a single type form one group."""
        narratives = [make_element("NarrativeText") for _ in range(3)]

        groups = group_by_element_type(narratives)

        assert len(groups) == 1

    def test_splits_at_table_boundary(self):
        """A table between narrative runs splits them into three groups."""
        layout = (
            ("NarrativeText", "Intro"),
            ("NarrativeText", "More text"),
            ("Table", "Table data"),
            ("NarrativeText", "After table"),
        )
        document = [make_element(kind, body) for kind, body in layout]

        groups = group_by_element_type(document)

        assert len(groups) == 3
        assert len(groups[0]) == 2  # Two narrative elements
        assert len(groups[1]) == 1  # One table
        assert len(groups[2]) == 1  # One narrative

    def test_consecutive_tables_stay_grouped(self):
        """Adjacent tables share a single group."""
        tables = [
            make_element("Table", label)
            for label in ("Table 1", "Table 2")
        ]

        groups = group_by_element_type(tables)

        assert len(groups) == 1
        assert len(groups[0]) == 2
class TestGroupByCount:
    """Tests for fixed-count grouping."""

    def test_empty_input(self):
        """No elements produce no groups."""
        groups = group_by_count([])
        assert groups == []

    def test_exact_multiple(self):
        """Six elements at three per group give two full groups."""
        six = [make_element() for _ in range(6)]

        groups = group_by_count(six, element_count=3)

        assert len(groups) == 2
        for group in groups:
            assert len(group) == 3

    def test_remainder_group(self):
        """Seven elements at three per group leave a final group of one."""
        seven = [make_element() for _ in range(7)]

        groups = group_by_count(seven, element_count=3)

        assert len(groups) == 3
        assert len(groups[0]) == 3
        assert len(groups[1]) == 3
        assert len(groups[2]) == 1

    def test_fewer_than_count(self):
        """Fewer elements than the count give one short group."""
        pair = [make_element() for _ in range(2)]

        groups = group_by_count(pair, element_count=10)

        assert len(groups) == 1
        assert len(groups[0]) == 2
class TestGroupBySize:
    """Tests for size-limited grouping."""

    def test_empty_input(self):
        """No elements produce no groups."""
        groups = group_by_size([])
        assert groups == []

    def test_small_elements_grouped(self):
        """Five two-character elements fit in one group under max 100."""
        tiny = [make_element(text="Hi") for _ in range(5)]

        groups = group_by_size(tiny, max_size=100)

        assert len(groups) == 1

    def test_splits_at_size_limit(self):
        """100-char elements under a 250 limit are grouped in pairs."""
        blocks = [make_element(text="x" * 100) for _ in range(5)]

        groups = group_by_size(blocks, max_size=250)

        # 2 elements per group (200 chars), then split
        assert len(groups) == 3
        assert len(groups[0]) == 2
        assert len(groups[1]) == 2
        assert len(groups[2]) == 1

    def test_large_element_own_group(self):
        """An element larger than the limit sits between two small ones."""
        oversized = make_element(text="x" * 5000)  # Exceeds max
        document = [
            make_element(text="small"),
            oversized,
            make_element(text="small"),
        ]

        groups = group_by_size(document, max_size=100)

        assert len(groups) == 3

    def test_respects_element_boundaries(self):
        """Groups are made of whole elements, never part of one."""
        # Each element is 50 chars, max is 120
        # Should get 2 per group, not split mid-element
        blocks = [make_element(text="x" * 50) for _ in range(5)]

        groups = group_by_size(blocks, max_size=120)

        assert len(groups) == 3
        assert len(groups[0]) == 2
        assert len(groups[1]) == 2
        assert len(groups[2]) == 1
class TestGetStrategy:
    """Tests for looking up grouping strategies by name."""

    def test_all_strategies_accessible(self):
        """Every name in STRATEGIES resolves to a callable."""
        for strategy_name in STRATEGIES:
            resolved = get_strategy(strategy_name)
            assert callable(resolved)

    def test_unknown_strategy_raises(self):
        """An unrecognised name raises ValueError with a clear message."""
        expected_error = pytest.raises(
            ValueError, match="Unknown section strategy"
        )
        with expected_error:
            get_strategy("nonexistent")

    def test_returns_correct_function(self):
        """Each documented name maps to its grouping function."""
        whole = get_strategy("whole-document")
        assert whole is group_whole_document

        heading = get_strategy("heading")
        assert heading is group_by_heading

        element_type = get_strategy("element-type")
        assert element_type is group_by_element_type

        count = get_strategy("count")
        assert count is group_by_count

        size = get_strategy("size")
        assert size is group_by_size