"""
Unit tests for trustgraph.decoding.universal.processor
"""
import pytest
import base64
from unittest.mock import AsyncMock, MagicMock, patch
from unittest import IsolatedAsyncioTestCase
from trustgraph.decoding.universal.processor import (
Processor, assemble_section_text, MIME_EXTENSIONS, PAGE_BASED_FORMATS,
)
from trustgraph.schema import Document, TextDocument, Metadata, Triples
class MockAsyncProcessor:
def __init__(self, **params):
self.config_handlers = []
self.id = params.get('id', 'test-service')
self.specifications = []
self.pubsub = MagicMock()
self.taskgroup = params.get('taskgroup', MagicMock())
def make_element(category="NarrativeText", text="Some text",
page_number=None, text_as_html=None, image_base64=None):
"""Create a mock unstructured element."""
el = MagicMock()
el.category = category
el.text = text
el.metadata = MagicMock()
el.metadata.page_number = page_number
el.metadata.text_as_html = text_as_html
el.metadata.image_base64 = image_base64
return el
class TestAssembleSectionText:
"""Test the text assembly function."""
def test_narrative_text(self):
elements = [
make_element("NarrativeText", "Paragraph one."),
make_element("NarrativeText", "Paragraph two."),
]
text, types, tables, images = assemble_section_text(elements)
assert text == "Paragraph one.\n\nParagraph two."
assert "NarrativeText" in types
assert tables == 0
assert images == 0
def test_table_with_html(self):
elements = [
make_element("NarrativeText", "Before table."),
make_element(
"Table", "Col1 Col2",
text_as_html="
"
),
]
text, types, tables, images = assemble_section_text(elements)
assert "" in text
assert "Before table." in text
assert tables == 1
assert "Table" in types
def test_table_without_html_fallback(self):
el = make_element("Table", "plain table text")
el.metadata.text_as_html = None
elements = [el]
text, types, tables, images = assemble_section_text(elements)
assert text == "plain table text"
assert tables == 1
def test_images_skipped(self):
elements = [
make_element("NarrativeText", "Text content"),
make_element("Image", "OCR text from image"),
]
text, types, tables, images = assemble_section_text(elements)
assert "OCR text" not in text
assert "Text content" in text
assert images == 1
assert "Image" in types
def test_empty_elements(self):
text, types, tables, images = assemble_section_text([])
assert text == ""
assert len(types) == 0
assert tables == 0
assert images == 0
def test_mixed_elements(self):
elements = [
make_element("Title", "Section Heading"),
make_element("NarrativeText", "Body text."),
make_element(
"Table", "data",
text_as_html=""
),
make_element("Image", "img text"),
make_element("ListItem", "- item one"),
]
text, types, tables, images = assemble_section_text(elements)
assert "Section Heading" in text
assert "Body text." in text
assert "" in text
assert "img text" not in text
assert "- item one" in text
assert tables == 1
assert images == 1
assert {"Title", "NarrativeText", "Table", "Image", "ListItem"} == types
class TestMimeExtensions:
"""Test the mime type to extension mapping."""
def test_pdf_extension(self):
assert MIME_EXTENSIONS["application/pdf"] == ".pdf"
def test_docx_extension(self):
key = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
assert MIME_EXTENSIONS[key] == ".docx"
def test_html_extension(self):
assert MIME_EXTENSIONS["text/html"] == ".html"
class TestPageBasedFormats:
"""Test page-based format detection."""
def test_pdf_is_page_based(self):
assert "application/pdf" in PAGE_BASED_FORMATS
def test_html_is_not_page_based(self):
assert "text/html" not in PAGE_BASED_FORMATS
def test_pptx_is_page_based(self):
pptx = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
assert pptx in PAGE_BASED_FORMATS
class TestUniversalProcessor(IsolatedAsyncioTestCase):
"""Test universal decoder processor."""
@patch('trustgraph.decoding.universal.processor.Consumer')
@patch('trustgraph.decoding.universal.processor.Producer')
@patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
async def test_processor_initialization(
self, mock_producer, mock_consumer
):
"""Test processor initialization with defaults."""
config = {
'id': 'test-universal',
'taskgroup': AsyncMock(),
}
processor = Processor(**config)
assert processor.partition_strategy == "auto"
assert processor.section_strategy_name == "whole-document"
assert processor.section_element_count == 20
assert processor.section_max_size == 4000
# Check specs: input consumer, output producer, triples producer
consumer_specs = [
s for s in processor.specifications if hasattr(s, 'handler')
]
assert len(consumer_specs) >= 1
assert consumer_specs[0].name == "input"
assert consumer_specs[0].schema == Document
@patch('trustgraph.decoding.universal.processor.Consumer')
@patch('trustgraph.decoding.universal.processor.Producer')
@patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
async def test_processor_custom_strategy(
self, mock_producer, mock_consumer
):
"""Test processor initialization with custom section strategy."""
config = {
'id': 'test-universal',
'taskgroup': AsyncMock(),
'section_strategy': 'heading',
'strategy': 'hi_res',
}
processor = Processor(**config)
assert processor.partition_strategy == "hi_res"
assert processor.section_strategy_name == "heading"
@patch('trustgraph.decoding.universal.processor.Consumer')
@patch('trustgraph.decoding.universal.processor.Producer')
@patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
async def test_group_by_page(self, mock_producer, mock_consumer):
"""Test page grouping of elements."""
config = {
'id': 'test-universal',
'taskgroup': AsyncMock(),
}
processor = Processor(**config)
elements = [
make_element("NarrativeText", "Page 1 text", page_number=1),
make_element("NarrativeText", "More page 1", page_number=1),
make_element("NarrativeText", "Page 2 text", page_number=2),
]
result = processor.group_by_page(elements)
assert len(result) == 2
assert result[0][0] == 1 # page number
assert len(result[0][1]) == 2 # 2 elements on page 1
assert result[1][0] == 2
assert len(result[1][1]) == 1
@patch('trustgraph.decoding.universal.processor.Consumer')
@patch('trustgraph.decoding.universal.processor.Producer')
@patch('trustgraph.decoding.universal.processor.partition')
@patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
async def test_on_message_inline_non_page(
self, mock_partition, mock_producer, mock_consumer
):
"""Test processing an inline non-page document."""
config = {
'id': 'test-universal',
'taskgroup': AsyncMock(),
}
processor = Processor(**config)
# Mock partition to return elements without page numbers
mock_partition.return_value = [
make_element("Title", "Document Title"),
make_element("NarrativeText", "Body text content."),
]
# Mock message with inline data
content = b"# Document Title\nBody text content."
mock_metadata = Metadata(id="test-doc", user="testuser",
collection="default")
mock_document = Document(
metadata=mock_metadata,
data=base64.b64encode(content).decode('utf-8'),
)
mock_msg = MagicMock()
mock_msg.value.return_value = mock_document
# Mock flow
mock_output_flow = AsyncMock()
mock_triples_flow = AsyncMock()
mock_flow = MagicMock(side_effect=lambda name: {
"output": mock_output_flow,
"triples": mock_triples_flow,
}.get(name))
# Mock save_child_document and magic
processor.save_child_document = AsyncMock(return_value="mock-id")
with patch('trustgraph.decoding.universal.processor.magic') as mock_magic:
mock_magic.from_buffer.return_value = "text/markdown"
await processor.on_message(mock_msg, None, mock_flow)
# Should emit one section (whole-document strategy)
assert mock_output_flow.send.call_count == 1
assert mock_triples_flow.send.call_count == 1
# Check output
call_args = mock_output_flow.send.call_args[0][0]
assert isinstance(call_args, TextDocument)
assert call_args.document_id.startswith("urn:section:")
assert call_args.text == b""
@patch('trustgraph.decoding.universal.processor.Consumer')
@patch('trustgraph.decoding.universal.processor.Producer')
@patch('trustgraph.decoding.universal.processor.partition')
@patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
async def test_on_message_page_based(
self, mock_partition, mock_producer, mock_consumer
):
"""Test processing a page-based document."""
config = {
'id': 'test-universal',
'taskgroup': AsyncMock(),
}
processor = Processor(**config)
# Mock partition to return elements with page numbers
mock_partition.return_value = [
make_element("NarrativeText", "Page 1 content", page_number=1),
make_element("NarrativeText", "Page 2 content", page_number=2),
]
# Mock message
content = b"fake pdf"
mock_metadata = Metadata(id="test-doc", user="testuser",
collection="default")
mock_document = Document(
metadata=mock_metadata,
data=base64.b64encode(content).decode('utf-8'),
)
mock_msg = MagicMock()
mock_msg.value.return_value = mock_document
mock_output_flow = AsyncMock()
mock_triples_flow = AsyncMock()
mock_flow = MagicMock(side_effect=lambda name: {
"output": mock_output_flow,
"triples": mock_triples_flow,
}.get(name))
processor.save_child_document = AsyncMock(return_value="mock-id")
with patch('trustgraph.decoding.universal.processor.magic') as mock_magic:
mock_magic.from_buffer.return_value = "application/pdf"
await processor.on_message(mock_msg, None, mock_flow)
# Should emit two pages
assert mock_output_flow.send.call_count == 2
# Check first output uses page URI
call_args = mock_output_flow.send.call_args_list[0][0][0]
assert call_args.document_id.startswith("urn:page:")
@patch('trustgraph.decoding.universal.processor.Consumer')
@patch('trustgraph.decoding.universal.processor.Producer')
@patch('trustgraph.decoding.universal.processor.partition')
@patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
async def test_images_stored_not_emitted(
self, mock_partition, mock_producer, mock_consumer
):
"""Test that images are stored but not sent to text pipeline."""
config = {
'id': 'test-universal',
'taskgroup': AsyncMock(),
}
processor = Processor(**config)
mock_partition.return_value = [
make_element("NarrativeText", "Some text", page_number=1),
make_element("Image", "img ocr", page_number=1,
image_base64="aW1hZ2VkYXRh"),
]
content = b"fake pdf"
mock_metadata = Metadata(id="test-doc", user="testuser",
collection="default")
mock_document = Document(
metadata=mock_metadata,
data=base64.b64encode(content).decode('utf-8'),
)
mock_msg = MagicMock()
mock_msg.value.return_value = mock_document
mock_output_flow = AsyncMock()
mock_triples_flow = AsyncMock()
mock_flow = MagicMock(side_effect=lambda name: {
"output": mock_output_flow,
"triples": mock_triples_flow,
}.get(name))
processor.save_child_document = AsyncMock(return_value="mock-id")
with patch('trustgraph.decoding.universal.processor.magic') as mock_magic:
mock_magic.from_buffer.return_value = "application/pdf"
await processor.on_message(mock_msg, None, mock_flow)
# Only 1 TextDocument output (the page text, not the image)
assert mock_output_flow.send.call_count == 1
# But 2 triples outputs (page provenance + image provenance)
assert mock_triples_flow.send.call_count == 2
# save_child_document called twice (page + image)
assert processor.save_child_document.call_count == 2
@patch('trustgraph.base.flow_processor.FlowProcessor.add_args')
def test_add_args(self, mock_parent_add_args):
"""Test add_args registers all expected arguments."""
mock_parser = MagicMock()
Processor.add_args(mock_parser)
mock_parent_add_args.assert_called_once_with(mock_parser)
# Check key arguments are registered
arg_names = [
c[0] for c in mock_parser.add_argument.call_args_list
]
assert ('--strategy',) in arg_names
assert ('--languages',) in arg_names
assert ('--section-strategy',) in arg_names
assert ('--section-element-count',) in arg_names
assert ('--section-max-size',) in arg_names
assert ('--section-within-pages',) in arg_names
@patch('trustgraph.decoding.universal.processor.Processor.launch')
def test_run(self, mock_launch):
"""Test run function."""
from trustgraph.decoding.universal.processor import run
run()
mock_launch.assert_called_once()
args = mock_launch.call_args[0]
assert args[0] == "document-decoder"
assert "Universal document decoder" in args[1]
if __name__ == '__main__':
pytest.main([__file__])