mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 08:26:21 +02:00
Add universal document decoder with multi-format support (#705)
Add universal document decoder with multi-format support using 'unstructured'. New universal decoder service powered by the unstructured library, handling DOCX, XLSX, PPTX, HTML, Markdown, CSV, RTF, ODT, EPUB and more through a single service. Tables are preserved as HTML markup for better downstream extraction. Images are stored in the librarian but excluded from the text pipeline. Configurable section grouping strategies (whole-document, heading, element-type, count, size) for non-page formats. Page-based formats (PDF, PPTX, XLSX) are automatically grouped by page. All four decoders (PDF, Mistral OCR, Tesseract OCR, universal) now share the "document-decoder" ident so they are interchangeable. PDF-only decoders fetch document metadata to check MIME type and gracefully skip unsupported formats. Librarian changes: removed MIME type whitelist validation so any document format can be ingested. Simplified routing so text/plain goes to text-load and everything else goes to document-load. Removed dual inline/streaming data paths — documents always use document_id for content retrieval. New provenance entity types (tg:Section, tg:Image) and metadata predicates (tg:elementTypes, tg:tableCount, tg:imageCount) for richer explainability. Universal decoder is in its own package (trustgraph-unstructured) and container image (trustgraph-unstructured).
This commit is contained in:
parent
4609424afe
commit
5c6fe90fe2
25 changed files with 2247 additions and 79 deletions
|
|
@ -240,7 +240,7 @@ class TestMistralOcrProcessor(IsolatedAsyncioTestCase):
|
|||
|
||||
mock_launch.assert_called_once()
|
||||
args = mock_launch.call_args[0]
|
||||
assert args[0] == "pdf-decoder"
|
||||
assert args[0] == "document-decoder"
|
||||
assert "Mistral OCR decoder" in args[1]
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -187,7 +187,7 @@ class TestPdfDecoderProcessor(IsolatedAsyncioTestCase):
|
|||
"""Test run function"""
|
||||
from trustgraph.decoding.pdf.pdf_decoder import run
|
||||
run()
|
||||
mock_launch.assert_called_once_with("pdf-decoder",
|
||||
mock_launch.assert_called_once_with("document-decoder",
|
||||
"\nSimple decoder, accepts PDF documents on input, outputs pages from the\nPDF document as text as separate output objects.\n\nSupports both inline document data and fetching from librarian via Pulsar\nfor large documents.\n")
|
||||
|
||||
|
||||
|
|
|
|||
412
tests/unit/test_decoding/test_universal_processor.py
Normal file
412
tests/unit/test_decoding/test_universal_processor.py
Normal file
|
|
@ -0,0 +1,412 @@
|
|||
"""
|
||||
Unit tests for trustgraph.decoding.universal.processor
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import base64
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
from unittest import IsolatedAsyncioTestCase
|
||||
|
||||
from trustgraph.decoding.universal.processor import (
|
||||
Processor, assemble_section_text, MIME_EXTENSIONS, PAGE_BASED_FORMATS,
|
||||
)
|
||||
from trustgraph.schema import Document, TextDocument, Metadata, Triples
|
||||
|
||||
|
||||
class MockAsyncProcessor:
    """Minimal replacement patched over ``AsyncProcessor`` in these tests.

    It records only the attributes set below; ``id`` and ``taskgroup`` are
    taken from the keyword parameters when supplied.
    """

    def __init__(self, **params):
        # Caller-supplied identity, with test defaults as fallback.
        self.id = params.get('id', 'test-service')
        self.taskgroup = params.get('taskgroup', MagicMock())

        # Messaging backend is always a fresh mock.
        self.pubsub = MagicMock()

        # Start with empty registries.
        self.config_handlers = []
        self.specifications = []
|
||||
|
||||
|
||||
def make_element(category="NarrativeText", text="Some text",
                 page_number=None, text_as_html=None, image_base64=None):
    """Build a mock element exposing category, text and metadata.

    The returned mock's ``metadata`` carries ``page_number``,
    ``text_as_html`` and ``image_base64`` exactly as passed in.
    """
    # Assemble the metadata first, then attach it to the element.
    metadata = MagicMock()
    metadata.image_base64 = image_base64
    metadata.text_as_html = text_as_html
    metadata.page_number = page_number

    element = MagicMock()
    element.text = text
    element.category = category
    element.metadata = metadata
    return element
|
||||
|
||||
|
||||
class TestAssembleSectionText:
    """Exercise assemble_section_text over different element mixes.

    Each test unpacks the four-part result as
    (text, element types, table count, image count).
    """

    def test_narrative_text(self):
        """Two narrative paragraphs are joined with a blank line."""
        paragraphs = [
            make_element("NarrativeText", "Paragraph one."),
            make_element("NarrativeText", "Paragraph two."),
        ]

        body, kinds, table_count, image_count = assemble_section_text(
            paragraphs
        )

        assert body == "Paragraph one.\n\nParagraph two."
        assert "NarrativeText" in kinds
        assert table_count == 0
        assert image_count == 0

    def test_table_with_html(self):
        """A table carrying HTML markup contributes that markup to the text."""
        html_table = make_element(
            "Table", "Col1 Col2",
            text_as_html="<table><tr><td>Col1</td><td>Col2</td></tr></table>"
        )
        intro = make_element("NarrativeText", "Before table.")

        body, kinds, table_count, image_count = assemble_section_text(
            [intro, html_table]
        )

        assert "<table>" in body
        assert "Before table." in body
        assert table_count == 1
        assert "Table" in kinds

    def test_table_without_html_fallback(self):
        """A table with no HTML markup falls back to its plain text."""
        plain_table = make_element(
            "Table", "plain table text", text_as_html=None
        )

        body, kinds, table_count, image_count = assemble_section_text(
            [plain_table]
        )

        assert body == "plain table text"
        assert table_count == 1

    def test_images_skipped(self):
        """Image elements are counted but their text is left out."""
        section = [
            make_element("NarrativeText", "Text content"),
            make_element("Image", "OCR text from image"),
        ]

        body, kinds, table_count, image_count = assemble_section_text(
            section
        )

        assert "OCR text" not in body
        assert "Text content" in body
        assert image_count == 1
        assert "Image" in kinds

    def test_empty_elements(self):
        """No elements yields empty text, no types and zero counts."""
        body, kinds, table_count, image_count = assemble_section_text([])

        assert body == ""
        assert len(kinds) == 0
        assert table_count == 0
        assert image_count == 0

    def test_mixed_elements(self):
        """A mixed section keeps text and tables, drops image text."""
        section = [
            make_element("Title", "Section Heading"),
            make_element("NarrativeText", "Body text."),
            make_element(
                "Table", "data",
                text_as_html="<table><tr><td>data</td></tr></table>"
            ),
            make_element("Image", "img text"),
            make_element("ListItem", "- item one"),
        ]

        body, kinds, table_count, image_count = assemble_section_text(
            section
        )

        # Everything except the image's text ends up in the output.
        for fragment in (
            "Section Heading", "Body text.", "<table>", "- item one",
        ):
            assert fragment in body
        assert "img text" not in body

        assert table_count == 1
        assert image_count == 1
        assert kinds == {
            "Title", "NarrativeText", "Table", "Image", "ListItem",
        }
|
||||
|
||||
|
||||
class TestMimeExtensions:
    """Spot-check entries of the MIME type to file extension table."""

    def test_pdf_extension(self):
        """PDF maps to the .pdf extension."""
        extension = MIME_EXTENSIONS["application/pdf"]
        assert extension == ".pdf"

    def test_docx_extension(self):
        """The Word (OpenXML) MIME type maps to .docx."""
        docx_mime = (
            "application/vnd.openxmlformats-officedocument"
            ".wordprocessingml.document"
        )
        extension = MIME_EXTENSIONS[docx_mime]
        assert extension == ".docx"

    def test_html_extension(self):
        """HTML maps to the .html extension."""
        extension = MIME_EXTENSIONS["text/html"]
        assert extension == ".html"
|
||||
|
||||
|
||||
class TestPageBasedFormats:
    """Check membership of PAGE_BASED_FORMATS for a few MIME types."""

    def test_pdf_is_page_based(self):
        """PDF is listed as a page-based format."""
        pdf_mime = "application/pdf"
        assert pdf_mime in PAGE_BASED_FORMATS

    def test_html_is_not_page_based(self):
        """HTML is not listed as a page-based format."""
        html_mime = "text/html"
        assert html_mime not in PAGE_BASED_FORMATS

    def test_pptx_is_page_based(self):
        """PowerPoint (OpenXML) is listed as a page-based format."""
        pptx_mime = (
            "application/vnd.openxmlformats-officedocument"
            ".presentationml.presentation"
        )
        assert pptx_mime in PAGE_BASED_FORMATS
|
||||
|
||||
|
||||
class TestUniversalProcessor(IsolatedAsyncioTestCase):
    """Test universal decoder processor.

    Consumer, Producer and the AsyncProcessor base class are patched for
    every test that constructs a Processor.  ``@patch`` decorators apply
    bottom-up, and the AsyncProcessor patch passes an explicit replacement
    so it injects no argument; mock parameters therefore arrive in the
    order (partition,) producer, consumer.

    The fixture construction shared by several tests lives in the private
    helpers below so each test body shows only what is specific to it.
    """

    # ------------------------------------------------------------------
    # Shared fixtures
    # ------------------------------------------------------------------

    @staticmethod
    def _make_processor(**overrides):
        """Build a Processor with the standard test id/taskgroup.

        Any keyword overrides are merged on top of the base configuration.
        Must be called while the class-level patches are active, i.e. from
        inside a decorated test method.
        """
        config = {
            'id': 'test-universal',
            'taskgroup': AsyncMock(),
        }
        config.update(overrides)
        return Processor(**config)

    @staticmethod
    def _make_message(content):
        """Return a mock message whose value() is a Document.

        The Document carries fixed test metadata and ``content`` (bytes)
        base64-encoded into its ``data`` field.
        """
        document = Document(
            metadata=Metadata(id="test-doc", user="testuser",
                              collection="default"),
            data=base64.b64encode(content).decode('utf-8'),
        )
        message = MagicMock()
        message.value.return_value = document
        return message

    @staticmethod
    def _make_flow():
        """Return (flow, output_flow, triples_flow) mocks.

        Calling ``flow(name)`` yields the matching AsyncMock for "output"
        or "triples", and None for any other name.
        """
        output_flow = AsyncMock()
        triples_flow = AsyncMock()
        flow = MagicMock(side_effect=lambda name: {
            "output": output_flow,
            "triples": triples_flow,
        }.get(name))
        return flow, output_flow, triples_flow

    @staticmethod
    async def _dispatch(processor, message, flow, mime_type):
        """Run on_message with the module's ``magic`` patched.

        ``magic.from_buffer`` is made to report ``mime_type`` for the
        duration of the call.
        """
        with patch('trustgraph.decoding.universal.processor.magic') as mock_magic:
            mock_magic.from_buffer.return_value = mime_type
            await processor.on_message(message, None, flow)

    # ------------------------------------------------------------------
    # Construction
    # ------------------------------------------------------------------

    @patch('trustgraph.decoding.universal.processor.Consumer')
    @patch('trustgraph.decoding.universal.processor.Producer')
    @patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
    async def test_processor_initialization(
        self, mock_producer, mock_consumer
    ):
        """Test processor initialization with defaults."""
        processor = self._make_processor()

        assert processor.partition_strategy == "auto"
        assert processor.section_strategy_name == "whole-document"
        assert processor.section_element_count == 20
        assert processor.section_max_size == 4000

        # Only the consumer spec is inspected here: it is picked out as
        # the specification that has a handler attribute.
        consumer_specs = [
            s for s in processor.specifications if hasattr(s, 'handler')
        ]
        assert len(consumer_specs) >= 1
        assert consumer_specs[0].name == "input"
        assert consumer_specs[0].schema == Document

    @patch('trustgraph.decoding.universal.processor.Consumer')
    @patch('trustgraph.decoding.universal.processor.Producer')
    @patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
    async def test_processor_custom_strategy(
        self, mock_producer, mock_consumer
    ):
        """Test processor initialization with custom section strategy."""
        processor = self._make_processor(
            section_strategy='heading',
            strategy='hi_res',
        )

        assert processor.partition_strategy == "hi_res"
        assert processor.section_strategy_name == "heading"

    # ------------------------------------------------------------------
    # Grouping
    # ------------------------------------------------------------------

    @patch('trustgraph.decoding.universal.processor.Consumer')
    @patch('trustgraph.decoding.universal.processor.Producer')
    @patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
    async def test_group_by_page(self, mock_producer, mock_consumer):
        """Test page grouping of elements."""
        processor = self._make_processor()

        elements = [
            make_element("NarrativeText", "Page 1 text", page_number=1),
            make_element("NarrativeText", "More page 1", page_number=1),
            make_element("NarrativeText", "Page 2 text", page_number=2),
        ]

        result = processor.group_by_page(elements)

        # Each entry is (page number, elements on that page).
        assert len(result) == 2
        assert result[0][0] == 1
        assert len(result[0][1]) == 2
        assert result[1][0] == 2
        assert len(result[1][1]) == 1

    # ------------------------------------------------------------------
    # Message handling
    # ------------------------------------------------------------------

    @patch('trustgraph.decoding.universal.processor.Consumer')
    @patch('trustgraph.decoding.universal.processor.Producer')
    @patch('trustgraph.decoding.universal.processor.partition')
    @patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
    async def test_on_message_inline_non_page(
        self, mock_partition, mock_producer, mock_consumer
    ):
        """Test processing an inline non-page document."""
        processor = self._make_processor()

        # Partition yields elements without page numbers.
        mock_partition.return_value = [
            make_element("Title", "Document Title"),
            make_element("NarrativeText", "Body text content."),
        ]

        mock_msg = self._make_message(
            b"# Document Title\nBody text content."
        )
        mock_flow, mock_output_flow, mock_triples_flow = self._make_flow()
        processor.save_child_document = AsyncMock(return_value="mock-id")

        await self._dispatch(processor, mock_msg, mock_flow, "text/markdown")

        # Should emit one section (whole-document strategy)
        assert mock_output_flow.send.call_count == 1
        assert mock_triples_flow.send.call_count == 1

        # The emitted TextDocument references the section by id and
        # carries an empty text payload.
        call_args = mock_output_flow.send.call_args[0][0]
        assert isinstance(call_args, TextDocument)
        assert call_args.document_id.startswith("urn:section:")
        assert call_args.text == b""

    @patch('trustgraph.decoding.universal.processor.Consumer')
    @patch('trustgraph.decoding.universal.processor.Producer')
    @patch('trustgraph.decoding.universal.processor.partition')
    @patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
    async def test_on_message_page_based(
        self, mock_partition, mock_producer, mock_consumer
    ):
        """Test processing a page-based document."""
        processor = self._make_processor()

        # Partition yields one element on each of two pages.
        mock_partition.return_value = [
            make_element("NarrativeText", "Page 1 content", page_number=1),
            make_element("NarrativeText", "Page 2 content", page_number=2),
        ]

        mock_msg = self._make_message(b"fake pdf")
        mock_flow, mock_output_flow, mock_triples_flow = self._make_flow()
        processor.save_child_document = AsyncMock(return_value="mock-id")

        await self._dispatch(
            processor, mock_msg, mock_flow, "application/pdf"
        )

        # Should emit two pages
        assert mock_output_flow.send.call_count == 2

        # Check first output uses page URI
        call_args = mock_output_flow.send.call_args_list[0][0][0]
        assert call_args.document_id.startswith("urn:page:")

    @patch('trustgraph.decoding.universal.processor.Consumer')
    @patch('trustgraph.decoding.universal.processor.Producer')
    @patch('trustgraph.decoding.universal.processor.partition')
    @patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
    async def test_images_stored_not_emitted(
        self, mock_partition, mock_producer, mock_consumer
    ):
        """Test that images are stored but not sent to text pipeline."""
        processor = self._make_processor()

        # One text element and one image element, both on page 1.
        mock_partition.return_value = [
            make_element("NarrativeText", "Some text", page_number=1),
            make_element("Image", "img ocr", page_number=1,
                         image_base64="aW1hZ2VkYXRh"),
        ]

        mock_msg = self._make_message(b"fake pdf")
        mock_flow, mock_output_flow, mock_triples_flow = self._make_flow()
        processor.save_child_document = AsyncMock(return_value="mock-id")

        await self._dispatch(
            processor, mock_msg, mock_flow, "application/pdf"
        )

        # Only 1 TextDocument output (the page text, not the image)
        assert mock_output_flow.send.call_count == 1

        # But 2 triples outputs (page provenance + image provenance)
        assert mock_triples_flow.send.call_count == 2

        # save_child_document called twice (page + image)
        assert processor.save_child_document.call_count == 2

    # ------------------------------------------------------------------
    # Command line and launch
    # ------------------------------------------------------------------

    @patch('trustgraph.base.flow_processor.FlowProcessor.add_args')
    def test_add_args(self, mock_parent_add_args):
        """Test add_args registers all expected arguments."""
        mock_parser = MagicMock()

        Processor.add_args(mock_parser)

        mock_parent_add_args.assert_called_once_with(mock_parser)

        # Positional args of every add_argument call, as tuples.
        arg_names = [
            c[0] for c in mock_parser.add_argument.call_args_list
        ]
        assert ('--strategy',) in arg_names
        assert ('--languages',) in arg_names
        assert ('--section-strategy',) in arg_names
        assert ('--section-element-count',) in arg_names
        assert ('--section-max-size',) in arg_names
        assert ('--section-within-pages',) in arg_names

    @patch('trustgraph.decoding.universal.processor.Processor.launch')
    def test_run(self, mock_launch):
        """Test run function."""
        from trustgraph.decoding.universal.processor import run
        run()

        mock_launch.assert_called_once()
        args = mock_launch.call_args[0]
        assert args[0] == "document-decoder"
        assert "Universal document decoder" in args[1]
|
||||
|
||||
|
||||
# Allow running this test module directly; hands this file to pytest.
if __name__ == '__main__':
    pytest.main([__file__])
|
||||
204
tests/unit/test_decoding/test_universal_strategies.py
Normal file
204
tests/unit/test_decoding/test_universal_strategies.py
Normal file
|
|
@ -0,0 +1,204 @@
|
|||
"""
|
||||
Unit tests for universal decoder section grouping strategies.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
|
||||
from trustgraph.decoding.universal.strategies import (
|
||||
group_whole_document,
|
||||
group_by_heading,
|
||||
group_by_element_type,
|
||||
group_by_count,
|
||||
group_by_size,
|
||||
get_strategy,
|
||||
STRATEGIES,
|
||||
)
|
||||
|
||||
|
||||
def make_element(category="NarrativeText", text="Some text"):
    """Return a mock element exposing ``category`` and ``text``."""
    element = MagicMock()
    element.text = text
    element.category = category
    return element
|
||||
|
||||
|
||||
class TestGroupWholeDocument:
    """group_whole_document puts every element into a single group."""

    def test_empty_input(self):
        """No elements gives no groups."""
        groups = group_whole_document([])
        assert groups == []

    def test_returns_single_group(self):
        """Five elements come back as one group of five."""
        elements = [make_element() for _ in range(5)]

        groups = group_whole_document(elements)

        assert [len(group) for group in groups] == [5]

    def test_preserves_all_elements(self):
        """The single group equals the input list."""
        elements = [make_element(text=f"text-{i}") for i in range(3)]

        groups = group_whole_document(elements)

        assert groups[0] == elements
|
||||
|
||||
|
||||
class TestGroupByHeading:
    """group_by_heading starts a new group at each Title element."""

    def test_empty_input(self):
        """No elements gives no groups."""
        groups = group_by_heading([])
        assert groups == []

    def test_no_headings_falls_back(self):
        """Without any Title element everything stays in one group."""
        elements = [make_element("NarrativeText") for _ in range(3)]

        groups = group_by_heading(elements)

        assert [len(group) for group in groups] == [3]

    def test_splits_at_headings(self):
        """Each heading opens a group holding the content that follows."""
        elements = [
            make_element("Title", "Heading 1"),
            make_element("NarrativeText", "Paragraph 1"),
            make_element("NarrativeText", "Paragraph 2"),
            make_element("Title", "Heading 2"),
            make_element("NarrativeText", "Paragraph 3"),
        ]

        groups = group_by_heading(elements)

        # Heading 1 + 2 paragraphs, then Heading 2 + 1 paragraph.
        assert [len(group) for group in groups] == [3, 2]

    def test_leading_content_before_first_heading(self):
        """Content ahead of the first heading forms its own group."""
        elements = [
            make_element("NarrativeText", "Preamble"),
            make_element("Title", "Heading 1"),
            make_element("NarrativeText", "Content"),
        ]

        groups = group_by_heading(elements)

        # Preamble alone, then heading + content.
        assert [len(group) for group in groups] == [1, 2]

    def test_consecutive_headings(self):
        """Two back-to-back headings still produce two groups."""
        elements = [
            make_element("Title", "H1"),
            make_element("Title", "H2"),
            make_element("NarrativeText", "Content"),
        ]

        groups = group_by_heading(elements)

        assert len(groups) == 2
|
||||
|
||||
|
||||
class TestGroupByElementType:
    """group_by_element_type splits where the element category changes."""

    def test_empty_input(self):
        """No elements gives no groups."""
        groups = group_by_element_type([])
        assert groups == []

    def test_all_same_type(self):
        """A run of one category stays in a single group."""
        elements = [make_element("NarrativeText") for _ in range(3)]

        groups = group_by_element_type(elements)

        assert len(groups) == 1

    def test_splits_at_table_boundary(self):
        """A table between narrative runs yields three groups."""
        elements = [
            make_element("NarrativeText", "Intro"),
            make_element("NarrativeText", "More text"),
            make_element("Table", "Table data"),
            make_element("NarrativeText", "After table"),
        ]

        groups = group_by_element_type(elements)

        # Two narrative elements, one table, one narrative.
        assert [len(group) for group in groups] == [2, 1, 1]

    def test_consecutive_tables_stay_grouped(self):
        """Adjacent tables share one group."""
        elements = [
            make_element("Table", "Table 1"),
            make_element("Table", "Table 2"),
        ]

        groups = group_by_element_type(elements)

        assert [len(group) for group in groups] == [2]
|
||||
|
||||
|
||||
class TestGroupByCount:
    """group_by_count chunks elements into groups of ``element_count``."""

    def test_empty_input(self):
        """No elements gives no groups."""
        groups = group_by_count([])
        assert groups == []

    def test_exact_multiple(self):
        """Six elements at three per group give two full groups."""
        elements = [make_element() for _ in range(6)]

        groups = group_by_count(elements, element_count=3)

        assert [len(group) for group in groups] == [3, 3]

    def test_remainder_group(self):
        """Leftover elements land in a final, shorter group."""
        elements = [make_element() for _ in range(7)]

        groups = group_by_count(elements, element_count=3)

        assert [len(group) for group in groups] == [3, 3, 1]

    def test_fewer_than_count(self):
        """Fewer elements than the count still form one group."""
        elements = [make_element() for _ in range(2)]

        groups = group_by_count(elements, element_count=10)

        assert [len(group) for group in groups] == [2]
|
||||
|
||||
|
||||
class TestGroupBySize:
    """group_by_size packs whole elements into groups up to ``max_size``."""

    def test_empty_input(self):
        """No elements gives no groups."""
        groups = group_by_size([])
        assert groups == []

    def test_small_elements_grouped(self):
        """Elements well under the limit share one group."""
        elements = [make_element(text="Hi") for _ in range(5)]

        groups = group_by_size(elements, max_size=100)

        assert len(groups) == 1

    def test_splits_at_size_limit(self):
        """A new group starts once the next element would exceed the limit."""
        elements = [make_element(text="x" * 100) for _ in range(5)]

        groups = group_by_size(elements, max_size=250)

        # 2 elements per group (200 chars), then split
        assert [len(group) for group in groups] == [2, 2, 1]

    def test_large_element_own_group(self):
        """An element larger than the limit is isolated in its own group."""
        oversized = make_element(text="x" * 5000)  # Exceeds max
        elements = [
            make_element(text="small"),
            oversized,
            make_element(text="small"),
        ]

        groups = group_by_size(elements, max_size=100)

        assert len(groups) == 3

    def test_respects_element_boundaries(self):
        """Groups are split between elements, never inside one."""
        # Each element is 50 chars, max is 120
        # Should get 2 per group, not split mid-element
        elements = [make_element(text="x" * 50) for _ in range(5)]

        groups = group_by_size(elements, max_size=120)

        assert [len(group) for group in groups] == [2, 2, 1]
|
||||
|
||||
|
||||
class TestGetStrategy:
    """get_strategy resolves a strategy name to its grouping function."""

    def test_all_strategies_accessible(self):
        """Every name in STRATEGIES resolves to a callable."""
        for strategy_name in STRATEGIES:
            assert callable(get_strategy(strategy_name))

    def test_unknown_strategy_raises(self):
        """An unregistered name raises ValueError with a clear message."""
        with pytest.raises(ValueError, match="Unknown section strategy"):
            get_strategy("nonexistent")

    def test_returns_correct_function(self):
        """Each documented name maps to its specific function object."""
        expected = [
            ("whole-document", group_whole_document),
            ("heading", group_by_heading),
            ("element-type", group_by_element_type),
            ("count", group_by_count),
            ("size", group_by_size),
        ]
        for strategy_name, grouping_fn in expected:
            assert get_strategy(strategy_name) is grouping_fn
|
||||
Loading…
Add table
Add a link
Reference in a new issue