mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 00:16:23 +02:00
Introduces `workspace` as the isolation boundary for config, flows,
library, and knowledge data. Removes `user` as a schema-level field
throughout the code, API specs, and tests; workspace provides the
same separation more cleanly at the trusted flow.workspace layer
rather than through client-supplied message fields.
Design
------
- IAM tech spec (docs/tech-specs/iam.md) documents current state,
proposed auth/access model, and migration direction.
- Data ownership model (docs/tech-specs/data-ownership-model.md)
captures the workspace/collection/flow hierarchy.
Schema + messaging
------------------
- Drop `user` field from AgentRequest/Step, GraphRagQuery,
DocumentRagQuery, Triples/Graph/Document/Row EmbeddingsRequest,
Sparql/Rows/Structured QueryRequest, ToolServiceRequest.
- Keep collection/workspace routing via flow.workspace at the
service layer.
- Translators updated to not serialise/deserialise user.
API specs
---------
- OpenAPI schemas and path examples cleaned of user fields.
- Websocket async-api messages updated.
- Removed the unused parameters/User.yaml.
Services + base
---------------
- Librarian, collection manager, knowledge, config: all operations
scoped by workspace. Config client API takes workspace as first
positional arg.
- `flow.workspace` set at flow start time by the infrastructure;
no longer pass-through from clients.
- Tool service drops user-personalisation passthrough.
CLI + SDK
---------
- tg-init-workspace and workspace-aware import/export.
- All tg-* commands drop user args; accept --workspace.
- Python API/SDK (flow, socket_client, async_*, explainability,
library) drop user kwargs from every method signature.
MCP server
----------
- All tool endpoints drop user parameters; socket_manager no longer
keyed per user.
Flow service
------------
- Closure-based topic cleanup on flow stop: only delete topics
whose blueprint template was parameterised AND no remaining
live flow (across all workspaces) still resolves to that topic.
Three scopes fall out naturally from template analysis:
* {id} -> per-flow, deleted on stop
* {blueprint} -> per-blueprint, kept while any flow of the
same blueprint exists
* {workspace} -> per-workspace, kept while any flow in the
workspace exists
* literal -> global, never deleted (e.g. tg.request.librarian)
Fixes a bug where stopping a flow silently destroyed the global
librarian exchange, wedging all library operations until manual
restart.
RabbitMQ backend
----------------
- heartbeat=60, blocked_connection_timeout=300. Catches silently
dead connections (broker restart, orphaned channels, network
partitions) within ~2 heartbeat windows, so the consumer
reconnects and re-binds its queue rather than sitting forever
on a zombie connection.
Tests
-----
- Full test refresh: unit, integration, contract, provenance.
- Dropped user-field assertions and constructor kwargs across
~100 test files.
- Renamed user-collection isolation tests to workspace-collection.
412 lines
15 KiB
Python
412 lines
15 KiB
Python
"""
|
|
Unit tests for trustgraph.decoding.universal.processor
|
|
"""
|
|
|
|
import pytest
|
|
import base64
|
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
from unittest import IsolatedAsyncioTestCase
|
|
|
|
from trustgraph.decoding.universal.processor import (
|
|
Processor, assemble_section_text, MIME_EXTENSIONS, PAGE_BASED_FORMATS,
|
|
)
|
|
from trustgraph.schema import Document, TextDocument, Metadata, Triples
|
|
|
|
|
|
class MockAsyncProcessor:
|
|
def __init__(self, **params):
|
|
self.config_handlers = []
|
|
self.id = params.get('id', 'test-service')
|
|
self.specifications = []
|
|
self.pubsub = MagicMock()
|
|
self.taskgroup = params.get('taskgroup', MagicMock())
|
|
|
|
|
|
def make_element(category="NarrativeText", text="Some text",
|
|
page_number=None, text_as_html=None, image_base64=None):
|
|
"""Create a mock unstructured element."""
|
|
el = MagicMock()
|
|
el.category = category
|
|
el.text = text
|
|
el.metadata = MagicMock()
|
|
el.metadata.page_number = page_number
|
|
el.metadata.text_as_html = text_as_html
|
|
el.metadata.image_base64 = image_base64
|
|
return el
|
|
|
|
|
|
class TestAssembleSectionText:
|
|
"""Test the text assembly function."""
|
|
|
|
def test_narrative_text(self):
|
|
elements = [
|
|
make_element("NarrativeText", "Paragraph one."),
|
|
make_element("NarrativeText", "Paragraph two."),
|
|
]
|
|
text, types, tables, images = assemble_section_text(elements)
|
|
assert text == "Paragraph one.\n\nParagraph two."
|
|
assert "NarrativeText" in types
|
|
assert tables == 0
|
|
assert images == 0
|
|
|
|
def test_table_with_html(self):
|
|
elements = [
|
|
make_element("NarrativeText", "Before table."),
|
|
make_element(
|
|
"Table", "Col1 Col2",
|
|
text_as_html="<table><tr><td>Col1</td><td>Col2</td></tr></table>"
|
|
),
|
|
]
|
|
text, types, tables, images = assemble_section_text(elements)
|
|
assert "<table>" in text
|
|
assert "Before table." in text
|
|
assert tables == 1
|
|
assert "Table" in types
|
|
|
|
def test_table_without_html_fallback(self):
|
|
el = make_element("Table", "plain table text")
|
|
el.metadata.text_as_html = None
|
|
elements = [el]
|
|
text, types, tables, images = assemble_section_text(elements)
|
|
assert text == "plain table text"
|
|
assert tables == 1
|
|
|
|
def test_images_skipped(self):
|
|
elements = [
|
|
make_element("NarrativeText", "Text content"),
|
|
make_element("Image", "OCR text from image"),
|
|
]
|
|
text, types, tables, images = assemble_section_text(elements)
|
|
assert "OCR text" not in text
|
|
assert "Text content" in text
|
|
assert images == 1
|
|
assert "Image" in types
|
|
|
|
def test_empty_elements(self):
|
|
text, types, tables, images = assemble_section_text([])
|
|
assert text == ""
|
|
assert len(types) == 0
|
|
assert tables == 0
|
|
assert images == 0
|
|
|
|
def test_mixed_elements(self):
|
|
elements = [
|
|
make_element("Title", "Section Heading"),
|
|
make_element("NarrativeText", "Body text."),
|
|
make_element(
|
|
"Table", "data",
|
|
text_as_html="<table><tr><td>data</td></tr></table>"
|
|
),
|
|
make_element("Image", "img text"),
|
|
make_element("ListItem", "- item one"),
|
|
]
|
|
text, types, tables, images = assemble_section_text(elements)
|
|
assert "Section Heading" in text
|
|
assert "Body text." in text
|
|
assert "<table>" in text
|
|
assert "img text" not in text
|
|
assert "- item one" in text
|
|
assert tables == 1
|
|
assert images == 1
|
|
assert {"Title", "NarrativeText", "Table", "Image", "ListItem"} == types
|
|
|
|
|
|
class TestMimeExtensions:
|
|
"""Test the mime type to extension mapping."""
|
|
|
|
def test_pdf_extension(self):
|
|
assert MIME_EXTENSIONS["application/pdf"] == ".pdf"
|
|
|
|
def test_docx_extension(self):
|
|
key = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
|
assert MIME_EXTENSIONS[key] == ".docx"
|
|
|
|
def test_html_extension(self):
|
|
assert MIME_EXTENSIONS["text/html"] == ".html"
|
|
|
|
|
|
class TestPageBasedFormats:
|
|
"""Test page-based format detection."""
|
|
|
|
def test_pdf_is_page_based(self):
|
|
assert "application/pdf" in PAGE_BASED_FORMATS
|
|
|
|
def test_html_is_not_page_based(self):
|
|
assert "text/html" not in PAGE_BASED_FORMATS
|
|
|
|
def test_pptx_is_page_based(self):
|
|
pptx = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
|
assert pptx in PAGE_BASED_FORMATS
|
|
|
|
|
|
class TestUniversalProcessor(IsolatedAsyncioTestCase):
|
|
"""Test universal decoder processor."""
|
|
|
|
@patch('trustgraph.base.librarian_client.Consumer')
|
|
@patch('trustgraph.base.librarian_client.Producer')
|
|
@patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
|
|
async def test_processor_initialization(
|
|
self, mock_producer, mock_consumer
|
|
):
|
|
"""Test processor initialization with defaults."""
|
|
config = {
|
|
'id': 'test-universal',
|
|
'taskgroup': AsyncMock(),
|
|
}
|
|
|
|
processor = Processor(**config)
|
|
|
|
assert processor.partition_strategy == "auto"
|
|
assert processor.section_strategy_name == "whole-document"
|
|
assert processor.section_element_count == 20
|
|
assert processor.section_max_size == 4000
|
|
|
|
# Check specs: input consumer, output producer, triples producer
|
|
consumer_specs = [
|
|
s for s in processor.specifications if hasattr(s, 'handler')
|
|
]
|
|
assert len(consumer_specs) >= 1
|
|
assert consumer_specs[0].name == "input"
|
|
assert consumer_specs[0].schema == Document
|
|
|
|
@patch('trustgraph.base.librarian_client.Consumer')
|
|
@patch('trustgraph.base.librarian_client.Producer')
|
|
@patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
|
|
async def test_processor_custom_strategy(
|
|
self, mock_producer, mock_consumer
|
|
):
|
|
"""Test processor initialization with custom section strategy."""
|
|
config = {
|
|
'id': 'test-universal',
|
|
'taskgroup': AsyncMock(),
|
|
'section_strategy': 'heading',
|
|
'strategy': 'hi_res',
|
|
}
|
|
|
|
processor = Processor(**config)
|
|
|
|
assert processor.partition_strategy == "hi_res"
|
|
assert processor.section_strategy_name == "heading"
|
|
|
|
@patch('trustgraph.base.librarian_client.Consumer')
|
|
@patch('trustgraph.base.librarian_client.Producer')
|
|
@patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
|
|
async def test_group_by_page(self, mock_producer, mock_consumer):
|
|
"""Test page grouping of elements."""
|
|
config = {
|
|
'id': 'test-universal',
|
|
'taskgroup': AsyncMock(),
|
|
}
|
|
|
|
processor = Processor(**config)
|
|
|
|
elements = [
|
|
make_element("NarrativeText", "Page 1 text", page_number=1),
|
|
make_element("NarrativeText", "More page 1", page_number=1),
|
|
make_element("NarrativeText", "Page 2 text", page_number=2),
|
|
]
|
|
|
|
result = processor.group_by_page(elements)
|
|
|
|
assert len(result) == 2
|
|
assert result[0][0] == 1 # page number
|
|
assert len(result[0][1]) == 2 # 2 elements on page 1
|
|
assert result[1][0] == 2
|
|
assert len(result[1][1]) == 1
|
|
|
|
@patch('trustgraph.base.librarian_client.Consumer')
|
|
@patch('trustgraph.base.librarian_client.Producer')
|
|
@patch('trustgraph.decoding.universal.processor.partition')
|
|
@patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
|
|
async def test_on_message_inline_non_page(
|
|
self, mock_partition, mock_producer, mock_consumer
|
|
):
|
|
"""Test processing an inline non-page document."""
|
|
config = {
|
|
'id': 'test-universal',
|
|
'taskgroup': AsyncMock(),
|
|
}
|
|
|
|
processor = Processor(**config)
|
|
|
|
# Mock partition to return elements without page numbers
|
|
mock_partition.return_value = [
|
|
make_element("Title", "Document Title"),
|
|
make_element("NarrativeText", "Body text content."),
|
|
]
|
|
|
|
# Mock message with inline data
|
|
content = b"# Document Title\nBody text content."
|
|
mock_metadata = Metadata(id="test-doc",
|
|
collection="default")
|
|
mock_document = Document(
|
|
metadata=mock_metadata,
|
|
data=base64.b64encode(content).decode('utf-8'),
|
|
)
|
|
mock_msg = MagicMock()
|
|
mock_msg.value.return_value = mock_document
|
|
|
|
# Mock flow
|
|
mock_output_flow = AsyncMock()
|
|
mock_triples_flow = AsyncMock()
|
|
mock_flow = MagicMock(side_effect=lambda name: {
|
|
"output": mock_output_flow,
|
|
"triples": mock_triples_flow,
|
|
}.get(name))
|
|
|
|
# Mock save_child_document and magic
|
|
processor.librarian.save_child_document = AsyncMock(return_value="mock-id")
|
|
|
|
with patch('trustgraph.decoding.universal.processor.magic') as mock_magic:
|
|
mock_magic.from_buffer.return_value = "text/markdown"
|
|
await processor.on_message(mock_msg, None, mock_flow)
|
|
|
|
# Should emit one section (whole-document strategy)
|
|
assert mock_output_flow.send.call_count == 1
|
|
assert mock_triples_flow.send.call_count == 1
|
|
|
|
# Check output
|
|
call_args = mock_output_flow.send.call_args[0][0]
|
|
assert isinstance(call_args, TextDocument)
|
|
assert call_args.document_id.startswith("urn:section:")
|
|
assert call_args.text == b""
|
|
|
|
@patch('trustgraph.base.librarian_client.Consumer')
|
|
@patch('trustgraph.base.librarian_client.Producer')
|
|
@patch('trustgraph.decoding.universal.processor.partition')
|
|
@patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
|
|
async def test_on_message_page_based(
|
|
self, mock_partition, mock_producer, mock_consumer
|
|
):
|
|
"""Test processing a page-based document."""
|
|
config = {
|
|
'id': 'test-universal',
|
|
'taskgroup': AsyncMock(),
|
|
}
|
|
|
|
processor = Processor(**config)
|
|
|
|
# Mock partition to return elements with page numbers
|
|
mock_partition.return_value = [
|
|
make_element("NarrativeText", "Page 1 content", page_number=1),
|
|
make_element("NarrativeText", "Page 2 content", page_number=2),
|
|
]
|
|
|
|
# Mock message
|
|
content = b"fake pdf"
|
|
mock_metadata = Metadata(id="test-doc",
|
|
collection="default")
|
|
mock_document = Document(
|
|
metadata=mock_metadata,
|
|
data=base64.b64encode(content).decode('utf-8'),
|
|
)
|
|
mock_msg = MagicMock()
|
|
mock_msg.value.return_value = mock_document
|
|
|
|
mock_output_flow = AsyncMock()
|
|
mock_triples_flow = AsyncMock()
|
|
mock_flow = MagicMock(side_effect=lambda name: {
|
|
"output": mock_output_flow,
|
|
"triples": mock_triples_flow,
|
|
}.get(name))
|
|
|
|
processor.librarian.save_child_document = AsyncMock(return_value="mock-id")
|
|
|
|
with patch('trustgraph.decoding.universal.processor.magic') as mock_magic:
|
|
mock_magic.from_buffer.return_value = "application/pdf"
|
|
await processor.on_message(mock_msg, None, mock_flow)
|
|
|
|
# Should emit two pages
|
|
assert mock_output_flow.send.call_count == 2
|
|
|
|
# Check first output uses page URI
|
|
call_args = mock_output_flow.send.call_args_list[0][0][0]
|
|
assert call_args.document_id.startswith("urn:page:")
|
|
|
|
@patch('trustgraph.base.librarian_client.Consumer')
|
|
@patch('trustgraph.base.librarian_client.Producer')
|
|
@patch('trustgraph.decoding.universal.processor.partition')
|
|
@patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
|
|
async def test_images_stored_not_emitted(
|
|
self, mock_partition, mock_producer, mock_consumer
|
|
):
|
|
"""Test that images are stored but not sent to text pipeline."""
|
|
config = {
|
|
'id': 'test-universal',
|
|
'taskgroup': AsyncMock(),
|
|
}
|
|
|
|
processor = Processor(**config)
|
|
|
|
mock_partition.return_value = [
|
|
make_element("NarrativeText", "Some text", page_number=1),
|
|
make_element("Image", "img ocr", page_number=1,
|
|
image_base64="aW1hZ2VkYXRh"),
|
|
]
|
|
|
|
content = b"fake pdf"
|
|
mock_metadata = Metadata(id="test-doc",
|
|
collection="default")
|
|
mock_document = Document(
|
|
metadata=mock_metadata,
|
|
data=base64.b64encode(content).decode('utf-8'),
|
|
)
|
|
mock_msg = MagicMock()
|
|
mock_msg.value.return_value = mock_document
|
|
|
|
mock_output_flow = AsyncMock()
|
|
mock_triples_flow = AsyncMock()
|
|
mock_flow = MagicMock(side_effect=lambda name: {
|
|
"output": mock_output_flow,
|
|
"triples": mock_triples_flow,
|
|
}.get(name))
|
|
|
|
processor.librarian.save_child_document = AsyncMock(return_value="mock-id")
|
|
|
|
with patch('trustgraph.decoding.universal.processor.magic') as mock_magic:
|
|
mock_magic.from_buffer.return_value = "application/pdf"
|
|
await processor.on_message(mock_msg, None, mock_flow)
|
|
|
|
# Only 1 TextDocument output (the page text, not the image)
|
|
assert mock_output_flow.send.call_count == 1
|
|
|
|
# But 2 triples outputs (page provenance + image provenance)
|
|
assert mock_triples_flow.send.call_count == 2
|
|
|
|
# save_child_document called twice (page + image)
|
|
assert processor.librarian.save_child_document.call_count == 2
|
|
|
|
@patch('trustgraph.base.flow_processor.FlowProcessor.add_args')
|
|
def test_add_args(self, mock_parent_add_args):
|
|
"""Test add_args registers all expected arguments."""
|
|
mock_parser = MagicMock()
|
|
|
|
Processor.add_args(mock_parser)
|
|
|
|
mock_parent_add_args.assert_called_once_with(mock_parser)
|
|
|
|
# Check key arguments are registered
|
|
arg_names = [
|
|
c[0] for c in mock_parser.add_argument.call_args_list
|
|
]
|
|
assert ('--strategy',) in arg_names
|
|
assert ('--languages',) in arg_names
|
|
assert ('--section-strategy',) in arg_names
|
|
assert ('--section-element-count',) in arg_names
|
|
assert ('--section-max-size',) in arg_names
|
|
assert ('--section-within-pages',) in arg_names
|
|
|
|
@patch('trustgraph.decoding.universal.processor.Processor.launch')
|
|
def test_run(self, mock_launch):
|
|
"""Test run function."""
|
|
from trustgraph.decoding.universal.processor import run
|
|
run()
|
|
|
|
mock_launch.assert_called_once()
|
|
args = mock_launch.call_args[0]
|
|
assert args[0] == "document-decoder"
|
|
assert "Universal document decoder" in args[1]
|
|
|
|
|
|
if __name__ == '__main__':
|
|
pytest.main([__file__])
|