trustgraph/tests/integration/test_rows_cassandra_integration.py
cybermaggedon d35473f7f7
feat: workspace-based multi-tenancy, replacing user as tenancy axis (#840)
Introduces `workspace` as the isolation boundary for config, flows,
library, and knowledge data. Removes `user` as a schema-level field
throughout the code, API specs, and tests; workspace provides the
same separation more cleanly at the trusted `flow.workspace` layer
rather than through client-supplied message fields.

Design
------
- IAM tech spec (docs/tech-specs/iam.md) documents current state,
  proposed auth/access model, and migration direction.
- Data ownership model (docs/tech-specs/data-ownership-model.md)
  captures the workspace/collection/flow hierarchy.

Schema + messaging
------------------
- Drop `user` field from AgentRequest/Step, GraphRagQuery,
  DocumentRagQuery, Triples/Graph/Document/Row EmbeddingsRequest,
  Sparql/Rows/Structured QueryRequest, ToolServiceRequest.
- Keep collection/workspace routing via flow.workspace at the
  service layer.
- Translators updated to not serialise/deserialise user.

API specs
---------
- OpenAPI schemas and path examples cleaned of user fields.
- Websocket async-api messages updated.
- Removed the unused parameters/User.yaml.

Services + base
---------------
- Librarian, collection manager, knowledge, config: all operations
  scoped by workspace. Config client API takes workspace as first
  positional arg.
- `flow.workspace` set at flow start time by the infrastructure;
  no longer passed through from clients.
- Tool service drops user-personalisation passthrough.

CLI + SDK
---------
- tg-init-workspace and workspace-aware import/export.
- All tg-* commands drop user args; accept --workspace.
- Python API/SDK (flow, socket_client, async_*, explainability,
  library) drop user kwargs from every method signature.

MCP server
----------
- All tool endpoints drop user parameters; socket_manager no longer
  keyed per user.

Flow service
------------
- Closure-based topic cleanup on flow stop: only delete topics
  whose blueprint template was parameterised AND no remaining
  live flow (across all workspaces) still resolves to that topic.
  Four scopes fall out naturally from template analysis (sketched below):
    * {id} -> per-flow, deleted on stop
    * {blueprint} -> per-blueprint, kept while any flow of the
      same blueprint exists
    * {workspace} -> per-workspace, kept while any flow in the
      workspace exists
    * literal -> global, never deleted (e.g. tg.request.librarian)
  Fixes a bug where stopping a flow silently destroyed the global
  librarian exchange, wedging all library operations until manual
  restart.
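
A minimal sketch of the scope classification (the `topic_scope` helper
name is hypothetical; the actual cleanup is closure-based and also
checks whether any remaining live flow still resolves to the topic
before deleting):

    def topic_scope(template: str) -> str:
        if "{id}" in template:
            return "per-flow"        # deletable when this flow stops
        if "{blueprint}" in template:
            return "per-blueprint"   # keep while the blueprint has flows
        if "{workspace}" in template:
            return "per-workspace"   # keep while the workspace has flows
        return "global"              # literal topic, never deleted

    assert topic_scope("tg.flow.{id}.input") == "per-flow"
    assert topic_scope("tg.request.librarian") == "global"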

RabbitMQ backend
----------------
- heartbeat=60, blocked_connection_timeout=300 (see the sketch below).
  Catches silently dead connections (broker restart, orphaned
  channels, network partitions) within ~2 heartbeat windows, so the
  consumer reconnects and re-binds its queue rather than sitting
  forever on a zombie connection.
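
A sketch of the equivalent pika settings (assuming the backend uses
pika's blocking connection; the host is a placeholder):

    import pika

    params = pika.ConnectionParameters(
        host="rabbitmq",
        heartbeat=60,                    # broker-level liveness probe interval
        blocked_connection_timeout=300,  # fail a blocked connection after 5 min
    )
    connection = pika.BlockingConnection(params)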

Tests
-----
- Full test refresh: unit, integration, contract, provenance.
- Dropped user-field assertions and constructor kwargs across
  ~100 test files.
- Renamed user-collection isolation tests to workspace-collection.
2026-04-21 23:23:01 +01:00


"""
Integration tests for Cassandra Row Storage (Unified Table Implementation)
These tests verify the end-to-end functionality of storing ExtractedObjects
in the unified Cassandra rows table, including table creation, data insertion,
and error handling.
"""
import pytest
from unittest.mock import MagicMock, AsyncMock, patch
import json
from trustgraph.storage.rows.cassandra.write import Processor
from trustgraph.schema import ExtractedObject, Metadata, RowSchema, Field
class _MockFlowDefault:
"""Mock Flow with default workspace for testing."""
workspace = "default"
name = "default"
id = "test-processor"
mock_flow_default = _MockFlowDefault()
@pytest.mark.integration
class TestRowsCassandraIntegration:
    """Integration tests for Cassandra row storage with unified table"""

    @pytest.fixture(autouse=True)
    def patch_async_execute(self):
        """Route async_execute through session.execute so the mock's
        side_effect handles all CQL (DDL and DML) uniformly and every
        call lands in mock_session.execute.call_args_list."""
        async def _fake(session, query, params=None):
            session.execute(query, params)
            return []
        with patch(
            'trustgraph.storage.rows.cassandra.write.async_execute',
            new=_fake,
        ):
            yield
    @pytest.fixture
    def mock_cassandra_session(self):
        """Mock Cassandra session for integration tests"""
        session = MagicMock()
        # Track if keyspaces have been created
        created_keyspaces = set()

        # Mock the execute method to return a valid result for keyspace checks
        def execute_mock(query, *args, **kwargs):
            result = MagicMock()
            query_str = str(query)
            # Track keyspace creation
            if "CREATE KEYSPACE" in query_str:
                import re
                match = re.search(r'CREATE KEYSPACE IF NOT EXISTS (\w+)', query_str)
                if match:
                    created_keyspaces.add(match.group(1))
            # For keyspace existence checks
            if "system_schema.keyspaces" in query_str:
                if args and args[0] in created_keyspaces:
                    result.one.return_value = MagicMock()  # Exists
                else:
                    result.one.return_value = None  # Doesn't exist
            else:
                result.one.return_value = None
            return result

        session.execute = MagicMock(side_effect=execute_mock)
        return session
    @pytest.fixture
    def mock_cassandra_cluster(self, mock_cassandra_session):
        """Mock Cassandra cluster"""
        cluster = MagicMock()
        cluster.connect.return_value = mock_cassandra_session
        cluster.shutdown = MagicMock()
        return cluster
    @pytest.fixture
    def processor_with_mocks(self, mock_cassandra_cluster, mock_cassandra_session):
        """Create processor with mocked Cassandra dependencies"""
        processor = MagicMock()
        processor.cassandra_host = ["localhost"]
        processor.cassandra_username = None
        processor.cassandra_password = None
        processor.config_key = "schema"
        processor.schemas = {}
        processor.known_keyspaces = set()
        processor.tables_initialized = set()
        processor.registered_partitions = set()
        processor.cluster = None
        processor.session = None
        # Bind actual methods from the new unified table implementation
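        # `__get__(processor, Processor)` binds each real function as a
        # method on the MagicMock, so genuine implementation logic runs
        # against the mocked cluster/session state set up above.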
        processor.connect_cassandra = Processor.connect_cassandra.__get__(processor, Processor)
        processor.ensure_keyspace = Processor.ensure_keyspace.__get__(processor, Processor)
        processor.ensure_tables = Processor.ensure_tables.__get__(processor, Processor)
        processor.sanitize_name = Processor.sanitize_name.__get__(processor, Processor)
        processor.get_index_names = Processor.get_index_names.__get__(processor, Processor)
        processor.build_index_value = Processor.build_index_value.__get__(processor, Processor)
        processor.register_partitions = Processor.register_partitions.__get__(processor, Processor)
        processor.on_schema_config = Processor.on_schema_config.__get__(processor, Processor)
        processor.on_object = Processor.on_object.__get__(processor, Processor)
        processor.collection_exists = MagicMock(return_value=True)
        return processor, mock_cassandra_cluster, mock_cassandra_session
    @pytest.mark.asyncio
    async def test_end_to_end_object_storage(self, processor_with_mocks):
        """Test complete flow from schema config to object storage"""
        processor, mock_cluster, mock_session = processor_with_mocks
        with patch('trustgraph.storage.rows.cassandra.write.Cluster', return_value=mock_cluster):
            # Step 1: Configure schema
            config = {
                "schema": {
                    "customer_records": json.dumps({
                        "name": "customer_records",
                        "description": "Customer information",
                        "fields": [
                            {"name": "customer_id", "type": "string", "primary_key": True},
                            {"name": "name", "type": "string", "required": True},
                            {"name": "email", "type": "string", "indexed": True},
                            {"name": "age", "type": "integer"}
                        ]
                    })
                }
            }
            await processor.on_schema_config("default", config, version=1)
            assert "customer_records" in processor.schemas["default"]

            # Step 2: Process an ExtractedObject
            test_obj = ExtractedObject(
                metadata=Metadata(
                    id="doc-001",
                    collection="import_2024",
                ),
                schema_name="customer_records",
                values=[{
                    "customer_id": "CUST001",
                    "name": "John Doe",
                    "email": "john@example.com",
                    "age": "30"
                }],
                confidence=0.95,
                source_span="Customer: John Doe..."
            )
            msg = MagicMock()
            msg.value.return_value = test_obj
            await processor.on_object(msg, None, mock_flow_default)

            # Verify Cassandra interactions
            assert mock_cluster.connect.called

            # Verify keyspace creation
            keyspace_calls = [call for call in mock_session.execute.call_args_list
                              if "CREATE KEYSPACE" in str(call)]
            assert len(keyspace_calls) == 1
            assert "default" in str(keyspace_calls[0])

            # Verify unified table creation (rows table, not per-schema table)
            table_calls = [call for call in mock_session.execute.call_args_list
                           if "CREATE TABLE" in str(call)]
            assert len(table_calls) == 2  # rows table + row_partitions table
            assert any("rows" in str(call) for call in table_calls)
            assert any("row_partitions" in str(call) for call in table_calls)

            # Verify the rows table has correct structure
            rows_table_call = [call for call in table_calls if ".rows" in str(call)][0]
            assert "collection text" in str(rows_table_call)
            assert "schema_name text" in str(rows_table_call)
            assert "index_name text" in str(rows_table_call)
            assert "data map<text, text>" in str(rows_table_call)

            # Verify data insertion into unified table
            rows_insert_calls = [call for call in mock_session.execute.call_args_list
                                 if "INSERT INTO" in str(call) and ".rows" in str(call)
                                 and "row_partitions" not in str(call)]
            # Should have 2 data inserts: one for customer_id (primary), one for email (indexed)
            assert len(rows_insert_calls) == 2
    @pytest.mark.asyncio
    async def test_multi_schema_handling(self, processor_with_mocks):
        """Test handling multiple schemas stored in unified table"""
        processor, mock_cluster, mock_session = processor_with_mocks
        with patch('trustgraph.storage.rows.cassandra.write.Cluster', return_value=mock_cluster):
            # Configure multiple schemas
            config = {
                "schema": {
                    "products": json.dumps({
                        "name": "products",
                        "fields": [
                            {"name": "product_id", "type": "string", "primary_key": True},
                            {"name": "name", "type": "string"},
                            {"name": "price", "type": "float"}
                        ]
                    }),
                    "orders": json.dumps({
                        "name": "orders",
                        "fields": [
                            {"name": "order_id", "type": "string", "primary_key": True},
                            {"name": "customer_id", "type": "string"},
                            {"name": "total", "type": "float"}
                        ]
                    })
                }
            }
            await processor.on_schema_config("default", config, version=1)
            assert len(processor.schemas["default"]) == 2

            # Process objects for different schemas
            product_obj = ExtractedObject(
                metadata=Metadata(id="p1", collection="catalog"),
                schema_name="products",
                values=[{"product_id": "P001", "name": "Widget", "price": "19.99"}],
                confidence=0.9,
                source_span="Product..."
            )
            order_obj = ExtractedObject(
                metadata=Metadata(id="o1", collection="sales"),
                schema_name="orders",
                values=[{"order_id": "O001", "customer_id": "C001", "total": "59.97"}],
                confidence=0.85,
                source_span="Order..."
            )

            # Process both objects
            for obj in [product_obj, order_obj]:
                msg = MagicMock()
                msg.value.return_value = obj
                await processor.on_object(msg, None, mock_flow_default)

            # All data goes into the same unified rows table
            table_calls = [call for call in mock_session.execute.call_args_list
                           if "CREATE TABLE" in str(call)]
            # Should only create 2 tables: rows + row_partitions (not per-schema tables)
            assert len(table_calls) == 2

            # Verify data inserts go to unified rows table
            rows_insert_calls = [call for call in mock_session.execute.call_args_list
                                 if "INSERT INTO" in str(call) and ".rows" in str(call)
                                 and "row_partitions" not in str(call)]
            assert len(rows_insert_calls) > 0
            for call in rows_insert_calls:
                assert ".rows" in str(call)
    @pytest.mark.asyncio
    async def test_multi_index_storage(self, processor_with_mocks):
        """Test that rows are stored with multiple indexes"""
        processor, mock_cluster, mock_session = processor_with_mocks
        with patch('trustgraph.storage.rows.cassandra.write.Cluster', return_value=mock_cluster):
            # Schema with multiple indexed fields
            processor.schemas["default"] = {
                "indexed_data": RowSchema(
                    name="indexed_data",
                    fields=[
                        Field(name="id", type="string", size=50, primary=True),
                        Field(name="category", type="string", size=50, indexed=True),
                        Field(name="status", type="string", size=50, indexed=True),
                        Field(name="description", type="string", size=200)  # Not indexed
                    ]
                )
            }
            test_obj = ExtractedObject(
                metadata=Metadata(id="t1", collection="test"),
                schema_name="indexed_data",
                values=[{
                    "id": "123",
                    "category": "electronics",
                    "status": "active",
                    "description": "A product"
                }],
                confidence=0.9,
                source_span="Test"
            )
            msg = MagicMock()
            msg.value.return_value = test_obj
            await processor.on_object(msg, None, mock_flow_default)

            # Should have 3 data inserts (one per indexed field: id, category, status)
            rows_insert_calls = [call for call in mock_session.execute.call_args_list
                                 if "INSERT INTO" in str(call) and ".rows" in str(call)
                                 and "row_partitions" not in str(call)]
            assert len(rows_insert_calls) == 3

            # Verify different index names were used
            index_names = set()
            for call in rows_insert_calls:
                values = call[0][1]
                index_names.add(values[2])  # index_name is 3rd parameter
            assert index_names == {"id", "category", "status"}
    @pytest.mark.asyncio
    async def test_authentication_handling(self, processor_with_mocks):
        """Test Cassandra authentication"""
        processor, mock_cluster, mock_session = processor_with_mocks
        processor.cassandra_username = "cassandra_user"
        processor.cassandra_password = "cassandra_pass"
        with patch('trustgraph.storage.rows.cassandra.write.Cluster') as mock_cluster_class:
            with patch('trustgraph.storage.rows.cassandra.write.PlainTextAuthProvider') as mock_auth:
                mock_cluster_class.return_value = mock_cluster
                # Trigger connection
                processor.connect_cassandra()

                # Verify authentication was configured
                mock_auth.assert_called_once_with(
                    username="cassandra_user",
                    password="cassandra_pass"
                )
                mock_cluster_class.assert_called_once()
                call_kwargs = mock_cluster_class.call_args[1]
                assert 'auth_provider' in call_kwargs
    @pytest.mark.asyncio
    async def test_batch_object_processing(self, processor_with_mocks):
        """Test processing objects with batched values"""
        processor, mock_cluster, mock_session = processor_with_mocks
        with patch('trustgraph.storage.rows.cassandra.write.Cluster', return_value=mock_cluster):
            # Configure schema
            config = {
                "schema": {
                    "batch_customers": json.dumps({
                        "name": "batch_customers",
                        "description": "Customer batch data",
                        "fields": [
                            {"name": "customer_id", "type": "string", "primary_key": True},
                            {"name": "name", "type": "string", "required": True},
                            {"name": "email", "type": "string", "indexed": True}
                        ]
                    })
                }
            }
            await processor.on_schema_config("default", config, version=1)

            # Process batch object with multiple values
            batch_obj = ExtractedObject(
                metadata=Metadata(
                    id="batch-001",
                    collection="batch_import",
                ),
                schema_name="batch_customers",
                values=[
                    {
                        "customer_id": "CUST001",
                        "name": "John Doe",
                        "email": "john@example.com"
                    },
                    {
                        "customer_id": "CUST002",
                        "name": "Jane Smith",
                        "email": "jane@example.com"
                    },
                    {
                        "customer_id": "CUST003",
                        "name": "Bob Johnson",
                        "email": "bob@example.com"
                    }
                ],
                confidence=0.92,
                source_span="Multiple customers extracted from document"
            )
            msg = MagicMock()
            msg.value.return_value = batch_obj
            await processor.on_object(msg, None, mock_flow_default)

            # Verify unified table creation
            table_calls = [call for call in mock_session.execute.call_args_list
                           if "CREATE TABLE" in str(call)]
            assert len(table_calls) == 2  # rows + row_partitions

            # Each row in batch gets 2 data inserts (customer_id primary + email indexed)
            # 3 rows * 2 indexes = 6 data inserts
            rows_insert_calls = [call for call in mock_session.execute.call_args_list
                                 if "INSERT INTO" in str(call) and ".rows" in str(call)
                                 and "row_partitions" not in str(call)]
            assert len(rows_insert_calls) == 6
    @pytest.mark.asyncio
    async def test_empty_batch_processing(self, processor_with_mocks):
        """Test processing objects with empty values array"""
        processor, mock_cluster, mock_session = processor_with_mocks
        with patch('trustgraph.storage.rows.cassandra.write.Cluster', return_value=mock_cluster):
            processor.schemas["default"] = {
                "empty_test": RowSchema(
                    name="empty_test",
                    fields=[Field(name="id", type="string", size=50, primary=True)]
                )
            }
            # Process empty batch object
            empty_obj = ExtractedObject(
                metadata=Metadata(id="empty-1", collection="empty"),
                schema_name="empty_test",
                values=[],  # Empty batch
                confidence=1.0,
                source_span="No objects found"
            )
            msg = MagicMock()
            msg.value.return_value = empty_obj
            await processor.on_object(msg, None, mock_flow_default)

            # Should not create any data insert statements for empty batch
            # (partition registration may still happen)
            rows_insert_calls = [call for call in mock_session.execute.call_args_list
                                 if "INSERT INTO" in str(call) and ".rows" in str(call)
                                 and "row_partitions" not in str(call)]
            assert len(rows_insert_calls) == 0
    @pytest.mark.asyncio
    async def test_data_stored_as_map(self, processor_with_mocks):
        """Test that data is stored as map<text, text>"""
        processor, mock_cluster, mock_session = processor_with_mocks
        with patch('trustgraph.storage.rows.cassandra.write.Cluster', return_value=mock_cluster):
            processor.schemas["default"] = {
                "map_test": RowSchema(
                    name="map_test",
                    fields=[
                        Field(name="id", type="string", size=50, primary=True),
                        Field(name="name", type="string", size=100),
                        Field(name="count", type="integer", size=0)
                    ]
                )
            }
            test_obj = ExtractedObject(
                metadata=Metadata(id="t1", collection="test"),
                schema_name="map_test",
                values=[{"id": "123", "name": "Test Item", "count": "42"}],
                confidence=0.9,
                source_span="Test"
            )
            msg = MagicMock()
            msg.value.return_value = test_obj
            await processor.on_object(msg, None, mock_flow_default)

            # Verify insert uses map for data
            rows_insert_calls = [call for call in mock_session.execute.call_args_list
                                 if "INSERT INTO" in str(call) and ".rows" in str(call)
                                 and "row_partitions" not in str(call)]
            assert len(rows_insert_calls) >= 1

            # Check that data is passed as a dict (will be map in Cassandra)
            insert_call = rows_insert_calls[0]
            values = insert_call[0][1]
            # Values are: (collection, schema_name, index_name, index_value, data, source)
            # values[4] should be the data map
            data_map = values[4]
            assert isinstance(data_map, dict)
            assert data_map["id"] == "123"
            assert data_map["name"] == "Test Item"
            assert data_map["count"] == "42"
    @pytest.mark.asyncio
    async def test_partition_registration(self, processor_with_mocks):
        """Test that partitions are registered for efficient querying"""
        processor, mock_cluster, mock_session = processor_with_mocks
        with patch('trustgraph.storage.rows.cassandra.write.Cluster', return_value=mock_cluster):
            processor.schemas["default"] = {
                "partition_test": RowSchema(
                    name="partition_test",
                    fields=[
                        Field(name="id", type="string", size=50, primary=True),
                        Field(name="category", type="string", size=50, indexed=True)
                    ]
                )
            }
            test_obj = ExtractedObject(
                metadata=Metadata(id="t1", collection="my_collection"),
                schema_name="partition_test",
                values=[{"id": "123", "category": "test"}],
                confidence=0.9,
                source_span="Test"
            )
            msg = MagicMock()
            msg.value.return_value = test_obj
            await processor.on_object(msg, None, mock_flow_default)

            # Verify partition registration
            partition_inserts = [call for call in mock_session.execute.call_args_list
                                 if "INSERT INTO" in str(call) and "row_partitions" in str(call)]
            # Should register partitions for each index (id, category)
            assert len(partition_inserts) == 2

            # Verify cache was updated
            assert ("my_collection", "partition_test") in processor.registered_partitions