Structure data mvp (#452)

* Structured data tech spec

* Architecture principles

* New schemas

* Updated schemas and specs

* Object extractor

* Add .coveragerc

* New tests

* Cassandra object storage

* Trying to get object extraction working, issues exist
cybermaggedon 2025-08-07 20:47:20 +01:00 committed by GitHub
parent 5de56c5dbc
commit 83f0c1e7f3
46 changed files with 5313 additions and 1629 deletions


@@ -8,7 +8,6 @@ Following the TEST_STRATEGY.md approach for integration testing.
 import pytest
 from unittest.mock import AsyncMock, MagicMock
 from testcontainers.compose import DockerCompose
-from trustgraph.retrieval.document_rag.document_rag import DocumentRag


@@ -0,0 +1,540 @@
"""
Integration tests for Object Extraction Service
These tests verify the end-to-end functionality of the object extraction service,
testing configuration management, text-to-object transformation, and service coordination.
Following the TEST_STRATEGY.md approach for integration testing.
"""
import pytest
import json
import asyncio
from unittest.mock import AsyncMock, MagicMock, patch
from trustgraph.extract.kg.objects.processor import Processor
from trustgraph.schema import (
Chunk, ExtractedObject, Metadata, RowSchema, Field,
PromptRequest, PromptResponse
)
@pytest.mark.integration
class TestObjectExtractionServiceIntegration:
"""Integration tests for Object Extraction Service"""
@pytest.fixture
def integration_config(self):
"""Integration test configuration with multiple schemas"""
customer_schema = {
"name": "customer_records",
"description": "Customer information schema",
"fields": [
{
"name": "customer_id",
"type": "string",
"primary_key": True,
"required": True,
"indexed": True,
"description": "Unique customer identifier"
},
{
"name": "name",
"type": "string",
"required": True,
"description": "Customer full name"
},
{
"name": "email",
"type": "string",
"required": True,
"indexed": True,
"description": "Customer email address"
},
{
"name": "phone",
"type": "string",
"required": False,
"description": "Customer phone number"
}
]
}
product_schema = {
"name": "product_catalog",
"description": "Product catalog schema",
"fields": [
{
"name": "product_id",
"type": "string",
"primary_key": True,
"required": True,
"indexed": True,
"description": "Unique product identifier"
},
{
"name": "name",
"type": "string",
"required": True,
"description": "Product name"
},
{
"name": "price",
"type": "double",
"required": True,
"description": "Product price"
},
{
"name": "category",
"type": "string",
"required": False,
"enum": ["electronics", "clothing", "books", "home"],
"description": "Product category"
}
]
}
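# Each schema reaches the processor as a JSON string keyed by schema name
# under its config_key ("schema"); on_schema_config parses these into
# RowSchema objects.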
return {
"schema": {
"customer_records": json.dumps(customer_schema),
"product_catalog": json.dumps(product_schema)
}
}
@pytest.fixture
def mock_integrated_flow(self):
"""Mock integrated flow context with realistic prompt responses"""
context = MagicMock()
# Mock prompt client with realistic responses
prompt_client = AsyncMock()
def mock_extract_objects(schema, text):
"""Mock extract_objects with schema-aware responses"""
# Schema is now a dict (converted by row_schema_translator)
schema_name = schema.get("name") if isinstance(schema, dict) else schema.name
if schema_name == "customer_records":
if "john" in text.lower():
return [
{
"customer_id": "CUST001",
"name": "John Smith",
"email": "john.smith@email.com",
"phone": "555-0123"
}
]
elif "jane" in text.lower():
return [
{
"customer_id": "CUST002",
"name": "Jane Doe",
"email": "jane.doe@email.com",
"phone": ""
}
]
else:
return []
elif schema_name == "product_catalog":
if "laptop" in text.lower():
return [
{
"product_id": "PROD001",
"name": "Gaming Laptop",
"price": "1299.99",
"category": "electronics"
}
]
elif "book" in text.lower():
return [
{
"product_id": "PROD002",
"name": "Python Programming Guide",
"price": "49.99",
"category": "books"
}
]
else:
return []
return []
prompt_client.extract_objects.side_effect = mock_extract_objects
# Mock output producer
output_producer = AsyncMock()
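# The flow context is itself callable: processor code calls
# flow("prompt-request") / flow("output") to obtain service clients,
# so route by service name here.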
def context_router(service_name):
if service_name == "prompt-request":
return prompt_client
elif service_name == "output":
return output_producer
else:
return AsyncMock()
context.side_effect = context_router
return context
@pytest.mark.asyncio
async def test_multi_schema_configuration_integration(self, integration_config):
"""Test integration with multiple schema configurations"""
# Arrange - Create mock processor with actual methods
processor = MagicMock()
processor.schemas = {}
processor.config_key = "schema"
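# Bind the real Processor methods to the mock via the descriptor
# protocol (__get__), so the genuine logic runs against mocked state.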
processor.on_schema_config = Processor.on_schema_config.__get__(processor, Processor)
# Act
await processor.on_schema_config(integration_config, version=1)
# Assert
assert len(processor.schemas) == 2
assert "customer_records" in processor.schemas
assert "product_catalog" in processor.schemas
# Verify customer schema
customer_schema = processor.schemas["customer_records"]
assert customer_schema.name == "customer_records"
assert len(customer_schema.fields) == 4
# Verify product schema
product_schema = processor.schemas["product_catalog"]
assert product_schema.name == "product_catalog"
assert len(product_schema.fields) == 4
# Check enum field in product schema
category_field = next((f for f in product_schema.fields if f.name == "category"), None)
assert category_field is not None
assert len(category_field.enum_values) == 4
assert "electronics" in category_field.enum_values
@pytest.mark.asyncio
async def test_full_service_integration_customer_extraction(self, integration_config, mock_integrated_flow):
"""Test full service integration for customer data extraction"""
# Arrange - Create mock processor with actual methods
processor = MagicMock()
processor.schemas = {}
processor.config_key = "schema"
processor.flow = mock_integrated_flow
processor.on_schema_config = Processor.on_schema_config.__get__(processor, Processor)
processor.on_chunk = Processor.on_chunk.__get__(processor, Processor)
processor.extract_objects_for_schema = Processor.extract_objects_for_schema.__get__(processor, Processor)
# Import and bind the convert_values_to_strings function
from trustgraph.extract.kg.objects.processor import convert_values_to_strings
processor.convert_values_to_strings = convert_values_to_strings
# Load configuration
await processor.on_schema_config(integration_config, version=1)
# Create realistic customer data chunk
metadata = Metadata(
id="customer-doc-001",
user="integration_test",
collection="test_documents",
metadata=[]
)
chunk_text = """
Customer Registration Form
Name: John Smith
Email: john.smith@email.com
Phone: 555-0123
Customer ID: CUST001
Registration completed successfully.
"""
chunk = Chunk(metadata=metadata, chunk=chunk_text.encode('utf-8'))
# Mock message
mock_msg = MagicMock()
mock_msg.value.return_value = chunk
# Act
await processor.on_chunk(mock_msg, None, mock_integrated_flow)
# Assert
output_producer = mock_integrated_flow("output")
# Should have calls for both schemas (even if one returns empty)
assert output_producer.send.call_count >= 1
# Find customer extraction
customer_calls = []
for call in output_producer.send.call_args_list:
extracted_obj = call[0][0]
if extracted_obj.schema_name == "customer_records":
customer_calls.append(extracted_obj)
assert len(customer_calls) == 1
customer_obj = customer_calls[0]
assert customer_obj.values["customer_id"] == "CUST001"
assert customer_obj.values["name"] == "John Smith"
assert customer_obj.values["email"] == "john.smith@email.com"
assert customer_obj.confidence > 0.5
@pytest.mark.asyncio
async def test_full_service_integration_product_extraction(self, integration_config, mock_integrated_flow):
"""Test full service integration for product data extraction"""
# Arrange - Create mock processor with actual methods
processor = MagicMock()
processor.schemas = {}
processor.config_key = "schema"
processor.flow = mock_integrated_flow
processor.on_schema_config = Processor.on_schema_config.__get__(processor, Processor)
processor.on_chunk = Processor.on_chunk.__get__(processor, Processor)
processor.extract_objects_for_schema = Processor.extract_objects_for_schema.__get__(processor, Processor)
# Import and bind the convert_values_to_strings function
from trustgraph.extract.kg.objects.processor import convert_values_to_strings
processor.convert_values_to_strings = convert_values_to_strings
# Load configuration
await processor.on_schema_config(integration_config, version=1)
# Create realistic product data chunk
metadata = Metadata(
id="product-doc-001",
user="integration_test",
collection="test_documents",
metadata=[]
)
chunk_text = """
Product Specification Sheet
Product Name: Gaming Laptop
Product ID: PROD001
Price: $1,299.99
Category: Electronics
High-performance gaming laptop with latest specifications.
"""
chunk = Chunk(metadata=metadata, chunk=chunk_text.encode('utf-8'))
# Mock message
mock_msg = MagicMock()
mock_msg.value.return_value = chunk
# Act
await processor.on_chunk(mock_msg, None, mock_integrated_flow)
# Assert
output_producer = mock_integrated_flow("output")
# Find product extraction
product_calls = []
for call in output_producer.send.call_args_list:
extracted_obj = call[0][0]
if extracted_obj.schema_name == "product_catalog":
product_calls.append(extracted_obj)
assert len(product_calls) == 1
product_obj = product_calls[0]
assert product_obj.values["product_id"] == "PROD001"
assert product_obj.values["name"] == "Gaming Laptop"
assert product_obj.values["price"] == "1299.99"
assert product_obj.values["category"] == "electronics"
@pytest.mark.asyncio
async def test_concurrent_extraction_integration(self, integration_config, mock_integrated_flow):
"""Test concurrent processing of multiple chunks"""
# Arrange - Create mock processor with actual methods
processor = MagicMock()
processor.schemas = {}
processor.config_key = "schema"
processor.flow = mock_integrated_flow
processor.on_schema_config = Processor.on_schema_config.__get__(processor, Processor)
processor.on_chunk = Processor.on_chunk.__get__(processor, Processor)
processor.extract_objects_for_schema = Processor.extract_objects_for_schema.__get__(processor, Processor)
# Import and bind the convert_values_to_strings function
from trustgraph.extract.kg.objects.processor import convert_values_to_strings
processor.convert_values_to_strings = convert_values_to_strings
# Load configuration
await processor.on_schema_config(integration_config, version=1)
# Create multiple test chunks
chunks_data = [
("customer-chunk-1", "Customer: John Smith, email: john.smith@email.com, ID: CUST001"),
("customer-chunk-2", "Customer: Jane Doe, email: jane.doe@email.com, ID: CUST002"),
("product-chunk-1", "Product: Gaming Laptop, ID: PROD001, Price: $1299.99, Category: electronics"),
("product-chunk-2", "Product: Python Programming Guide, ID: PROD002, Price: $49.99, Category: books")
]
chunks = []
for chunk_id, text in chunks_data:
metadata = Metadata(
id=chunk_id,
user="concurrent_test",
collection="test_collection",
metadata=[]
)
chunk = Chunk(metadata=metadata, chunk=text.encode('utf-8'))
chunks.append(chunk)
# Act - Process chunks concurrently
tasks = []
for chunk in chunks:
mock_msg = MagicMock()
mock_msg.value.return_value = chunk
task = processor.on_chunk(mock_msg, None, mock_integrated_flow)
tasks.append(task)
await asyncio.gather(*tasks)
# Assert
output_producer = mock_integrated_flow("output")
# Should have processed all chunks (some may produce objects, some may not)
assert output_producer.send.call_count >= 2 # At least customer and product extractions
# Verify we got both types of objects
extracted_objects = []
for call in output_producer.send.call_args_list:
extracted_objects.append(call[0][0])
customer_objects = [obj for obj in extracted_objects if obj.schema_name == "customer_records"]
product_objects = [obj for obj in extracted_objects if obj.schema_name == "product_catalog"]
assert len(customer_objects) >= 1
assert len(product_objects) >= 1
@pytest.mark.asyncio
async def test_configuration_reload_integration(self, integration_config, mock_integrated_flow):
"""Test configuration reload during service operation"""
# Arrange - Create mock processor with actual methods
processor = MagicMock()
processor.schemas = {}
processor.config_key = "schema"
processor.flow = mock_integrated_flow
processor.on_schema_config = Processor.on_schema_config.__get__(processor, Processor)
# Load initial configuration (only customer schema)
initial_config = {
"schema": {
"customer_records": integration_config["schema"]["customer_records"]
}
}
await processor.on_schema_config(initial_config, version=1)
assert len(processor.schemas) == 1
assert "customer_records" in processor.schemas
assert "product_catalog" not in processor.schemas
# Act - Reload with full configuration
await processor.on_schema_config(integration_config, version=2)
# Assert
assert len(processor.schemas) == 2
assert "customer_records" in processor.schemas
assert "product_catalog" in processor.schemas
@pytest.mark.asyncio
async def test_error_resilience_integration(self, integration_config):
"""Test service resilience to various error conditions"""
# Arrange - Create mock processor with actual methods
processor = MagicMock()
processor.schemas = {}
processor.config_key = "schema"
processor.on_schema_config = Processor.on_schema_config.__get__(processor, Processor)
processor.on_chunk = Processor.on_chunk.__get__(processor, Processor)
processor.extract_objects_for_schema = Processor.extract_objects_for_schema.__get__(processor, Processor)
# Import and bind the convert_values_to_strings function
from trustgraph.extract.kg.objects.processor import convert_values_to_strings
processor.convert_values_to_strings = convert_values_to_strings
# Mock flow with failing prompt service
failing_flow = MagicMock()
failing_prompt = AsyncMock()
failing_prompt.extract_objects.side_effect = Exception("Prompt service unavailable")
def failing_context_router(service_name):
if service_name == "prompt-request":
return failing_prompt
elif service_name == "output":
return AsyncMock()
else:
return AsyncMock()
failing_flow.side_effect = failing_context_router
processor.flow = failing_flow
# Load configuration
await processor.on_schema_config(integration_config, version=1)
# Create test chunk
metadata = Metadata(id="error-test", user="test", collection="test", metadata=[])
chunk = Chunk(metadata=metadata, chunk=b"Some text that will fail to process")
mock_msg = MagicMock()
mock_msg.value.return_value = chunk
# Act & Assert - Should not raise exception
try:
await processor.on_chunk(mock_msg, None, failing_flow)
# Should complete without throwing exception
except Exception as e:
pytest.fail(f"Service should handle errors gracefully, but raised: {e}")
@pytest.mark.asyncio
async def test_metadata_propagation_integration(self, integration_config, mock_integrated_flow):
"""Test proper metadata propagation through extraction pipeline"""
# Arrange - Create mock processor with actual methods
processor = MagicMock()
processor.schemas = {}
processor.config_key = "schema"
processor.flow = mock_integrated_flow
processor.on_schema_config = Processor.on_schema_config.__get__(processor, Processor)
processor.on_chunk = Processor.on_chunk.__get__(processor, Processor)
processor.extract_objects_for_schema = Processor.extract_objects_for_schema.__get__(processor, Processor)
# Import and bind the convert_values_to_strings function
from trustgraph.extract.kg.objects.processor import convert_values_to_strings
processor.convert_values_to_strings = convert_values_to_strings
# Load configuration
await processor.on_schema_config(integration_config, version=1)
# Create chunk with rich metadata
original_metadata = Metadata(
id="metadata-test-chunk",
user="test_user",
collection="test_collection",
metadata=[] # Could include source document metadata
)
chunk = Chunk(
metadata=original_metadata,
chunk=b"Customer: John Smith, ID: CUST001, email: john.smith@email.com"
)
mock_msg = MagicMock()
mock_msg.value.return_value = chunk
# Act
await processor.on_chunk(mock_msg, None, mock_integrated_flow)
# Assert
output_producer = mock_integrated_flow("output")
# Find extracted object
extracted_obj = None
for call in output_producer.send.call_args_list:
obj = call[0][0]
if obj.schema_name == "customer_records":
extracted_obj = obj
break
assert extracted_obj is not None
# Verify metadata propagation
assert extracted_obj.metadata.user == "test_user"
assert extracted_obj.metadata.collection == "test_collection"
assert "metadata-test-chunk" in extracted_obj.metadata.id # Should include source reference


@@ -0,0 +1,384 @@
"""
Integration tests for Cassandra Object Storage
These tests verify the end-to-end functionality of storing ExtractedObjects
in Cassandra, including table creation, data insertion, and error handling.
"""
import pytest
from unittest.mock import MagicMock, AsyncMock, patch
import json
import uuid
from trustgraph.storage.objects.cassandra.write import Processor
from trustgraph.schema import ExtractedObject, Metadata, RowSchema, Field
@pytest.mark.integration
class TestObjectsCassandraIntegration:
"""Integration tests for Cassandra object storage"""
@pytest.fixture
def mock_cassandra_session(self):
"""Mock Cassandra session for integration tests"""
session = MagicMock()
session.execute = MagicMock()
return session
@pytest.fixture
def mock_cassandra_cluster(self, mock_cassandra_session):
"""Mock Cassandra cluster"""
cluster = MagicMock()
cluster.connect.return_value = mock_cassandra_session
cluster.shutdown = MagicMock()
return cluster
@pytest.fixture
def processor_with_mocks(self, mock_cassandra_cluster, mock_cassandra_session):
"""Create processor with mocked Cassandra dependencies"""
processor = MagicMock()
processor.graph_host = "localhost"
processor.graph_username = None
processor.graph_password = None
processor.config_key = "schema"
processor.schemas = {}
processor.known_keyspaces = set()
processor.known_tables = {}
processor.cluster = None
processor.session = None
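# cluster/session start unset; connect_cassandra is expected to be
# invoked lazily when the first object arrives.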
# Bind actual methods
processor.connect_cassandra = Processor.connect_cassandra.__get__(processor, Processor)
processor.ensure_keyspace = Processor.ensure_keyspace.__get__(processor, Processor)
processor.ensure_table = Processor.ensure_table.__get__(processor, Processor)
processor.sanitize_name = Processor.sanitize_name.__get__(processor, Processor)
processor.sanitize_table = Processor.sanitize_table.__get__(processor, Processor)
processor.get_cassandra_type = Processor.get_cassandra_type.__get__(processor, Processor)
processor.convert_value = Processor.convert_value.__get__(processor, Processor)
processor.on_schema_config = Processor.on_schema_config.__get__(processor, Processor)
processor.on_object = Processor.on_object.__get__(processor, Processor)
return processor, mock_cassandra_cluster, mock_cassandra_session
@pytest.mark.asyncio
async def test_end_to_end_object_storage(self, processor_with_mocks):
"""Test complete flow from schema config to object storage"""
processor, mock_cluster, mock_session = processor_with_mocks
# Mock Cluster creation
with patch('trustgraph.storage.objects.cassandra.write.Cluster', return_value=mock_cluster):
# Step 1: Configure schema
config = {
"schema": {
"customer_records": json.dumps({
"name": "customer_records",
"description": "Customer information",
"fields": [
{"name": "customer_id", "type": "string", "primary_key": True},
{"name": "name", "type": "string", "required": True},
{"name": "email", "type": "string", "indexed": True},
{"name": "age", "type": "integer"}
]
})
}
}
await processor.on_schema_config(config, version=1)
assert "customer_records" in processor.schemas
# Step 2: Process an ExtractedObject
test_obj = ExtractedObject(
metadata=Metadata(
id="doc-001",
user="test_user",
collection="import_2024",
metadata=[]
),
schema_name="customer_records",
values={
"customer_id": "CUST001",
"name": "John Doe",
"email": "john@example.com",
"age": "30"
},
confidence=0.95,
source_span="Customer: John Doe..."
)
msg = MagicMock()
msg.value.return_value = test_obj
await processor.on_object(msg, None, None)
# Verify Cassandra interactions
assert mock_cluster.connect.called
# Verify keyspace creation
keyspace_calls = [call for call in mock_session.execute.call_args_list
if "CREATE KEYSPACE" in str(call)]
assert len(keyspace_calls) == 1
assert "test_user" in str(keyspace_calls[0])
# Verify table creation
table_calls = [call for call in mock_session.execute.call_args_list
if "CREATE TABLE" in str(call)]
assert len(table_calls) == 1
assert "o_customer_records" in str(table_calls[0]) # Table gets o_ prefix
assert "collection text" in str(table_calls[0])
assert "PRIMARY KEY ((collection, customer_id))" in str(table_calls[0])
# Verify index creation
index_calls = [call for call in mock_session.execute.call_args_list
if "CREATE INDEX" in str(call)]
assert len(index_calls) == 1
assert "email" in str(index_calls[0])
# Verify data insertion
insert_calls = [call for call in mock_session.execute.call_args_list
if "INSERT INTO" in str(call)]
assert len(insert_calls) == 1
insert_call = insert_calls[0]
assert "test_user.o_customer_records" in str(insert_call) # Table gets o_ prefix
# Check inserted values
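# convert_value is expected to coerce "30" to int 30 for the
# integer-typed age column; string-typed values pass through as-is.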
values = insert_call[0][1]
assert "import_2024" in values # collection
assert "CUST001" in values # customer_id
assert "John Doe" in values # name
assert "john@example.com" in values # email
assert 30 in values # age (converted to int)
@pytest.mark.asyncio
async def test_multi_schema_handling(self, processor_with_mocks):
"""Test handling multiple schemas and objects"""
processor, mock_cluster, mock_session = processor_with_mocks
with patch('trustgraph.storage.objects.cassandra.write.Cluster', return_value=mock_cluster):
# Configure multiple schemas
config = {
"schema": {
"products": json.dumps({
"name": "products",
"fields": [
{"name": "product_id", "type": "string", "primary_key": True},
{"name": "name", "type": "string"},
{"name": "price", "type": "float"}
]
}),
"orders": json.dumps({
"name": "orders",
"fields": [
{"name": "order_id", "type": "string", "primary_key": True},
{"name": "customer_id", "type": "string"},
{"name": "total", "type": "float"}
]
})
}
}
await processor.on_schema_config(config, version=1)
assert len(processor.schemas) == 2
# Process objects for different schemas
product_obj = ExtractedObject(
metadata=Metadata(id="p1", user="shop", collection="catalog", metadata=[]),
schema_name="products",
values={"product_id": "P001", "name": "Widget", "price": "19.99"},
confidence=0.9,
source_span="Product..."
)
order_obj = ExtractedObject(
metadata=Metadata(id="o1", user="shop", collection="sales", metadata=[]),
schema_name="orders",
values={"order_id": "O001", "customer_id": "C001", "total": "59.97"},
confidence=0.85,
source_span="Order..."
)
# Process both objects
for obj in [product_obj, order_obj]:
msg = MagicMock()
msg.value.return_value = obj
await processor.on_object(msg, None, None)
# Verify separate tables were created
table_calls = [call for call in mock_session.execute.call_args_list
if "CREATE TABLE" in str(call)]
assert len(table_calls) == 2
assert any("o_products" in str(call) for call in table_calls) # Tables get o_ prefix
assert any("o_orders" in str(call) for call in table_calls) # Tables get o_ prefix
@pytest.mark.asyncio
async def test_missing_required_fields(self, processor_with_mocks):
"""Test handling of objects with missing required fields"""
processor, mock_cluster, mock_session = processor_with_mocks
with patch('trustgraph.storage.objects.cassandra.write.Cluster', return_value=mock_cluster):
# Configure schema with required field
processor.schemas["test_schema"] = RowSchema(
name="test_schema",
description="Test",
fields=[
Field(name="id", type="string", size=50, primary=True, required=True),
Field(name="required_field", type="string", size=100, required=True)
]
)
# Create object missing required field
test_obj = ExtractedObject(
metadata=Metadata(id="t1", user="test", collection="test", metadata=[]),
schema_name="test_schema",
values={"id": "123"}, # missing required_field
confidence=0.8,
source_span="Test"
)
msg = MagicMock()
msg.value.return_value = test_obj
# Should still process (Cassandra doesn't enforce NOT NULL)
await processor.on_object(msg, None, None)
# Verify insert was attempted
insert_calls = [call for call in mock_session.execute.call_args_list
if "INSERT INTO" in str(call)]
assert len(insert_calls) == 1
@pytest.mark.asyncio
async def test_schema_without_primary_key(self, processor_with_mocks):
"""Test handling schemas without defined primary keys"""
processor, mock_cluster, mock_session = processor_with_mocks
with patch('trustgraph.storage.objects.cassandra.write.Cluster', return_value=mock_cluster):
# Configure schema without primary key
processor.schemas["events"] = RowSchema(
name="events",
description="Event log",
fields=[
Field(name="event_type", type="string", size=50),
Field(name="timestamp", type="timestamp", size=0)
]
)
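# With no primary_key declared, the writer should fall back to a
# generated synthetic_id uuid column, asserted below.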
# Process object
test_obj = ExtractedObject(
metadata=Metadata(id="e1", user="logger", collection="app_events", metadata=[]),
schema_name="events",
values={"event_type": "login", "timestamp": "2024-01-01T10:00:00Z"},
confidence=1.0,
source_span="Event"
)
msg = MagicMock()
msg.value.return_value = test_obj
await processor.on_object(msg, None, None)
# Verify synthetic_id was added
table_calls = [call for call in mock_session.execute.call_args_list
if "CREATE TABLE" in str(call)]
assert len(table_calls) == 1
assert "synthetic_id uuid" in str(table_calls[0])
# Verify insert includes UUID
insert_calls = [call for call in mock_session.execute.call_args_list
if "INSERT INTO" in str(call)]
assert len(insert_calls) == 1
values = insert_calls[0][0][1]
# Check that a UUID was generated (will be in values list)
uuid_found = any(isinstance(v, uuid.UUID) for v in values)
assert uuid_found
@pytest.mark.asyncio
async def test_authentication_handling(self, processor_with_mocks):
"""Test Cassandra authentication"""
processor, mock_cluster, mock_session = processor_with_mocks
processor.graph_username = "cassandra_user"
processor.graph_password = "cassandra_pass"
with patch('trustgraph.storage.objects.cassandra.write.Cluster') as mock_cluster_class:
with patch('trustgraph.storage.objects.cassandra.write.PlainTextAuthProvider') as mock_auth:
mock_cluster_class.return_value = mock_cluster
# Trigger connection
processor.connect_cassandra()
# Verify authentication was configured
mock_auth.assert_called_once_with(
username="cassandra_user",
password="cassandra_pass"
)
mock_cluster_class.assert_called_once()
call_kwargs = mock_cluster_class.call_args[1]
assert 'auth_provider' in call_kwargs
@pytest.mark.asyncio
async def test_error_handling_during_insert(self, processor_with_mocks):
"""Test error handling when insertion fails"""
processor, mock_cluster, mock_session = processor_with_mocks
with patch('trustgraph.storage.objects.cassandra.write.Cluster', return_value=mock_cluster):
processor.schemas["test"] = RowSchema(
name="test",
fields=[Field(name="id", type="string", size=50, primary=True)]
)
# Make insert fail
mock_session.execute.side_effect = [
None, # keyspace creation succeeds
None, # table creation succeeds
Exception("Connection timeout") # insert fails
]
test_obj = ExtractedObject(
metadata=Metadata(id="t1", user="test", collection="test", metadata=[]),
schema_name="test",
values={"id": "123"},
confidence=0.9,
source_span="Test"
)
msg = MagicMock()
msg.value.return_value = test_obj
# Should raise the exception
with pytest.raises(Exception, match="Connection timeout"):
await processor.on_object(msg, None, None)
@pytest.mark.asyncio
async def test_collection_partitioning(self, processor_with_mocks):
"""Test that objects are properly partitioned by collection"""
processor, mock_cluster, mock_session = processor_with_mocks
with patch('trustgraph.storage.objects.cassandra.write.Cluster', return_value=mock_cluster):
processor.schemas["data"] = RowSchema(
name="data",
fields=[Field(name="id", type="string", size=50, primary=True)]
)
# Process objects from different collections
collections = ["import_jan", "import_feb", "import_mar"]
for coll in collections:
obj = ExtractedObject(
metadata=Metadata(id=f"{coll}-1", user="analytics", collection=coll, metadata=[]),
schema_name="data",
values={"id": f"ID-{coll}"},
confidence=0.9,
source_span="Data"
)
msg = MagicMock()
msg.value.return_value = obj
await processor.on_object(msg, None, None)
# Verify all inserts include collection in values
insert_calls = [call for call in mock_session.execute.call_args_list
if "INSERT INTO" in str(call)]
assert len(insert_calls) == 3
# Check each insert has the correct collection
for i, call in enumerate(insert_calls):
values = call[0][1]
assert collections[i] in values