Structure data mvp (#452)

* Structured data tech spec

* Architecture principles

* New schemas

* Updated schemas and specs

* Object extractor

* Add .coveragerc

* New tests

* Cassandra object storage

* Trying to get object extraction working, issues exist
cybermaggedon 2025-08-07 20:47:20 +01:00 committed by GitHub
parent 5de56c5dbc
commit 83f0c1e7f3
46 changed files with 5313 additions and 1629 deletions


@@ -8,7 +8,6 @@ Following the TEST_STRATEGY.md approach for integration testing.
 import pytest
 from unittest.mock import AsyncMock, MagicMock
 from testcontainers.compose import DockerCompose
-from trustgraph.retrieval.document_rag.document_rag import DocumentRag


@@ -0,0 +1,540 @@
"""
Integration tests for Object Extraction Service
These tests verify the end-to-end functionality of the object extraction service,
testing configuration management, text-to-object transformation, and service coordination.
Following the TEST_STRATEGY.md approach for integration testing.
"""
import pytest
import json
import asyncio
from unittest.mock import AsyncMock, MagicMock, patch
from trustgraph.extract.kg.objects.processor import Processor
from trustgraph.schema import (
Chunk, ExtractedObject, Metadata, RowSchema, Field,
PromptRequest, PromptResponse
)
@pytest.mark.integration
class TestObjectExtractionServiceIntegration:
"""Integration tests for Object Extraction Service"""
@pytest.fixture
def integration_config(self):
"""Integration test configuration with multiple schemas"""
customer_schema = {
"name": "customer_records",
"description": "Customer information schema",
"fields": [
{
"name": "customer_id",
"type": "string",
"primary_key": True,
"required": True,
"indexed": True,
"description": "Unique customer identifier"
},
{
"name": "name",
"type": "string",
"required": True,
"description": "Customer full name"
},
{
"name": "email",
"type": "string",
"required": True,
"indexed": True,
"description": "Customer email address"
},
{
"name": "phone",
"type": "string",
"required": False,
"description": "Customer phone number"
}
]
}
product_schema = {
"name": "product_catalog",
"description": "Product catalog schema",
"fields": [
{
"name": "product_id",
"type": "string",
"primary_key": True,
"required": True,
"indexed": True,
"description": "Unique product identifier"
},
{
"name": "name",
"type": "string",
"required": True,
"description": "Product name"
},
{
"name": "price",
"type": "double",
"required": True,
"description": "Product price"
},
{
"name": "category",
"type": "string",
"required": False,
"enum": ["electronics", "clothing", "books", "home"],
"description": "Product category"
}
]
}
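# Each schema reaches the processor as a JSON string keyed by schema name
# under its config_key ("schema"); on_schema_config parses these into
# RowSchema objects.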
return {
"schema": {
"customer_records": json.dumps(customer_schema),
"product_catalog": json.dumps(product_schema)
}
}
@pytest.fixture
def mock_integrated_flow(self):
"""Mock integrated flow context with realistic prompt responses"""
context = MagicMock()
# Mock prompt client with realistic responses
prompt_client = AsyncMock()
def mock_extract_objects(schema, text):
"""Mock extract_objects with schema-aware responses"""
# Schema is now a dict (converted by row_schema_translator)
schema_name = schema.get("name") if isinstance(schema, dict) else schema.name
if schema_name == "customer_records":
if "john" in text.lower():
return [
{
"customer_id": "CUST001",
"name": "John Smith",
"email": "john.smith@email.com",
"phone": "555-0123"
}
]
elif "jane" in text.lower():
return [
{
"customer_id": "CUST002",
"name": "Jane Doe",
"email": "jane.doe@email.com",
"phone": ""
}
]
else:
return []
elif schema_name == "product_catalog":
if "laptop" in text.lower():
return [
{
"product_id": "PROD001",
"name": "Gaming Laptop",
"price": "1299.99",
"category": "electronics"
}
]
elif "book" in text.lower():
return [
{
"product_id": "PROD002",
"name": "Python Programming Guide",
"price": "49.99",
"category": "books"
}
]
else:
return []
return []
prompt_client.extract_objects.side_effect = mock_extract_objects
# Mock output producer
output_producer = AsyncMock()
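# The flow context is itself callable: processor code calls
# flow("prompt-request") / flow("output") to obtain service clients,
# so route by service name here.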
def context_router(service_name):
if service_name == "prompt-request":
return prompt_client
elif service_name == "output":
return output_producer
else:
return AsyncMock()
context.side_effect = context_router
return context
@pytest.mark.asyncio
async def test_multi_schema_configuration_integration(self, integration_config):
"""Test integration with multiple schema configurations"""
# Arrange - Create mock processor with actual methods
processor = MagicMock()
processor.schemas = {}
processor.config_key = "schema"
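# Bind the real Processor methods to the mock via the descriptor
# protocol (__get__), so the genuine logic runs against mocked state.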
processor.on_schema_config = Processor.on_schema_config.__get__(processor, Processor)
# Act
await processor.on_schema_config(integration_config, version=1)
# Assert
assert len(processor.schemas) == 2
assert "customer_records" in processor.schemas
assert "product_catalog" in processor.schemas
# Verify customer schema
customer_schema = processor.schemas["customer_records"]
assert customer_schema.name == "customer_records"
assert len(customer_schema.fields) == 4
# Verify product schema
product_schema = processor.schemas["product_catalog"]
assert product_schema.name == "product_catalog"
assert len(product_schema.fields) == 4
# Check enum field in product schema
category_field = next((f for f in product_schema.fields if f.name == "category"), None)
assert category_field is not None
assert len(category_field.enum_values) == 4
assert "electronics" in category_field.enum_values
@pytest.mark.asyncio
async def test_full_service_integration_customer_extraction(self, integration_config, mock_integrated_flow):
"""Test full service integration for customer data extraction"""
# Arrange - Create mock processor with actual methods
processor = MagicMock()
processor.schemas = {}
processor.config_key = "schema"
processor.flow = mock_integrated_flow
processor.on_schema_config = Processor.on_schema_config.__get__(processor, Processor)
processor.on_chunk = Processor.on_chunk.__get__(processor, Processor)
processor.extract_objects_for_schema = Processor.extract_objects_for_schema.__get__(processor, Processor)
# Import and bind the convert_values_to_strings function
from trustgraph.extract.kg.objects.processor import convert_values_to_strings
processor.convert_values_to_strings = convert_values_to_strings
# Load configuration
await processor.on_schema_config(integration_config, version=1)
# Create realistic customer data chunk
metadata = Metadata(
id="customer-doc-001",
user="integration_test",
collection="test_documents",
metadata=[]
)
chunk_text = """
Customer Registration Form
Name: John Smith
Email: john.smith@email.com
Phone: 555-0123
Customer ID: CUST001
Registration completed successfully.
"""
chunk = Chunk(metadata=metadata, chunk=chunk_text.encode('utf-8'))
# Mock message
mock_msg = MagicMock()
mock_msg.value.return_value = chunk
# Act
await processor.on_chunk(mock_msg, None, mock_integrated_flow)
# Assert
output_producer = mock_integrated_flow("output")
# Should have calls for both schemas (even if one returns empty)
assert output_producer.send.call_count >= 1
# Find customer extraction
customer_calls = []
for call in output_producer.send.call_args_list:
extracted_obj = call[0][0]
if extracted_obj.schema_name == "customer_records":
customer_calls.append(extracted_obj)
assert len(customer_calls) == 1
customer_obj = customer_calls[0]
assert customer_obj.values["customer_id"] == "CUST001"
assert customer_obj.values["name"] == "John Smith"
assert customer_obj.values["email"] == "john.smith@email.com"
assert customer_obj.confidence > 0.5
@pytest.mark.asyncio
async def test_full_service_integration_product_extraction(self, integration_config, mock_integrated_flow):
"""Test full service integration for product data extraction"""
# Arrange - Create mock processor with actual methods
processor = MagicMock()
processor.schemas = {}
processor.config_key = "schema"
processor.flow = mock_integrated_flow
processor.on_schema_config = Processor.on_schema_config.__get__(processor, Processor)
processor.on_chunk = Processor.on_chunk.__get__(processor, Processor)
processor.extract_objects_for_schema = Processor.extract_objects_for_schema.__get__(processor, Processor)
# Import and bind the convert_values_to_strings function
from trustgraph.extract.kg.objects.processor import convert_values_to_strings
processor.convert_values_to_strings = convert_values_to_strings
# Load configuration
await processor.on_schema_config(integration_config, version=1)
# Create realistic product data chunk
metadata = Metadata(
id="product-doc-001",
user="integration_test",
collection="test_documents",
metadata=[]
)
chunk_text = """
Product Specification Sheet
Product Name: Gaming Laptop
Product ID: PROD001
Price: $1,299.99
Category: Electronics
High-performance gaming laptop with latest specifications.
"""
chunk = Chunk(metadata=metadata, chunk=chunk_text.encode('utf-8'))
# Mock message
mock_msg = MagicMock()
mock_msg.value.return_value = chunk
# Act
await processor.on_chunk(mock_msg, None, mock_integrated_flow)
# Assert
output_producer = mock_integrated_flow("output")
# Find product extraction
product_calls = []
for call in output_producer.send.call_args_list:
extracted_obj = call[0][0]
if extracted_obj.schema_name == "product_catalog":
product_calls.append(extracted_obj)
assert len(product_calls) == 1
product_obj = product_calls[0]
assert product_obj.values["product_id"] == "PROD001"
assert product_obj.values["name"] == "Gaming Laptop"
assert product_obj.values["price"] == "1299.99"
assert product_obj.values["category"] == "electronics"
@pytest.mark.asyncio
async def test_concurrent_extraction_integration(self, integration_config, mock_integrated_flow):
"""Test concurrent processing of multiple chunks"""
# Arrange - Create mock processor with actual methods
processor = MagicMock()
processor.schemas = {}
processor.config_key = "schema"
processor.flow = mock_integrated_flow
processor.on_schema_config = Processor.on_schema_config.__get__(processor, Processor)
processor.on_chunk = Processor.on_chunk.__get__(processor, Processor)
processor.extract_objects_for_schema = Processor.extract_objects_for_schema.__get__(processor, Processor)
# Import and bind the convert_values_to_strings function
from trustgraph.extract.kg.objects.processor import convert_values_to_strings
processor.convert_values_to_strings = convert_values_to_strings
# Load configuration
await processor.on_schema_config(integration_config, version=1)
# Create multiple test chunks
chunks_data = [
("customer-chunk-1", "Customer: John Smith, email: john.smith@email.com, ID: CUST001"),
("customer-chunk-2", "Customer: Jane Doe, email: jane.doe@email.com, ID: CUST002"),
("product-chunk-1", "Product: Gaming Laptop, ID: PROD001, Price: $1299.99, Category: electronics"),
("product-chunk-2", "Product: Python Programming Guide, ID: PROD002, Price: $49.99, Category: books")
]
chunks = []
for chunk_id, text in chunks_data:
metadata = Metadata(
id=chunk_id,
user="concurrent_test",
collection="test_collection",
metadata=[]
)
chunk = Chunk(metadata=metadata, chunk=text.encode('utf-8'))
chunks.append(chunk)
# Act - Process chunks concurrently
tasks = []
for chunk in chunks:
mock_msg = MagicMock()
mock_msg.value.return_value = chunk
task = processor.on_chunk(mock_msg, None, mock_integrated_flow)
tasks.append(task)
await asyncio.gather(*tasks)
# Assert
output_producer = mock_integrated_flow("output")
# Should have processed all chunks (some may produce objects, some may not)
assert output_producer.send.call_count >= 2 # At least customer and product extractions
# Verify we got both types of objects
extracted_objects = []
for call in output_producer.send.call_args_list:
extracted_objects.append(call[0][0])
customer_objects = [obj for obj in extracted_objects if obj.schema_name == "customer_records"]
product_objects = [obj for obj in extracted_objects if obj.schema_name == "product_catalog"]
assert len(customer_objects) >= 1
assert len(product_objects) >= 1
@pytest.mark.asyncio
async def test_configuration_reload_integration(self, integration_config, mock_integrated_flow):
"""Test configuration reload during service operation"""
# Arrange - Create mock processor with actual methods
processor = MagicMock()
processor.schemas = {}
processor.config_key = "schema"
processor.flow = mock_integrated_flow
processor.on_schema_config = Processor.on_schema_config.__get__(processor, Processor)
# Load initial configuration (only customer schema)
initial_config = {
"schema": {
"customer_records": integration_config["schema"]["customer_records"]
}
}
await processor.on_schema_config(initial_config, version=1)
assert len(processor.schemas) == 1
assert "customer_records" in processor.schemas
assert "product_catalog" not in processor.schemas
# Act - Reload with full configuration
await processor.on_schema_config(integration_config, version=2)
# Assert
assert len(processor.schemas) == 2
assert "customer_records" in processor.schemas
assert "product_catalog" in processor.schemas
@pytest.mark.asyncio
async def test_error_resilience_integration(self, integration_config):
"""Test service resilience to various error conditions"""
# Arrange - Create mock processor with actual methods
processor = MagicMock()
processor.schemas = {}
processor.config_key = "schema"
processor.on_schema_config = Processor.on_schema_config.__get__(processor, Processor)
processor.on_chunk = Processor.on_chunk.__get__(processor, Processor)
processor.extract_objects_for_schema = Processor.extract_objects_for_schema.__get__(processor, Processor)
# Import and bind the convert_values_to_strings function
from trustgraph.extract.kg.objects.processor import convert_values_to_strings
processor.convert_values_to_strings = convert_values_to_strings
# Mock flow with failing prompt service
failing_flow = MagicMock()
failing_prompt = AsyncMock()
failing_prompt.extract_objects.side_effect = Exception("Prompt service unavailable")
def failing_context_router(service_name):
if service_name == "prompt-request":
return failing_prompt
elif service_name == "output":
return AsyncMock()
else:
return AsyncMock()
failing_flow.side_effect = failing_context_router
processor.flow = failing_flow
# Load configuration
await processor.on_schema_config(integration_config, version=1)
# Create test chunk
metadata = Metadata(id="error-test", user="test", collection="test", metadata=[])
chunk = Chunk(metadata=metadata, chunk=b"Some text that will fail to process")
mock_msg = MagicMock()
mock_msg.value.return_value = chunk
# Act & Assert - Should not raise exception
try:
await processor.on_chunk(mock_msg, None, failing_flow)
# Should complete without throwing exception
except Exception as e:
pytest.fail(f"Service should handle errors gracefully, but raised: {e}")
@pytest.mark.asyncio
async def test_metadata_propagation_integration(self, integration_config, mock_integrated_flow):
"""Test proper metadata propagation through extraction pipeline"""
# Arrange - Create mock processor with actual methods
processor = MagicMock()
processor.schemas = {}
processor.config_key = "schema"
processor.flow = mock_integrated_flow
processor.on_schema_config = Processor.on_schema_config.__get__(processor, Processor)
processor.on_chunk = Processor.on_chunk.__get__(processor, Processor)
processor.extract_objects_for_schema = Processor.extract_objects_for_schema.__get__(processor, Processor)
# Import and bind the convert_values_to_strings function
from trustgraph.extract.kg.objects.processor import convert_values_to_strings
processor.convert_values_to_strings = convert_values_to_strings
# Load configuration
await processor.on_schema_config(integration_config, version=1)
# Create chunk with rich metadata
original_metadata = Metadata(
id="metadata-test-chunk",
user="test_user",
collection="test_collection",
metadata=[] # Could include source document metadata
)
chunk = Chunk(
metadata=original_metadata,
chunk=b"Customer: John Smith, ID: CUST001, email: john.smith@email.com"
)
mock_msg = MagicMock()
mock_msg.value.return_value = chunk
# Act
await processor.on_chunk(mock_msg, None, mock_integrated_flow)
# Assert
output_producer = mock_integrated_flow("output")
# Find extracted object
extracted_obj = None
for call in output_producer.send.call_args_list:
obj = call[0][0]
if obj.schema_name == "customer_records":
extracted_obj = obj
break
assert extracted_obj is not None
# Verify metadata propagation
assert extracted_obj.metadata.user == "test_user"
assert extracted_obj.metadata.collection == "test_collection"
assert "metadata-test-chunk" in extracted_obj.metadata.id # Should include source reference


@@ -0,0 +1,384 @@
"""
Integration tests for Cassandra Object Storage
These tests verify the end-to-end functionality of storing ExtractedObjects
in Cassandra, including table creation, data insertion, and error handling.
"""
import pytest
from unittest.mock import MagicMock, AsyncMock, patch
import json
import uuid
from trustgraph.storage.objects.cassandra.write import Processor
from trustgraph.schema import ExtractedObject, Metadata, RowSchema, Field
@pytest.mark.integration
class TestObjectsCassandraIntegration:
"""Integration tests for Cassandra object storage"""
@pytest.fixture
def mock_cassandra_session(self):
"""Mock Cassandra session for integration tests"""
session = MagicMock()
session.execute = MagicMock()
return session
@pytest.fixture
def mock_cassandra_cluster(self, mock_cassandra_session):
"""Mock Cassandra cluster"""
cluster = MagicMock()
cluster.connect.return_value = mock_cassandra_session
cluster.shutdown = MagicMock()
return cluster
@pytest.fixture
def processor_with_mocks(self, mock_cassandra_cluster, mock_cassandra_session):
"""Create processor with mocked Cassandra dependencies"""
processor = MagicMock()
processor.graph_host = "localhost"
processor.graph_username = None
processor.graph_password = None
processor.config_key = "schema"
processor.schemas = {}
processor.known_keyspaces = set()
processor.known_tables = {}
processor.cluster = None
processor.session = None
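# cluster/session start unset; connect_cassandra is expected to be
# invoked lazily when the first object arrives.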
# Bind actual methods
processor.connect_cassandra = Processor.connect_cassandra.__get__(processor, Processor)
processor.ensure_keyspace = Processor.ensure_keyspace.__get__(processor, Processor)
processor.ensure_table = Processor.ensure_table.__get__(processor, Processor)
processor.sanitize_name = Processor.sanitize_name.__get__(processor, Processor)
processor.sanitize_table = Processor.sanitize_table.__get__(processor, Processor)
processor.get_cassandra_type = Processor.get_cassandra_type.__get__(processor, Processor)
processor.convert_value = Processor.convert_value.__get__(processor, Processor)
processor.on_schema_config = Processor.on_schema_config.__get__(processor, Processor)
processor.on_object = Processor.on_object.__get__(processor, Processor)
return processor, mock_cassandra_cluster, mock_cassandra_session
@pytest.mark.asyncio
async def test_end_to_end_object_storage(self, processor_with_mocks):
"""Test complete flow from schema config to object storage"""
processor, mock_cluster, mock_session = processor_with_mocks
# Mock Cluster creation
with patch('trustgraph.storage.objects.cassandra.write.Cluster', return_value=mock_cluster):
# Step 1: Configure schema
config = {
"schema": {
"customer_records": json.dumps({
"name": "customer_records",
"description": "Customer information",
"fields": [
{"name": "customer_id", "type": "string", "primary_key": True},
{"name": "name", "type": "string", "required": True},
{"name": "email", "type": "string", "indexed": True},
{"name": "age", "type": "integer"}
]
})
}
}
await processor.on_schema_config(config, version=1)
assert "customer_records" in processor.schemas
# Step 2: Process an ExtractedObject
test_obj = ExtractedObject(
metadata=Metadata(
id="doc-001",
user="test_user",
collection="import_2024",
metadata=[]
),
schema_name="customer_records",
values={
"customer_id": "CUST001",
"name": "John Doe",
"email": "john@example.com",
"age": "30"
},
confidence=0.95,
source_span="Customer: John Doe..."
)
msg = MagicMock()
msg.value.return_value = test_obj
await processor.on_object(msg, None, None)
# Verify Cassandra interactions
assert mock_cluster.connect.called
# Verify keyspace creation
keyspace_calls = [call for call in mock_session.execute.call_args_list
if "CREATE KEYSPACE" in str(call)]
assert len(keyspace_calls) == 1
assert "test_user" in str(keyspace_calls[0])
# Verify table creation
table_calls = [call for call in mock_session.execute.call_args_list
if "CREATE TABLE" in str(call)]
assert len(table_calls) == 1
assert "o_customer_records" in str(table_calls[0]) # Table gets o_ prefix
assert "collection text" in str(table_calls[0])
assert "PRIMARY KEY ((collection, customer_id))" in str(table_calls[0])
# Verify index creation
index_calls = [call for call in mock_session.execute.call_args_list
if "CREATE INDEX" in str(call)]
assert len(index_calls) == 1
assert "email" in str(index_calls[0])
# Verify data insertion
insert_calls = [call for call in mock_session.execute.call_args_list
if "INSERT INTO" in str(call)]
assert len(insert_calls) == 1
insert_call = insert_calls[0]
assert "test_user.o_customer_records" in str(insert_call) # Table gets o_ prefix
# Check inserted values
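# convert_value is expected to coerce "30" to int 30 for the
# integer-typed age column; string-typed values pass through as-is.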
values = insert_call[0][1]
assert "import_2024" in values # collection
assert "CUST001" in values # customer_id
assert "John Doe" in values # name
assert "john@example.com" in values # email
assert 30 in values # age (converted to int)
@pytest.mark.asyncio
async def test_multi_schema_handling(self, processor_with_mocks):
"""Test handling multiple schemas and objects"""
processor, mock_cluster, mock_session = processor_with_mocks
with patch('trustgraph.storage.objects.cassandra.write.Cluster', return_value=mock_cluster):
# Configure multiple schemas
config = {
"schema": {
"products": json.dumps({
"name": "products",
"fields": [
{"name": "product_id", "type": "string", "primary_key": True},
{"name": "name", "type": "string"},
{"name": "price", "type": "float"}
]
}),
"orders": json.dumps({
"name": "orders",
"fields": [
{"name": "order_id", "type": "string", "primary_key": True},
{"name": "customer_id", "type": "string"},
{"name": "total", "type": "float"}
]
})
}
}
await processor.on_schema_config(config, version=1)
assert len(processor.schemas) == 2
# Process objects for different schemas
product_obj = ExtractedObject(
metadata=Metadata(id="p1", user="shop", collection="catalog", metadata=[]),
schema_name="products",
values={"product_id": "P001", "name": "Widget", "price": "19.99"},
confidence=0.9,
source_span="Product..."
)
order_obj = ExtractedObject(
metadata=Metadata(id="o1", user="shop", collection="sales", metadata=[]),
schema_name="orders",
values={"order_id": "O001", "customer_id": "C001", "total": "59.97"},
confidence=0.85,
source_span="Order..."
)
# Process both objects
for obj in [product_obj, order_obj]:
msg = MagicMock()
msg.value.return_value = obj
await processor.on_object(msg, None, None)
# Verify separate tables were created
table_calls = [call for call in mock_session.execute.call_args_list
if "CREATE TABLE" in str(call)]
assert len(table_calls) == 2
assert any("o_products" in str(call) for call in table_calls) # Tables get o_ prefix
assert any("o_orders" in str(call) for call in table_calls) # Tables get o_ prefix
@pytest.mark.asyncio
async def test_missing_required_fields(self, processor_with_mocks):
"""Test handling of objects with missing required fields"""
processor, mock_cluster, mock_session = processor_with_mocks
with patch('trustgraph.storage.objects.cassandra.write.Cluster', return_value=mock_cluster):
# Configure schema with required field
processor.schemas["test_schema"] = RowSchema(
name="test_schema",
description="Test",
fields=[
Field(name="id", type="string", size=50, primary=True, required=True),
Field(name="required_field", type="string", size=100, required=True)
]
)
# Create object missing required field
test_obj = ExtractedObject(
metadata=Metadata(id="t1", user="test", collection="test", metadata=[]),
schema_name="test_schema",
values={"id": "123"}, # missing required_field
confidence=0.8,
source_span="Test"
)
msg = MagicMock()
msg.value.return_value = test_obj
# Should still process (Cassandra doesn't enforce NOT NULL)
await processor.on_object(msg, None, None)
# Verify insert was attempted
insert_calls = [call for call in mock_session.execute.call_args_list
if "INSERT INTO" in str(call)]
assert len(insert_calls) == 1
@pytest.mark.asyncio
async def test_schema_without_primary_key(self, processor_with_mocks):
"""Test handling schemas without defined primary keys"""
processor, mock_cluster, mock_session = processor_with_mocks
with patch('trustgraph.storage.objects.cassandra.write.Cluster', return_value=mock_cluster):
# Configure schema without primary key
processor.schemas["events"] = RowSchema(
name="events",
description="Event log",
fields=[
Field(name="event_type", type="string", size=50),
Field(name="timestamp", type="timestamp", size=0)
]
)
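# With no primary_key declared, the writer should fall back to a
# generated synthetic_id uuid column, asserted below.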
# Process object
test_obj = ExtractedObject(
metadata=Metadata(id="e1", user="logger", collection="app_events", metadata=[]),
schema_name="events",
values={"event_type": "login", "timestamp": "2024-01-01T10:00:00Z"},
confidence=1.0,
source_span="Event"
)
msg = MagicMock()
msg.value.return_value = test_obj
await processor.on_object(msg, None, None)
# Verify synthetic_id was added
table_calls = [call for call in mock_session.execute.call_args_list
if "CREATE TABLE" in str(call)]
assert len(table_calls) == 1
assert "synthetic_id uuid" in str(table_calls[0])
# Verify insert includes UUID
insert_calls = [call for call in mock_session.execute.call_args_list
if "INSERT INTO" in str(call)]
assert len(insert_calls) == 1
values = insert_calls[0][0][1]
# Check that a UUID was generated (will be in values list)
uuid_found = any(isinstance(v, uuid.UUID) for v in values)
assert uuid_found
@pytest.mark.asyncio
async def test_authentication_handling(self, processor_with_mocks):
"""Test Cassandra authentication"""
processor, mock_cluster, mock_session = processor_with_mocks
processor.graph_username = "cassandra_user"
processor.graph_password = "cassandra_pass"
with patch('trustgraph.storage.objects.cassandra.write.Cluster') as mock_cluster_class:
with patch('trustgraph.storage.objects.cassandra.write.PlainTextAuthProvider') as mock_auth:
mock_cluster_class.return_value = mock_cluster
# Trigger connection
processor.connect_cassandra()
# Verify authentication was configured
mock_auth.assert_called_once_with(
username="cassandra_user",
password="cassandra_pass"
)
mock_cluster_class.assert_called_once()
call_kwargs = mock_cluster_class.call_args[1]
assert 'auth_provider' in call_kwargs
@pytest.mark.asyncio
async def test_error_handling_during_insert(self, processor_with_mocks):
"""Test error handling when insertion fails"""
processor, mock_cluster, mock_session = processor_with_mocks
with patch('trustgraph.storage.objects.cassandra.write.Cluster', return_value=mock_cluster):
processor.schemas["test"] = RowSchema(
name="test",
fields=[Field(name="id", type="string", size=50, primary=True)]
)
# Make insert fail
mock_session.execute.side_effect = [
None, # keyspace creation succeeds
None, # table creation succeeds
Exception("Connection timeout") # insert fails
]
test_obj = ExtractedObject(
metadata=Metadata(id="t1", user="test", collection="test", metadata=[]),
schema_name="test",
values={"id": "123"},
confidence=0.9,
source_span="Test"
)
msg = MagicMock()
msg.value.return_value = test_obj
# Should raise the exception
with pytest.raises(Exception, match="Connection timeout"):
await processor.on_object(msg, None, None)
@pytest.mark.asyncio
async def test_collection_partitioning(self, processor_with_mocks):
"""Test that objects are properly partitioned by collection"""
processor, mock_cluster, mock_session = processor_with_mocks
with patch('trustgraph.storage.objects.cassandra.write.Cluster', return_value=mock_cluster):
processor.schemas["data"] = RowSchema(
name="data",
fields=[Field(name="id", type="string", size=50, primary=True)]
)
# Process objects from different collections
collections = ["import_jan", "import_feb", "import_mar"]
for coll in collections:
obj = ExtractedObject(
metadata=Metadata(id=f"{coll}-1", user="analytics", collection=coll, metadata=[]),
schema_name="data",
values={"id": f"ID-{coll}"},
confidence=0.9,
source_span="Data"
)
msg = MagicMock()
msg.value.return_value = obj
await processor.on_object(msg, None, None)
# Verify all inserts include collection in values
insert_calls = [call for call in mock_session.execute.call_args_list
if "INSERT INTO" in str(call)]
assert len(insert_calls) == 3
# Check each insert has the correct collection
for i, call in enumerate(insert_calls):
values = call[0][1]
assert collections[i] in values