Merge 2.0 to master (#651)

2026-04-28 01:46:22 +02:00 · 2026-02-28 11:03:14 +00:00 · 2026-02-28 11:03:14 +00:00 · b9d7bf9a8b
commit b9d7bf9a8b
parent 3666ece2c5
212 changed files with 13940 additions and 6180 deletions
--- a/tests/contract/test_rows_cassandra_contracts.py
+++ b/tests/contract/test_rows_cassandra_contracts.py
@ -0,0 +1,463 @@
+"""
+Contract tests for Cassandra Row Storage
+
+These tests verify the message contracts and schema compatibility
+for the rows storage processor.
+"""
+
+import pytest
+import json
+from pulsar.schema import AvroSchema
+
+from trustgraph.schema import ExtractedObject, Metadata, RowSchema, Field
+from trustgraph.storage.rows.cassandra.write import Processor
+
+
+@pytest.mark.contract
+class TestRowsCassandraContracts:
+    """Contract tests for the messages consumed by Cassandra row storage.
+
+    Each test pins the shape of a schema object (``ExtractedObject``,
+    ``Metadata``, ``RowSchema``, ``Field``) or a naming rule of the storage
+    ``Processor``.
+    """
+
+    def test_extracted_object_input_contract(self):
+        """ExtractedObject carries the attributes and types expected as input."""
+        test_metadata = Metadata(
+            id="test-doc-001",
+            user="test_user",
+            collection="test_collection",
+            metadata=[]
+        )
+
+        test_object = ExtractedObject(
+            metadata=test_metadata,
+            schema_name="customer_records",
+            values=[{
+                "customer_id": "CUST123",
+                "name": "Test Customer",
+                "email": "test@example.com"
+            }],
+            confidence=0.95,
+            source_span="Customer data from document..."
+        )
+
+        # Verify all required fields are present
+        for attr in ("metadata", "schema_name", "values", "confidence", "source_span"):
+            assert hasattr(test_object, attr), f"ExtractedObject is missing {attr!r}"
+
+        # Verify metadata structure
+        for attr in ("id", "user", "collection", "metadata"):
+            assert hasattr(test_object.metadata, attr), f"Metadata is missing {attr!r}"
+
+        # Verify types
+        assert isinstance(test_object.schema_name, str)
+        assert isinstance(test_object.values, list)
+        assert isinstance(test_object.confidence, float)
+        assert isinstance(test_object.source_span, str)
+
+    def test_row_schema_structure_contract(self):
+        """RowSchema and Field expose the attributes used for table definitions."""
+        test_fields = [
+            Field(
+                name="id",
+                type="string",
+                size=50,
+                primary=True,
+                description="Primary key",
+                required=True,
+                enum_values=[],
+                indexed=False
+            ),
+            Field(
+                name="status",
+                type="string",
+                size=20,
+                primary=False,
+                description="Status field",
+                required=False,
+                enum_values=["active", "inactive", "pending"],
+                indexed=True
+            )
+        ]
+
+        test_schema = RowSchema(
+            name="test_table",
+            description="Test table schema",
+            fields=test_fields
+        )
+
+        # Verify schema structure
+        for attr in ("name", "description", "fields"):
+            assert hasattr(test_schema, attr), f"RowSchema is missing {attr!r}"
+        assert isinstance(test_schema.fields, list)
+
+        # Verify field structure
+        field_attrs = (
+            "name", "type", "size", "primary",
+            "description", "required", "enum_values", "indexed",
+        )
+        for field in test_schema.fields:
+            for attr in field_attrs:
+                assert hasattr(field, attr), f"Field is missing {attr!r}"
+
+    def test_schema_config_format_contract(self):
+        """The JSON schema configuration round-trips and uses only known keys."""
+        # Expected config structure: config["schema"][<table>] is a JSON string.
+        config_format = {
+            "schema": {
+                "table_name": json.dumps({
+                    "name": "table_name",
+                    "description": "Table description",
+                    "fields": [
+                        {
+                            "name": "field_name",
+                            "type": "string",
+                            "size": 0,
+                            "primary_key": True,
+                            "description": "Field description",
+                            "required": True,
+                            "enum": [],
+                            "indexed": False
+                        }
+                    ]
+                })
+            }
+        }
+
+        # Verify config can be parsed
+        schema_json = json.loads(config_format["schema"]["table_name"])
+        assert "name" in schema_json
+        assert "fields" in schema_json
+        assert isinstance(schema_json["fields"], list)
+
+        # Verify field format: required keys present, nothing outside the
+        # union of required and optional keys.
+        field = schema_json["fields"][0]
+        required_field_keys = {"name", "type"}
+        optional_field_keys = {"size", "primary_key", "description", "required", "enum", "indexed"}
+
+        assert required_field_keys.issubset(field.keys())
+        assert set(field.keys()).issubset(required_field_keys | optional_field_keys)
+
+    @pytest.mark.skip(reason="ExtractedObject is a dataclass, not a Pulsar Record type")
+    def test_extracted_object_serialization_contract(self):
+        """Round-trip an ExtractedObject through AvroSchema (currently skipped)."""
+        original = ExtractedObject(
+            metadata=Metadata(
+                id="serial-001",
+                user="test_user",
+                collection="test_coll",
+                metadata=[]
+            ),
+            schema_name="test_schema",
+            values=[{"field1": "value1", "field2": "123"}],
+            confidence=0.85,
+            source_span="Test span"
+        )
+
+        # Test serialization using schema
+        schema = AvroSchema(ExtractedObject)
+
+        # Encode and decode
+        encoded = schema.encode(original)
+        decoded = schema.decode(encoded)
+
+        # Verify round-trip
+        assert decoded.metadata.id == original.metadata.id
+        assert decoded.metadata.user == original.metadata.user
+        assert decoded.metadata.collection == original.metadata.collection
+        assert decoded.schema_name == original.schema_name
+        assert decoded.values == original.values
+        assert decoded.confidence == original.confidence
+        assert decoded.source_span == original.source_span
+
+    def test_cassandra_name_sanitization_contract(self):
+        """Processor.sanitize_name maps arbitrary names to Cassandra identifiers."""
+        # __new__ creates the instance without running Processor.__init__.
+        processor = Processor.__new__(Processor)
+
+        # Test name sanitization for Cassandra identifiers
+        # - Non-alphanumeric chars (except underscore) become underscores
+        # - Names starting with non-letter get 'r_' prefix
+        # - All names converted to lowercase
+        name_test_cases = [
+            ("simple_name", "simple_name"),
+            ("Name-With-Dashes", "name_with_dashes"),
+            ("name.with.dots", "name_with_dots"),
+            ("123_numbers", "r_123_numbers"),  # Gets r_ prefix (starts with number)
+            ("special!@#chars", "special___chars"),  # 3 special chars become 3 underscores
+            ("UPPERCASE", "uppercase"),
+            ("CamelCase", "camelcase"),
+            ("_underscore_start", "r__underscore_start"),  # Gets r_ prefix (starts with underscore)
+        ]
+
+        for input_name, expected_name in name_test_cases:
+            result = processor.sanitize_name(input_name)
+            assert result == expected_name, f"Expected {expected_name} but got {result} for input {input_name}"
+            # Verify result is valid Cassandra identifier (starts with letter)
+            if result:  # Skip empty string case
+                assert result[0].isalpha(), f"Result {result} should start with a letter"
+
+    def test_primary_key_structure_contract(self):
+        """A schema's primary-key fields are identifiable from ``Field.primary``.
+
+        The intended storage layout is ``((collection, id))``, with the
+        collection first in the partition key. That part lives in the
+        processor implementation and is not exercised here; this test only
+        pins the schema side of the contract.
+        """
+        schema_with_pk = RowSchema(
+            name="test",
+            fields=[
+                Field(name="id", type="string", primary=True),
+                Field(name="data", type="string")
+            ]
+        )
+
+        # Only fields explicitly flagged primary may take part in the key;
+        # a Field built without ``primary`` must not be treated as one.
+        primary_names = [f.name for f in schema_with_pk.fields if f.primary]
+        other_names = [f.name for f in schema_with_pk.fields if not f.primary]
+
+        assert primary_names == ["id"], f"Unexpected primary key fields: {primary_names}"
+        assert other_names == ["data"], f"Unexpected non-key fields: {other_names}"
+
+    def test_metadata_field_usage_contract(self):
+        """The metadata fields that drive storage placement are populated."""
+        test_obj = ExtractedObject(
+            metadata=Metadata(
+                id="meta-001",
+                user="user123",  # -> keyspace
+                collection="coll456",  # -> partition key
+                metadata=[{"key": "value"}]
+            ),
+            schema_name="table789",  # -> table name
+            values=[{"field": "value"}],
+            confidence=0.9,
+            source_span="Source"
+        )
+
+        # Verify mapping contract:
+        # - metadata.user -> Cassandra keyspace
+        # - schema_name -> Cassandra table
+        # - metadata.collection -> Part of primary key
+        assert test_obj.metadata.user  # Required for keyspace
+        assert test_obj.schema_name  # Required for table
+        assert test_obj.metadata.collection  # Required for partition key
+
+
+@pytest.mark.contract
+class TestRowsCassandraContractsBatch:
+    """Contract tests covering ExtractedObject messages that carry a batch of rows"""
+
+    def test_extracted_object_batch_input_contract(self):
+        """A multi-row ExtractedObject keeps every row as a dict in ``values``"""
+        customers = [
+            {
+                "customer_id": "CUST123",
+                "name": "Test Customer 1",
+                "email": "test1@example.com"
+            },
+            {
+                "customer_id": "CUST124",
+                "name": "Test Customer 2",
+                "email": "test2@example.com"
+            },
+            {
+                "customer_id": "CUST125",
+                "name": "Test Customer 3",
+                "email": "test3@example.com"
+            }
+        ]
+
+        batch_object = ExtractedObject(
+            metadata=Metadata(
+                id="batch-doc-001",
+                user="test_user",
+                collection="test_collection",
+                metadata=[]
+            ),
+            schema_name="customer_records",
+            values=customers,
+            confidence=0.88,
+            source_span="Multiple customer data from document..."
+        )
+
+        # The batch is exposed as a list holding all three rows
+        assert hasattr(batch_object, 'values')
+        rows = batch_object.values
+        assert isinstance(rows, list)
+        assert len(rows) == 3
+
+        # Row N (1-based) is customer CUST12{2+N} named "Test Customer N"
+        for ordinal, row in enumerate(rows, start=1):
+            assert isinstance(row, dict)
+            for key in ("customer_id", "name", "email"):
+                assert key in row
+            assert row["customer_id"] == f"CUST12{2 + ordinal}"
+            assert f"Test Customer {ordinal}" in row["name"]
+
+    def test_extracted_object_empty_batch_contract(self):
+        """An ExtractedObject may carry an empty ``values`` list"""
+        empty_batch_object = ExtractedObject(
+            metadata=Metadata(
+                id="empty-batch-001",
+                user="test_user",
+                collection="test_collection",
+                metadata=[]
+            ),
+            schema_name="empty_schema",
+            values=[],  # nothing extracted
+            confidence=1.0,
+            source_span="No objects found in document"
+        )
+
+        # The empty list is preserved as-is, alongside the confidence
+        assert hasattr(empty_batch_object, 'values')
+        rows = empty_batch_object.values
+        assert isinstance(rows, list)
+        assert len(rows) == 0
+        assert empty_batch_object.confidence == 1.0
+
+    def test_extracted_object_single_item_batch_contract(self):
+        """A one-row batch (the backward-compatible form) is still a list"""
+        only_row = {
+            "customer_id": "CUST999",
+            "name": "Single Customer",
+            "email": "single@example.com"
+        }
+
+        single_batch_object = ExtractedObject(
+            metadata=Metadata(
+                id="single-batch-001",
+                user="test_user",
+                collection="test_collection",
+                metadata=[]
+            ),
+            schema_name="customer_records",
+            values=[only_row],  # single item wrapped in a list
+            confidence=0.95,
+            source_span="Single customer data from document..."
+        )
+
+        # One dict inside a list, with the id intact
+        rows = single_batch_object.values
+        assert isinstance(rows, list)
+        assert len(rows) == 1
+        first = rows[0]
+        assert isinstance(first, dict)
+        assert first["customer_id"] == "CUST999"
+
+    @pytest.mark.skip(reason="ExtractedObject is a dataclass, not a Pulsar Record type")
+    def test_extracted_object_batch_serialization_contract(self):
+        """Round-trip a three-row ExtractedObject through AvroSchema (skipped)"""
+        source_metadata = Metadata(
+            id="batch-serial-001",
+            user="test_user",
+            collection="test_coll",
+            metadata=[]
+        )
+        original = ExtractedObject(
+            metadata=source_metadata,
+            schema_name="test_schema",
+            values=[
+                {"field1": "value1", "field2": "123"},
+                {"field1": "value2", "field2": "456"},
+                {"field1": "value3", "field2": "789"}
+            ],
+            confidence=0.92,
+            source_span="Batch test span"
+        )
+
+        # Encode with the Avro schema, then decode the bytes again
+        codec = AvroSchema(ExtractedObject)
+        decoded = codec.decode(codec.encode(original))
+
+        # Metadata and schema name survive the round trip
+        assert decoded.metadata.id == original.metadata.id
+        assert decoded.metadata.user == original.metadata.user
+        assert decoded.metadata.collection == original.metadata.collection
+        assert decoded.schema_name == original.schema_name
+        assert len(decoded.values) == len(original.values)
+        assert len(decoded.values) == 3
+
+        # Row by row: field1 is value1..value3, field2 is 123, 456, 789
+        for idx, (got, want) in enumerate(zip(decoded.values, original.values)):
+            assert got == want
+            assert got["field1"] == f"value{idx + 1}"
+            assert got["field2"] == str(123 + idx * 333)
+
+        assert decoded.confidence == original.confidence
+        assert decoded.source_span == original.source_span
+
+    def test_batch_processing_field_validation_contract(self):
+        """Rows of a valid batch share one key set; an invalid batch does not"""
+        # Field consistency across rows is a contract the application is
+        # expected to enforce; this test only demonstrates both cases.
+
+        # Consistent batch: every row has id, name and value
+        valid_batch_values = [
+            {"id": "1", "name": "Item 1", "value": "100"},
+            {"id": "2", "name": "Item 2", "value": "200"},
+            {"id": "3", "name": "Item 3", "value": "300"}
+        ]
+
+        valid_key_sets = [set(row) for row in valid_batch_values]
+        reference_keys = valid_key_sets[0]
+        mismatched = [keys for keys in valid_key_sets if keys != reference_keys]
+        assert not mismatched, "All batch items should have consistent fields"
+
+        # Inconsistent batch: one row lacks 'value', another adds 'extra'
+        invalid_batch_values = [
+            {"id": "1", "name": "Item 1", "value": "100"},
+            {"id": "2", "name": "Item 2"},
+            {"id": "3", "name": "Item 3", "value": "300", "extra": "field"}
+        ]
+
+        invalid_key_sets = [set(row) for row in invalid_batch_values]
+        first_keys = invalid_key_sets[0]
+        assert any(keys != first_keys for keys in invalid_key_sets), "Invalid batch should have inconsistent fields"
+
+    def test_batch_storage_partition_key_contract(self):
+        """Rows in one batch share user/collection and have distinct ids"""
+        # For Cassandra storage every row of a batch is expected to:
+        #   - sit in the same keyspace (metadata.user)
+        #   - sit in the same collection (partition key component)
+        #   - carry a primary key that is unique within the batch
+        rows = [
+            {"id": "pk1", "data": "data1"},
+            {"id": "pk2", "data": "data2"},
+            {"id": "pk3", "data": "data3"}
+        ]
+
+        batch_object = ExtractedObject(
+            metadata=Metadata(
+                id="partition-test-001",
+                user="consistent_user",  # keyspace
+                collection="consistent_collection",  # partition
+                metadata=[]
+            ),
+            schema_name="partition_test",
+            values=rows,
+            confidence=0.95,
+            source_span="Partition consistency test"
+        )
+
+        # Both placement fields must be populated
+        assert batch_object.metadata.user  # keyspace source
+        assert batch_object.metadata.collection  # partition key source
+
+        # No id may repeat inside the batch
+        ids = [row["id"] for row in batch_object.values]
+        distinct_ids = set(ids)
+        assert len(ids) == len(distinct_ids), "Primary keys must be unique within batch"
+
+        # user and collection live on the shared metadata, so every row of the
+        # batch is stored under the same keyspace and partition.