Structure data mvp (#452)

* Structured data tech spec * Architecture principles * New schemas * Updated schemas and specs * Object extractor * Add .coveragerc * New tests * Cassandra object storage * Trying to get object extraction working, issues exist
2026-05-03 20:32:38 +02:00 · 2025-08-07 20:47:20 +01:00 · 2025-08-07 20:47:20 +01:00 · 83f0c1e7f3
commit 83f0c1e7f3
parent 5de56c5dbc
46 changed files with 5313 additions and 1629 deletions
--- a/tests/contract/test_message_contracts.py
+++ b/tests/contract/test_message_contracts.py
@ -18,7 +18,11 @@ from trustgraph.schema import (
    Chunk, Triple, Triples, Value, Error,
    EntityContext, EntityContexts,
    GraphEmbeddings, EntityEmbeddings,
-    Metadata
+    Metadata, Field, RowSchema,
+    StructuredDataSubmission, ExtractedObject,
+    NLPToStructuredQueryRequest, NLPToStructuredQueryResponse,
+    StructuredQueryRequest, StructuredQueryResponse,
+    StructuredObjectEmbedding
 )
 from .conftest import validate_schema_contract, serialize_deserialize_test

--- a/tests/contract/test_objects_cassandra_contracts.py
+++ b/tests/contract/test_objects_cassandra_contracts.py
@ -0,0 +1,306 @@
+"""
+Contract tests for Cassandra Object Storage
+
+These tests verify the message contracts and schema compatibility
+for the objects storage processor.
+"""
+
+import pytest
+import json
+from pulsar.schema import AvroSchema
+
+from trustgraph.schema import ExtractedObject, Metadata, RowSchema, Field
+from trustgraph.storage.objects.cassandra.write import Processor
+
+
+@pytest.mark.contract
+class TestObjectsCassandraContracts:
+    """Contract tests for Cassandra object storage messages"""
+
+    def test_extracted_object_input_contract(self):
+        """Test that ExtractedObject schema matches expected input format"""
+        # Create test object with all required fields
+        test_metadata = Metadata(
+            id="test-doc-001",
+            user="test_user",
+            collection="test_collection",
+            metadata=[]
+        )
+        
+        test_object = ExtractedObject(
+            metadata=test_metadata,
+            schema_name="customer_records",
+            values={
+                "customer_id": "CUST123",
+                "name": "Test Customer",
+                "email": "test@example.com"
+            },
+            confidence=0.95,
+            source_span="Customer data from document..."
+        )
+        
+        # Verify all required fields are present
+        assert hasattr(test_object, 'metadata')
+        assert hasattr(test_object, 'schema_name')
+        assert hasattr(test_object, 'values')
+        assert hasattr(test_object, 'confidence')
+        assert hasattr(test_object, 'source_span')
+        
+        # Verify metadata structure
+        assert hasattr(test_object.metadata, 'id')
+        assert hasattr(test_object.metadata, 'user')
+        assert hasattr(test_object.metadata, 'collection')
+        assert hasattr(test_object.metadata, 'metadata')
+        
+        # Verify types
+        assert isinstance(test_object.schema_name, str)
+        assert isinstance(test_object.values, dict)
+        assert isinstance(test_object.confidence, float)
+        assert isinstance(test_object.source_span, str)
+
+    def test_row_schema_structure_contract(self):
+        """Test RowSchema structure used for table definitions"""
+        # Create test schema
+        test_fields = [
+            Field(
+                name="id",
+                type="string",
+                size=50,
+                primary=True,
+                description="Primary key",
+                required=True,
+                enum_values=[],
+                indexed=False
+            ),
+            Field(
+                name="status",
+                type="string",
+                size=20,
+                primary=False,
+                description="Status field",
+                required=False,
+                enum_values=["active", "inactive", "pending"],
+                indexed=True
+            )
+        ]
+        
+        test_schema = RowSchema(
+            name="test_table",
+            description="Test table schema",
+            fields=test_fields
+        )
+        
+        # Verify schema structure
+        assert hasattr(test_schema, 'name')
+        assert hasattr(test_schema, 'description')
+        assert hasattr(test_schema, 'fields')
+        assert isinstance(test_schema.fields, list)
+        
+        # Verify field structure
+        for field in test_schema.fields:
+            assert hasattr(field, 'name')
+            assert hasattr(field, 'type')
+            assert hasattr(field, 'size')
+            assert hasattr(field, 'primary')
+            assert hasattr(field, 'description')
+            assert hasattr(field, 'required')
+            assert hasattr(field, 'enum_values')
+            assert hasattr(field, 'indexed')
+
+    def test_schema_config_format_contract(self):
+        """Test the expected configuration format for schemas"""
+        # Define expected config structure
+        config_format = {
+            "schema": {
+                "table_name": json.dumps({
+                    "name": "table_name",
+                    "description": "Table description",
+                    "fields": [
+                        {
+                            "name": "field_name",
+                            "type": "string",
+                            "size": 0,
+                            "primary_key": True,
+                            "description": "Field description",
+                            "required": True,
+                            "enum": [],
+                            "indexed": False
+                        }
+                    ]
+                })
+            }
+        }
+        
+        # Verify config can be parsed
+        schema_json = json.loads(config_format["schema"]["table_name"])
+        assert "name" in schema_json
+        assert "fields" in schema_json
+        assert isinstance(schema_json["fields"], list)
+        
+        # Verify field format
+        field = schema_json["fields"][0]
+        required_field_keys = {"name", "type"}
+        optional_field_keys = {"size", "primary_key", "description", "required", "enum", "indexed"}
+        
+        assert required_field_keys.issubset(field.keys())
+        assert set(field.keys()).issubset(required_field_keys | optional_field_keys)
+
+    def test_cassandra_type_mapping_contract(self):
+        """Test that all supported field types have Cassandra mappings"""
+        processor = Processor.__new__(Processor)
+        
+        # All field types that should be supported
+        supported_types = [
+            ("string", "text"),
+            ("integer", "int"),  # or bigint based on size
+            ("float", "float"),  # or double based on size
+            ("boolean", "boolean"),
+            ("timestamp", "timestamp"),
+            ("date", "date"),
+            ("time", "time"),
+            ("uuid", "uuid")
+        ]
+        
+        for field_type, expected_cassandra_type in supported_types:
+            cassandra_type = processor.get_cassandra_type(field_type)
+            # For integer and float, the exact type depends on size
+            if field_type in ["integer", "float"]:
+                assert cassandra_type in ["int", "bigint", "float", "double"]
+            else:
+                assert cassandra_type == expected_cassandra_type
+
+    def test_value_conversion_contract(self):
+        """Test value conversion for all supported types"""
+        processor = Processor.__new__(Processor)
+        
+        # Test conversions maintain data integrity
+        test_cases = [
+            # (input_value, field_type, expected_output, expected_type)
+            ("123", "integer", 123, int),
+            ("123.45", "float", 123.45, float),
+            ("true", "boolean", True, bool),
+            ("false", "boolean", False, bool),
+            ("test string", "string", "test string", str),
+            (None, "string", None, type(None)),
+        ]
+        
+        for input_val, field_type, expected_val, expected_type in test_cases:
+            result = processor.convert_value(input_val, field_type)
+            assert result == expected_val
+            assert isinstance(result, expected_type) or result is None
+
+    def test_extracted_object_serialization_contract(self):
+        """Test that ExtractedObject can be serialized/deserialized correctly"""
+        # Create test object
+        original = ExtractedObject(
+            metadata=Metadata(
+                id="serial-001",
+                user="test_user",
+                collection="test_coll",
+                metadata=[]
+            ),
+            schema_name="test_schema",
+            values={"field1": "value1", "field2": "123"},
+            confidence=0.85,
+            source_span="Test span"
+        )
+        
+        # Test serialization using schema
+        schema = AvroSchema(ExtractedObject)
+        
+        # Encode and decode
+        encoded = schema.encode(original)
+        decoded = schema.decode(encoded)
+        
+        # Verify round-trip
+        assert decoded.metadata.id == original.metadata.id
+        assert decoded.metadata.user == original.metadata.user
+        assert decoded.metadata.collection == original.metadata.collection
+        assert decoded.schema_name == original.schema_name
+        assert decoded.values == original.values
+        assert decoded.confidence == original.confidence
+        assert decoded.source_span == original.source_span
+
+    def test_cassandra_table_naming_contract(self):
+        """Test Cassandra naming conventions and constraints"""
+        processor = Processor.__new__(Processor)
+        
+        # Test table naming (always gets o_ prefix)
+        table_test_names = [
+            ("simple_name", "o_simple_name"),
+            ("Name-With-Dashes", "o_name_with_dashes"),
+            ("name.with.dots", "o_name_with_dots"),
+            ("123_numbers", "o_123_numbers"),
+            ("special!@#chars", "o_special___chars"),  # 3 special chars become 3 underscores
+            ("UPPERCASE", "o_uppercase"),
+            ("CamelCase", "o_camelcase"),
+            ("", "o_"),  # Edge case - empty string becomes o_
+        ]
+        
+        for input_name, expected_name in table_test_names:
+            result = processor.sanitize_table(input_name)
+            assert result == expected_name
+            # Verify result is valid Cassandra identifier (starts with letter)
+            assert result.startswith('o_')
+            assert result.replace('o_', '').replace('_', '').isalnum() or result == 'o_'
+        
+        # Test regular name sanitization (only adds o_ prefix if starts with number)
+        name_test_cases = [
+            ("simple_name", "simple_name"),
+            ("Name-With-Dashes", "name_with_dashes"),
+            ("name.with.dots", "name_with_dots"),
+            ("123_numbers", "o_123_numbers"),  # Only this gets o_ prefix
+            ("special!@#chars", "special___chars"),  # 3 special chars become 3 underscores
+            ("UPPERCASE", "uppercase"),
+            ("CamelCase", "camelcase"),
+        ]
+        
+        for input_name, expected_name in name_test_cases:
+            result = processor.sanitize_name(input_name)
+            assert result == expected_name
+
+    def test_primary_key_structure_contract(self):
+        """Test that primary key structure follows Cassandra best practices"""
+        # Verify partition key always includes collection
+        processor = Processor.__new__(Processor)
+        processor.schemas = {}
+        processor.known_keyspaces = set()
+        processor.known_tables = {}
+        processor.session = None
+        
+        # Test schema with primary key
+        schema_with_pk = RowSchema(
+            name="test",
+            fields=[
+                Field(name="id", type="string", primary=True),
+                Field(name="data", type="string")
+            ]
+        )
+        
+        # The primary key should be ((collection, id))
+        # This is verified in the implementation where collection
+        # is always first in the partition key
+
+    def test_metadata_field_usage_contract(self):
+        """Test that metadata fields are used correctly in storage"""
+        # Create test object
+        test_obj = ExtractedObject(
+            metadata=Metadata(
+                id="meta-001",
+                user="user123",  # -> keyspace
+                collection="coll456",  # -> partition key
+                metadata=[{"key": "value"}]
+            ),
+            schema_name="table789",  # -> table name
+            values={"field": "value"},
+            confidence=0.9,
+            source_span="Source"
+        )
+        
+        # Verify mapping contract:
+        # - metadata.user -> Cassandra keyspace
+        # - schema_name -> Cassandra table
+        # - metadata.collection -> Part of primary key
+        assert test_obj.metadata.user  # Required for keyspace
+        assert test_obj.schema_name  # Required for table
+        assert test_obj.metadata.collection  # Required for partition key
--- a/tests/contract/test_structured_data_contracts.py
+++ b/tests/contract/test_structured_data_contracts.py
@ -0,0 +1,308 @@
+"""
+Contract tests for Structured Data Pulsar Message Schemas
+
+These tests verify the contracts for all structured data Pulsar message schemas,
+ensuring schema compatibility, serialization contracts, and service interface stability.
+Following the TEST_STRATEGY.md approach for contract testing.
+"""
+
+import pytest
+import json
+from typing import Dict, Any
+
+from trustgraph.schema import (
+    StructuredDataSubmission, ExtractedObject,
+    NLPToStructuredQueryRequest, NLPToStructuredQueryResponse,
+    StructuredQueryRequest, StructuredQueryResponse,
+    StructuredObjectEmbedding, Field, RowSchema,
+    Metadata, Error, Value
+)
+from .conftest import serialize_deserialize_test
+
+
+@pytest.mark.contract
+class TestStructuredDataSchemaContracts:
+    """Contract tests for structured data schemas"""
+
+    def test_field_schema_contract(self):
+        """Test enhanced Field schema contract"""
+        # Arrange & Act - create Field instance directly
+        field = Field(
+            name="customer_id",
+            type="string",
+            size=0,
+            primary=True,
+            description="Unique customer identifier",
+            required=True,
+            enum_values=[],
+            indexed=True
+        )
+
+        # Assert - test field properties
+        assert field.name == "customer_id"
+        assert field.type == "string"
+        assert field.primary is True
+        assert field.indexed is True
+        assert isinstance(field.enum_values, list)
+        assert len(field.enum_values) == 0
+        
+        # Test with enum values
+        field_with_enum = Field(
+            name="status",
+            type="string",
+            size=0,
+            primary=False,
+            description="Status field",
+            required=False,
+            enum_values=["active", "inactive"],
+            indexed=True
+        )
+        
+        assert len(field_with_enum.enum_values) == 2
+        assert "active" in field_with_enum.enum_values
+
+    def test_row_schema_contract(self):
+        """Test RowSchema contract"""
+        # Arrange & Act
+        field = Field(
+            name="email",
+            type="string",
+            size=255,
+            primary=False,
+            description="Customer email",
+            required=True,
+            enum_values=[],
+            indexed=True
+        )
+        
+        schema = RowSchema(
+            name="customers",
+            description="Customer records schema",
+            fields=[field]
+        )
+
+        # Assert
+        assert schema.name == "customers"
+        assert schema.description == "Customer records schema"
+        assert len(schema.fields) == 1
+        assert schema.fields[0].name == "email"
+        assert schema.fields[0].indexed is True
+
+    def test_structured_data_submission_contract(self):
+        """Test StructuredDataSubmission schema contract"""
+        # Arrange
+        metadata = Metadata(
+            id="structured-data-001",
+            user="test_user",
+            collection="test_collection",
+            metadata=[]
+        )
+        
+        # Act
+        submission = StructuredDataSubmission(
+            metadata=metadata,
+            format="csv",
+            schema_name="customer_records",
+            data=b"id,name,email\n1,John,john@example.com",
+            options={"delimiter": ",", "header": "true"}
+        )
+
+        # Assert
+        assert submission.format == "csv"
+        assert submission.schema_name == "customer_records"
+        assert submission.options["delimiter"] == ","
+        assert submission.metadata.id == "structured-data-001"
+        assert len(submission.data) > 0
+
+    def test_extracted_object_contract(self):
+        """Test ExtractedObject schema contract"""
+        # Arrange
+        metadata = Metadata(
+            id="extracted-obj-001",
+            user="test_user",
+            collection="test_collection",
+            metadata=[]
+        )
+        
+        # Act
+        obj = ExtractedObject(
+            metadata=metadata,
+            schema_name="customer_records",
+            values={"id": "123", "name": "John Doe", "email": "john@example.com"},
+            confidence=0.95,
+            source_span="John Doe (john@example.com) customer ID 123"
+        )
+
+        # Assert
+        assert obj.schema_name == "customer_records"
+        assert obj.values["name"] == "John Doe"
+        assert obj.confidence == 0.95
+        assert len(obj.source_span) > 0
+        assert obj.metadata.id == "extracted-obj-001"
+
+
+@pytest.mark.contract
+class TestStructuredQueryServiceContracts:
+    """Contract tests for structured query services"""
+
+    def test_nlp_to_structured_query_request_contract(self):
+        """Test NLPToStructuredQueryRequest schema contract"""
+        # Act
+        request = NLPToStructuredQueryRequest(
+            natural_language_query="Show me all customers who registered last month",
+            max_results=100,
+            context_hints={"time_range": "last_month", "entity_type": "customer"}
+        )
+
+        # Assert
+        assert "customers" in request.natural_language_query
+        assert request.max_results == 100
+        assert request.context_hints["time_range"] == "last_month"
+
+    def test_nlp_to_structured_query_response_contract(self):
+        """Test NLPToStructuredQueryResponse schema contract"""
+        # Act
+        response = NLPToStructuredQueryResponse(
+            error=None,
+            graphql_query="query { customers(filter: {registered: {gte: \"2024-01-01\"}}) { id name email } }",
+            variables={"start_date": "2024-01-01"},
+            detected_schemas=["customers"],
+            confidence=0.92
+        )
+
+        # Assert
+        assert response.error is None
+        assert "customers" in response.graphql_query
+        assert response.detected_schemas[0] == "customers"
+        assert response.confidence > 0.9
+
+    def test_structured_query_request_contract(self):
+        """Test StructuredQueryRequest schema contract"""
+        # Act
+        request = StructuredQueryRequest(
+            query="query GetCustomers($limit: Int) { customers(limit: $limit) { id name email } }",
+            variables={"limit": "10"},
+            operation_name="GetCustomers"
+        )
+
+        # Assert
+        assert "customers" in request.query
+        assert request.variables["limit"] == "10"
+        assert request.operation_name == "GetCustomers"
+
+    def test_structured_query_response_contract(self):
+        """Test StructuredQueryResponse schema contract"""
+        # Act
+        response = StructuredQueryResponse(
+            error=None,
+            data='{"customers": [{"id": "1", "name": "John", "email": "john@example.com"}]}',
+            errors=[]
+        )
+
+        # Assert
+        assert response.error is None
+        assert "customers" in response.data
+        assert len(response.errors) == 0
+
+    def test_structured_query_response_with_errors_contract(self):
+        """Test StructuredQueryResponse with GraphQL errors contract"""
+        # Act
+        response = StructuredQueryResponse(
+            error=None,
+            data=None,
+            errors=["Field 'invalid_field' not found in schema 'customers'"]
+        )
+
+        # Assert
+        assert response.data is None
+        assert len(response.errors) == 1
+        assert "invalid_field" in response.errors[0]
+
+
+@pytest.mark.contract
+class TestStructuredEmbeddingsContracts:
+    """Contract tests for structured object embeddings"""
+
+    def test_structured_object_embedding_contract(self):
+        """Test StructuredObjectEmbedding schema contract"""
+        # Arrange
+        metadata = Metadata(
+            id="struct-embed-001",
+            user="test_user",
+            collection="test_collection",
+            metadata=[]
+        )
+        
+        # Act
+        embedding = StructuredObjectEmbedding(
+            metadata=metadata,
+            vectors=[[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]],
+            schema_name="customer_records",
+            object_id="customer_123",
+            field_embeddings={
+                "name": [0.1, 0.2, 0.3],
+                "email": [0.4, 0.5, 0.6]
+            }
+        )
+
+        # Assert
+        assert embedding.schema_name == "customer_records"
+        assert embedding.object_id == "customer_123"
+        assert len(embedding.vectors) == 2
+        assert len(embedding.field_embeddings) == 2
+        assert "name" in embedding.field_embeddings
+
+
+@pytest.mark.contract
+class TestStructuredDataSerializationContracts:
+    """Contract tests for structured data serialization/deserialization"""
+
+    def test_structured_data_submission_serialization(self):
+        """Test StructuredDataSubmission serialization contract"""
+        # Arrange
+        metadata = Metadata(id="test", user="user", collection="col", metadata=[])
+        submission_data = {
+            "metadata": metadata,
+            "format": "json",
+            "schema_name": "test_schema",
+            "data": b'{"test": "data"}',
+            "options": {"encoding": "utf-8"}
+        }
+
+        # Act & Assert
+        assert serialize_deserialize_test(StructuredDataSubmission, submission_data)
+
+    def test_extracted_object_serialization(self):
+        """Test ExtractedObject serialization contract"""
+        # Arrange
+        metadata = Metadata(id="test", user="user", collection="col", metadata=[])
+        object_data = {
+            "metadata": metadata,
+            "schema_name": "test_schema",
+            "values": {"field1": "value1"},
+            "confidence": 0.8,
+            "source_span": "test span"
+        }
+
+        # Act & Assert
+        assert serialize_deserialize_test(ExtractedObject, object_data)
+
+    def test_nlp_query_serialization(self):
+        """Test NLP query request/response serialization contract"""
+        # Test request
+        request_data = {
+            "natural_language_query": "test query",
+            "max_results": 10,
+            "context_hints": {}
+        }
+        assert serialize_deserialize_test(NLPToStructuredQueryRequest, request_data)
+
+        # Test response
+        response_data = {
+            "error": None,
+            "graphql_query": "query { test }",
+            "variables": {},
+            "detected_schemas": ["test"],
+            "confidence": 0.9
+        }
+        assert serialize_deserialize_test(NLPToStructuredQueryResponse, response_data)