""" Contract tests for Structured Data Pulsar Message Schemas These tests verify the contracts for all structured data Pulsar message schemas, ensuring schema compatibility, serialization contracts, and service interface stability. Following the TEST_STRATEGY.md approach for contract testing. """ import pytest import json from typing import Dict, Any from trustgraph.schema import ( StructuredDataSubmission, ExtractedObject, QuestionToStructuredQueryRequest, QuestionToStructuredQueryResponse, StructuredQueryRequest, StructuredQueryResponse, StructuredObjectEmbedding, Field, RowSchema, Metadata, Error ) from .conftest import serialize_deserialize_test @pytest.mark.contract class TestStructuredDataSchemaContracts: """Contract tests for structured data schemas""" def test_field_schema_contract(self): """Test enhanced Field schema contract""" # Arrange & Act - create Field instance directly field = Field( name="customer_id", type="string", size=0, primary=True, description="Unique customer identifier", required=True, enum_values=[], indexed=True ) # Assert - test field properties assert field.name == "customer_id" assert field.type == "string" assert field.primary is True assert field.indexed is True assert isinstance(field.enum_values, list) assert len(field.enum_values) == 0 # Test with enum values field_with_enum = Field( name="status", type="string", size=0, primary=False, description="Status field", required=False, enum_values=["active", "inactive"], indexed=True ) assert len(field_with_enum.enum_values) == 2 assert "active" in field_with_enum.enum_values def test_row_schema_contract(self): """Test RowSchema contract""" # Arrange & Act field = Field( name="email", type="string", size=255, primary=False, description="Customer email", required=True, enum_values=[], indexed=True ) schema = RowSchema( name="customers", description="Customer records schema", fields=[field] ) # Assert assert schema.name == "customers" assert schema.description == "Customer records schema" assert len(schema.fields) == 1 assert schema.fields[0].name == "email" assert schema.fields[0].indexed is True def test_structured_data_submission_contract(self): """Test StructuredDataSubmission schema contract""" # Arrange metadata = Metadata( id="structured-data-001", user="test_user", collection="test_collection", metadata=[] ) # Act submission = StructuredDataSubmission( metadata=metadata, format="csv", schema_name="customer_records", data=b"id,name,email\n1,John,john@example.com", options={"delimiter": ",", "header": "true"} ) # Assert assert submission.format == "csv" assert submission.schema_name == "customer_records" assert submission.options["delimiter"] == "," assert submission.metadata.id == "structured-data-001" assert len(submission.data) > 0 def test_extracted_object_contract(self): """Test ExtractedObject schema contract""" # Arrange metadata = Metadata( id="extracted-obj-001", user="test_user", collection="test_collection", metadata=[] ) # Act obj = ExtractedObject( metadata=metadata, schema_name="customer_records", values=[{"id": "123", "name": "John Doe", "email": "john@example.com"}], confidence=0.95, source_span="John Doe (john@example.com) customer ID 123" ) # Assert assert obj.schema_name == "customer_records" assert obj.values[0]["name"] == "John Doe" assert obj.confidence == 0.95 assert len(obj.source_span) > 0 assert obj.metadata.id == "extracted-obj-001" def test_extracted_object_batch_contract(self): """Test ExtractedObject schema contract for batched values""" # Arrange metadata = Metadata( id="extracted-batch-001", user="test_user", collection="test_collection", metadata=[] ) # Act - create object with multiple values obj = ExtractedObject( metadata=metadata, schema_name="customer_records", values=[ {"id": "123", "name": "John Doe", "email": "john@example.com"}, {"id": "124", "name": "Jane Smith", "email": "jane@example.com"}, {"id": "125", "name": "Bob Johnson", "email": "bob@example.com"} ], confidence=0.85, source_span="Multiple customers found in document" ) # Assert assert obj.schema_name == "customer_records" assert len(obj.values) == 3 assert obj.values[0]["name"] == "John Doe" assert obj.values[1]["name"] == "Jane Smith" assert obj.values[2]["name"] == "Bob Johnson" assert obj.values[0]["id"] == "123" assert obj.values[1]["id"] == "124" assert obj.values[2]["id"] == "125" assert obj.confidence == 0.85 assert "Multiple customers" in obj.source_span def test_extracted_object_empty_batch_contract(self): """Test ExtractedObject schema contract for empty values array""" # Arrange metadata = Metadata( id="extracted-empty-001", user="test_user", collection="test_collection", metadata=[] ) # Act - create object with empty values array obj = ExtractedObject( metadata=metadata, schema_name="empty_schema", values=[], confidence=1.0, source_span="No objects found" ) # Assert assert obj.schema_name == "empty_schema" assert len(obj.values) == 0 assert obj.confidence == 1.0 @pytest.mark.contract class TestStructuredQueryServiceContracts: """Contract tests for structured query services""" def test_nlp_to_structured_query_request_contract(self): """Test QuestionToStructuredQueryRequest schema contract""" # Act request = QuestionToStructuredQueryRequest( question="Show me all customers who registered last month", max_results=100 ) # Assert assert "customers" in request.question assert request.max_results == 100 def test_nlp_to_structured_query_response_contract(self): """Test QuestionToStructuredQueryResponse schema contract""" # Act response = QuestionToStructuredQueryResponse( error=None, graphql_query="query { customers(filter: {registered: {gte: \"2024-01-01\"}}) { id name email } }", variables={"start_date": "2024-01-01"}, detected_schemas=["customers"], confidence=0.92 ) # Assert assert response.error is None assert "customers" in response.graphql_query assert response.detected_schemas[0] == "customers" assert response.confidence > 0.9 def test_structured_query_request_contract(self): """Test StructuredQueryRequest schema contract""" # Act request = StructuredQueryRequest( question="Show me customers with limit 10" ) # Assert assert "customers" in request.question def test_structured_query_response_contract(self): """Test StructuredQueryResponse schema contract""" # Act response = StructuredQueryResponse( error=None, data='{"customers": [{"id": "1", "name": "John", "email": "john@example.com"}]}', errors=[] ) # Assert assert response.error is None assert "customers" in response.data assert len(response.errors) == 0 def test_structured_query_response_with_errors_contract(self): """Test StructuredQueryResponse with GraphQL errors contract""" # Act response = StructuredQueryResponse( error=None, data=None, errors=["Field 'invalid_field' not found in schema 'customers'"] ) # Assert assert response.data is None assert len(response.errors) == 1 assert "invalid_field" in response.errors[0] @pytest.mark.contract class TestStructuredEmbeddingsContracts: """Contract tests for structured object embeddings""" def test_structured_object_embedding_contract(self): """Test StructuredObjectEmbedding schema contract""" # Arrange metadata = Metadata( id="struct-embed-001", user="test_user", collection="test_collection", metadata=[] ) # Act embedding = StructuredObjectEmbedding( metadata=metadata, vectors=[[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]], schema_name="customer_records", object_id="customer_123", field_embeddings={ "name": [0.1, 0.2, 0.3], "email": [0.4, 0.5, 0.6] } ) # Assert assert embedding.schema_name == "customer_records" assert embedding.object_id == "customer_123" assert len(embedding.vectors) == 2 assert len(embedding.field_embeddings) == 2 assert "name" in embedding.field_embeddings @pytest.mark.contract class TestStructuredDataSerializationContracts: """Contract tests for structured data serialization/deserialization""" def test_structured_data_submission_serialization(self): """Test StructuredDataSubmission serialization contract""" # Arrange metadata = Metadata(id="test", user="user", collection="col", metadata=[]) submission_data = { "metadata": metadata, "format": "json", "schema_name": "test_schema", "data": b'{"test": "data"}', "options": {"encoding": "utf-8"} } # Act & Assert assert serialize_deserialize_test(StructuredDataSubmission, submission_data) def test_extracted_object_serialization(self): """Test ExtractedObject serialization contract""" # Arrange metadata = Metadata(id="test", user="user", collection="col", metadata=[]) object_data = { "metadata": metadata, "schema_name": "test_schema", "values": [{"field1": "value1"}], "confidence": 0.8, "source_span": "test span" } # Act & Assert assert serialize_deserialize_test(ExtractedObject, object_data) def test_nlp_query_serialization(self): """Test NLP query request/response serialization contract""" # Test request request_data = { "question": "test query", "max_results": 10 } assert serialize_deserialize_test(QuestionToStructuredQueryRequest, request_data) # Test response response_data = { "error": None, "graphql_query": "query { test }", "variables": {}, "detected_schemas": ["test"], "confidence": 0.9 } assert serialize_deserialize_test(QuestionToStructuredQueryResponse, response_data) def test_structured_query_serialization(self): """Test structured query request/response serialization contract""" # Test request request_data = { "question": "Show me all customers" } assert serialize_deserialize_test(StructuredQueryRequest, request_data) # Test response response_data = { "error": None, "data": '{"customers": [{"id": "1", "name": "John"}]}', "errors": [] } assert serialize_deserialize_test(StructuredQueryResponse, response_data) def test_extracted_object_batch_serialization(self): """Test ExtractedObject batch serialization contract""" # Arrange metadata = Metadata(id="test", user="user", collection="col", metadata=[]) batch_object_data = { "metadata": metadata, "schema_name": "test_schema", "values": [ {"field1": "value1", "field2": "value2"}, {"field1": "value3", "field2": "value4"}, {"field1": "value5", "field2": "value6"} ], "confidence": 0.9, "source_span": "batch test span" } # Act & Assert assert serialize_deserialize_test(ExtractedObject, batch_object_data) def test_extracted_object_empty_batch_serialization(self): """Test ExtractedObject empty batch serialization contract""" # Arrange metadata = Metadata(id="test", user="user", collection="col", metadata=[]) empty_batch_data = { "metadata": metadata, "schema_name": "test_schema", "values": [], "confidence": 1.0, "source_span": "empty batch" } # Act & Assert assert serialize_deserialize_test(ExtractedObject, empty_batch_data)