Structured data 2 (#645)

* Structured data refactor - multi-index tables, remove need for manual mods to the Cassandra tables

* Tech spec updated to track implementation
This commit is contained in:
cybermaggedon 2026-02-23 15:56:29 +00:00 committed by GitHub
parent 5ffad92345
commit 1809c1f56d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
87 changed files with 5233 additions and 3235 deletions

View file

@ -1,8 +1,8 @@
"""
Contract tests for Cassandra Object Storage
Contract tests for Cassandra Row Storage
These tests verify the message contracts and schema compatibility
for the objects storage processor.
for the rows storage processor.
"""
import pytest
@ -10,12 +10,12 @@ import json
from pulsar.schema import AvroSchema
from trustgraph.schema import ExtractedObject, Metadata, RowSchema, Field
from trustgraph.storage.objects.cassandra.write import Processor
from trustgraph.storage.rows.cassandra.write import Processor
@pytest.mark.contract
class TestObjectsCassandraContracts:
"""Contract tests for Cassandra object storage messages"""
class TestRowsCassandraContracts:
"""Contract tests for Cassandra row storage messages"""
def test_extracted_object_input_contract(self):
"""Test that ExtractedObject schema matches expected input format"""
@ -145,50 +145,6 @@ class TestObjectsCassandraContracts:
assert required_field_keys.issubset(field.keys())
assert set(field.keys()).issubset(required_field_keys | optional_field_keys)
def test_cassandra_type_mapping_contract(self):
"""Test that all supported field types have Cassandra mappings"""
processor = Processor.__new__(Processor)
# All field types that should be supported
supported_types = [
("string", "text"),
("integer", "int"), # or bigint based on size
("float", "float"), # or double based on size
("boolean", "boolean"),
("timestamp", "timestamp"),
("date", "date"),
("time", "time"),
("uuid", "uuid")
]
for field_type, expected_cassandra_type in supported_types:
cassandra_type = processor.get_cassandra_type(field_type)
# For integer and float, the exact type depends on size
if field_type in ["integer", "float"]:
assert cassandra_type in ["int", "bigint", "float", "double"]
else:
assert cassandra_type == expected_cassandra_type
def test_value_conversion_contract(self):
"""Test value conversion for all supported types"""
processor = Processor.__new__(Processor)
# Test conversions maintain data integrity
test_cases = [
# (input_value, field_type, expected_output, expected_type)
("123", "integer", 123, int),
("123.45", "float", 123.45, float),
("true", "boolean", True, bool),
("false", "boolean", False, bool),
("test string", "string", "test string", str),
(None, "string", None, type(None)),
]
for input_val, field_type, expected_val, expected_type in test_cases:
result = processor.convert_value(input_val, field_type)
assert result == expected_val
assert isinstance(result, expected_type) or result is None
@pytest.mark.skip(reason="ExtractedObject is a dataclass, not a Pulsar Record type")
def test_extracted_object_serialization_contract(self):
"""Test that ExtractedObject can be serialized/deserialized correctly"""
@ -222,43 +178,31 @@ class TestObjectsCassandraContracts:
assert decoded.confidence == original.confidence
assert decoded.source_span == original.source_span
def test_cassandra_table_naming_contract(self):
def test_cassandra_name_sanitization_contract(self):
"""Test Cassandra naming conventions and constraints"""
processor = Processor.__new__(Processor)
# Test table naming (always gets o_ prefix)
table_test_names = [
("simple_name", "o_simple_name"),
("Name-With-Dashes", "o_name_with_dashes"),
("name.with.dots", "o_name_with_dots"),
("123_numbers", "o_123_numbers"),
("special!@#chars", "o_special___chars"), # 3 special chars become 3 underscores
("UPPERCASE", "o_uppercase"),
("CamelCase", "o_camelcase"),
("", "o_"), # Edge case - empty string becomes o_
]
for input_name, expected_name in table_test_names:
result = processor.sanitize_table(input_name)
assert result == expected_name
# Verify result is valid Cassandra identifier (starts with letter)
assert result.startswith('o_')
assert result.replace('o_', '').replace('_', '').isalnum() or result == 'o_'
# Test regular name sanitization (only adds o_ prefix if starts with number)
# Test name sanitization for Cassandra identifiers
# - Non-alphanumeric chars (except underscore) become underscores
# - Names starting with non-letter get 'r_' prefix
# - All names converted to lowercase
name_test_cases = [
("simple_name", "simple_name"),
("Name-With-Dashes", "name_with_dashes"),
("name.with.dots", "name_with_dots"),
("123_numbers", "o_123_numbers"), # Only this gets o_ prefix
("123_numbers", "r_123_numbers"), # Gets r_ prefix (starts with number)
("special!@#chars", "special___chars"), # 3 special chars become 3 underscores
("UPPERCASE", "uppercase"),
("CamelCase", "camelcase"),
("_underscore_start", "r__underscore_start"), # Gets r_ prefix (starts with underscore)
]
for input_name, expected_name in name_test_cases:
result = processor.sanitize_name(input_name)
assert result == expected_name
assert result == expected_name, f"Expected {expected_name} but got {result} for input {input_name}"
# Verify result is valid Cassandra identifier (starts with letter)
if result: # Skip empty string case
assert result[0].isalpha(), f"Result {result} should start with a letter"
def test_primary_key_structure_contract(self):
"""Test that primary key structure follows Cassandra best practices"""
@ -308,8 +252,8 @@ class TestObjectsCassandraContracts:
@pytest.mark.contract
class TestObjectsCassandraContractsBatch:
"""Contract tests for Cassandra object storage batch processing"""
class TestRowsCassandraContractsBatch:
"""Contract tests for Cassandra row storage batch processing"""
def test_extracted_object_batch_input_contract(self):
"""Test that batched ExtractedObject schema matches expected input format"""

View file

@ -1,26 +1,26 @@
"""
Contract tests for Objects GraphQL Query Service
Contract tests for Rows GraphQL Query Service
These tests verify the message contracts and schema compatibility
for the objects GraphQL query processor.
for the rows GraphQL query processor.
"""
import pytest
import json
from pulsar.schema import AvroSchema
from trustgraph.schema import ObjectsQueryRequest, ObjectsQueryResponse, GraphQLError
from trustgraph.query.objects.cassandra.service import Processor
from trustgraph.schema import RowsQueryRequest, RowsQueryResponse, GraphQLError
from trustgraph.query.rows.cassandra.service import Processor
@pytest.mark.contract
class TestObjectsGraphQLQueryContracts:
class TestRowsGraphQLQueryContracts:
"""Contract tests for GraphQL query service messages"""
def test_objects_query_request_contract(self):
"""Test ObjectsQueryRequest schema structure and required fields"""
def test_rows_query_request_contract(self):
"""Test RowsQueryRequest schema structure and required fields"""
# Create test request with all required fields
test_request = ObjectsQueryRequest(
test_request = RowsQueryRequest(
user="test_user",
collection="test_collection",
query='{ customers { id name email } }',
@ -49,10 +49,10 @@ class TestObjectsGraphQLQueryContracts:
assert test_request.variables["status"] == "active"
assert test_request.operation_name == "GetCustomers"
def test_objects_query_request_minimal(self):
"""Test ObjectsQueryRequest with minimal required fields"""
def test_rows_query_request_minimal(self):
"""Test RowsQueryRequest with minimal required fields"""
# Create request with only essential fields
minimal_request = ObjectsQueryRequest(
minimal_request = RowsQueryRequest(
user="user",
collection="collection",
query='{ test }',
@ -91,10 +91,10 @@ class TestObjectsGraphQLQueryContracts:
assert test_error.path == ["customers", "0", "nonexistent"]
assert test_error.extensions["code"] == "FIELD_ERROR"
def test_objects_query_response_success_contract(self):
"""Test ObjectsQueryResponse schema for successful queries"""
def test_rows_query_response_success_contract(self):
"""Test RowsQueryResponse schema for successful queries"""
# Create successful response
success_response = ObjectsQueryResponse(
success_response = RowsQueryResponse(
error=None,
data='{"customers": [{"id": "1", "name": "John", "email": "john@example.com"}]}',
errors=[],
@ -119,11 +119,11 @@ class TestObjectsGraphQLQueryContracts:
assert len(parsed_data["customers"]) == 1
assert parsed_data["customers"][0]["id"] == "1"
def test_objects_query_response_error_contract(self):
"""Test ObjectsQueryResponse schema for error cases"""
def test_rows_query_response_error_contract(self):
"""Test RowsQueryResponse schema for error cases"""
# Create GraphQL errors - work around Pulsar Array(Record) validation bug
# by creating a response without the problematic errors array first
error_response = ObjectsQueryResponse(
error_response = RowsQueryResponse(
error=None, # System error is None - these are GraphQL errors
data=None, # No data due to errors
errors=[], # Empty errors array to avoid Pulsar bug
@ -160,14 +160,14 @@ class TestObjectsGraphQLQueryContracts:
assert validation_error.path == ["customers", "email"]
assert validation_error.extensions["details"] == "Invalid email format"
def test_objects_query_response_system_error_contract(self):
"""Test ObjectsQueryResponse schema for system errors"""
def test_rows_query_response_system_error_contract(self):
"""Test RowsQueryResponse schema for system errors"""
from trustgraph.schema import Error
# Create system error response
system_error_response = ObjectsQueryResponse(
system_error_response = RowsQueryResponse(
error=Error(
type="objects-query-error",
type="rows-query-error",
message="Failed to connect to Cassandra cluster"
),
data=None,
@ -177,7 +177,7 @@ class TestObjectsGraphQLQueryContracts:
# Verify system error structure
assert system_error_response.error is not None
assert system_error_response.error.type == "objects-query-error"
assert system_error_response.error.type == "rows-query-error"
assert "Cassandra" in system_error_response.error.message
assert system_error_response.data is None
assert len(system_error_response.errors) == 0
@ -186,7 +186,7 @@ class TestObjectsGraphQLQueryContracts:
def test_request_response_serialization_contract(self):
"""Test that request/response can be serialized/deserialized correctly"""
# Create original request
original_request = ObjectsQueryRequest(
original_request = RowsQueryRequest(
user="serialization_test",
collection="test_data",
query='{ orders(limit: 5) { id total customer { name } } }',
@ -195,7 +195,7 @@ class TestObjectsGraphQLQueryContracts:
)
# Test request serialization using Pulsar schema
request_schema = AvroSchema(ObjectsQueryRequest)
request_schema = AvroSchema(RowsQueryRequest)
# Encode and decode request
encoded_request = request_schema.encode(original_request)
@ -209,7 +209,7 @@ class TestObjectsGraphQLQueryContracts:
assert decoded_request.operation_name == original_request.operation_name
# Create original response - work around Pulsar Array(Record) bug
original_response = ObjectsQueryResponse(
original_response = RowsQueryResponse(
error=None,
data='{"orders": []}',
errors=[], # Empty to avoid Pulsar validation bug
@ -224,7 +224,7 @@ class TestObjectsGraphQLQueryContracts:
)
# Test response serialization
response_schema = AvroSchema(ObjectsQueryResponse)
response_schema = AvroSchema(RowsQueryResponse)
# Encode and decode response
encoded_response = response_schema.encode(original_response)
@ -244,7 +244,7 @@ class TestObjectsGraphQLQueryContracts:
def test_graphql_query_format_contract(self):
"""Test supported GraphQL query formats"""
# Test basic query
basic_query = ObjectsQueryRequest(
basic_query = RowsQueryRequest(
user="test", collection="test", query='{ customers { id } }',
variables={}, operation_name=""
)
@ -253,7 +253,7 @@ class TestObjectsGraphQLQueryContracts:
assert basic_query.query.strip().endswith('}')
# Test query with variables
parameterized_query = ObjectsQueryRequest(
parameterized_query = RowsQueryRequest(
user="test", collection="test",
query='query GetCustomers($status: String, $limit: Int) { customers(status: $status, limit: $limit) { id name } }',
variables={"status": "active", "limit": "10"},
@ -265,7 +265,7 @@ class TestObjectsGraphQLQueryContracts:
assert parameterized_query.operation_name == "GetCustomers"
# Test complex nested query
nested_query = ObjectsQueryRequest(
nested_query = RowsQueryRequest(
user="test", collection="test",
query='''
{
@ -296,7 +296,7 @@ class TestObjectsGraphQLQueryContracts:
# Note: Current schema uses Map(String()) which only supports string values
# This test verifies the current contract, though ideally we'd support all JSON types
variables_test = ObjectsQueryRequest(
variables_test = RowsQueryRequest(
user="test", collection="test", query='{ test }',
variables={
"string_var": "test_value",
@ -319,7 +319,7 @@ class TestObjectsGraphQLQueryContracts:
def test_cassandra_context_fields_contract(self):
"""Test that request contains necessary fields for Cassandra operations"""
# Verify request has fields needed for Cassandra keyspace/table targeting
request = ObjectsQueryRequest(
request = RowsQueryRequest(
user="keyspace_name", # Maps to Cassandra keyspace
collection="partition_collection", # Used in partition key
query='{ objects { id } }',
@ -338,7 +338,7 @@ class TestObjectsGraphQLQueryContracts:
def test_graphql_extensions_contract(self):
"""Test GraphQL extensions field format and usage"""
# Extensions should support query metadata
response_with_extensions = ObjectsQueryResponse(
response_with_extensions = RowsQueryResponse(
error=None,
data='{"test": "data"}',
errors=[],
@ -404,7 +404,7 @@ class TestObjectsGraphQLQueryContracts:
'''
# Request to execute specific operation
multi_op_request = ObjectsQueryRequest(
multi_op_request = RowsQueryRequest(
user="test", collection="test",
query=multi_op_query,
variables={},
@ -417,7 +417,7 @@ class TestObjectsGraphQLQueryContracts:
assert "GetOrders" in multi_op_request.query
# Test single operation (operation_name optional)
single_op_request = ObjectsQueryRequest(
single_op_request = RowsQueryRequest(
user="test", collection="test",
query='{ customers { id } }',
variables={}, operation_name=""