mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-06-06 21:35:13 +02:00
Structured data 2 (#645)
* Structured data refactor - multi-index tables, remove need for manual mods to the Cassandra tables * Tech spec updated to track implementation
This commit is contained in:
parent
5ffad92345
commit
1809c1f56d
87 changed files with 5233 additions and 3235 deletions
|
|
@ -1,8 +1,8 @@
|
|||
"""
|
||||
Contract tests for Cassandra Object Storage
|
||||
Contract tests for Cassandra Row Storage
|
||||
|
||||
These tests verify the message contracts and schema compatibility
|
||||
for the objects storage processor.
|
||||
for the rows storage processor.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
|
@ -10,12 +10,12 @@ import json
|
|||
from pulsar.schema import AvroSchema
|
||||
|
||||
from trustgraph.schema import ExtractedObject, Metadata, RowSchema, Field
|
||||
from trustgraph.storage.objects.cassandra.write import Processor
|
||||
from trustgraph.storage.rows.cassandra.write import Processor
|
||||
|
||||
|
||||
@pytest.mark.contract
|
||||
class TestObjectsCassandraContracts:
|
||||
"""Contract tests for Cassandra object storage messages"""
|
||||
class TestRowsCassandraContracts:
|
||||
"""Contract tests for Cassandra row storage messages"""
|
||||
|
||||
def test_extracted_object_input_contract(self):
|
||||
"""Test that ExtractedObject schema matches expected input format"""
|
||||
|
|
@ -145,50 +145,6 @@ class TestObjectsCassandraContracts:
|
|||
assert required_field_keys.issubset(field.keys())
|
||||
assert set(field.keys()).issubset(required_field_keys | optional_field_keys)
|
||||
|
||||
def test_cassandra_type_mapping_contract(self):
|
||||
"""Test that all supported field types have Cassandra mappings"""
|
||||
processor = Processor.__new__(Processor)
|
||||
|
||||
# All field types that should be supported
|
||||
supported_types = [
|
||||
("string", "text"),
|
||||
("integer", "int"), # or bigint based on size
|
||||
("float", "float"), # or double based on size
|
||||
("boolean", "boolean"),
|
||||
("timestamp", "timestamp"),
|
||||
("date", "date"),
|
||||
("time", "time"),
|
||||
("uuid", "uuid")
|
||||
]
|
||||
|
||||
for field_type, expected_cassandra_type in supported_types:
|
||||
cassandra_type = processor.get_cassandra_type(field_type)
|
||||
# For integer and float, the exact type depends on size
|
||||
if field_type in ["integer", "float"]:
|
||||
assert cassandra_type in ["int", "bigint", "float", "double"]
|
||||
else:
|
||||
assert cassandra_type == expected_cassandra_type
|
||||
|
||||
def test_value_conversion_contract(self):
|
||||
"""Test value conversion for all supported types"""
|
||||
processor = Processor.__new__(Processor)
|
||||
|
||||
# Test conversions maintain data integrity
|
||||
test_cases = [
|
||||
# (input_value, field_type, expected_output, expected_type)
|
||||
("123", "integer", 123, int),
|
||||
("123.45", "float", 123.45, float),
|
||||
("true", "boolean", True, bool),
|
||||
("false", "boolean", False, bool),
|
||||
("test string", "string", "test string", str),
|
||||
(None, "string", None, type(None)),
|
||||
]
|
||||
|
||||
for input_val, field_type, expected_val, expected_type in test_cases:
|
||||
result = processor.convert_value(input_val, field_type)
|
||||
assert result == expected_val
|
||||
assert isinstance(result, expected_type) or result is None
|
||||
|
||||
@pytest.mark.skip(reason="ExtractedObject is a dataclass, not a Pulsar Record type")
|
||||
def test_extracted_object_serialization_contract(self):
|
||||
"""Test that ExtractedObject can be serialized/deserialized correctly"""
|
||||
|
|
@ -222,43 +178,31 @@ class TestObjectsCassandraContracts:
|
|||
assert decoded.confidence == original.confidence
|
||||
assert decoded.source_span == original.source_span
|
||||
|
||||
def test_cassandra_table_naming_contract(self):
|
||||
def test_cassandra_name_sanitization_contract(self):
|
||||
"""Test Cassandra naming conventions and constraints"""
|
||||
processor = Processor.__new__(Processor)
|
||||
|
||||
# Test table naming (always gets o_ prefix)
|
||||
table_test_names = [
|
||||
("simple_name", "o_simple_name"),
|
||||
("Name-With-Dashes", "o_name_with_dashes"),
|
||||
("name.with.dots", "o_name_with_dots"),
|
||||
("123_numbers", "o_123_numbers"),
|
||||
("special!@#chars", "o_special___chars"), # 3 special chars become 3 underscores
|
||||
("UPPERCASE", "o_uppercase"),
|
||||
("CamelCase", "o_camelcase"),
|
||||
("", "o_"), # Edge case - empty string becomes o_
|
||||
]
|
||||
|
||||
for input_name, expected_name in table_test_names:
|
||||
result = processor.sanitize_table(input_name)
|
||||
assert result == expected_name
|
||||
# Verify result is valid Cassandra identifier (starts with letter)
|
||||
assert result.startswith('o_')
|
||||
assert result.replace('o_', '').replace('_', '').isalnum() or result == 'o_'
|
||||
|
||||
# Test regular name sanitization (only adds o_ prefix if starts with number)
|
||||
|
||||
# Test name sanitization for Cassandra identifiers
|
||||
# - Non-alphanumeric chars (except underscore) become underscores
|
||||
# - Names starting with non-letter get 'r_' prefix
|
||||
# - All names converted to lowercase
|
||||
name_test_cases = [
|
||||
("simple_name", "simple_name"),
|
||||
("Name-With-Dashes", "name_with_dashes"),
|
||||
("name.with.dots", "name_with_dots"),
|
||||
("123_numbers", "o_123_numbers"), # Only this gets o_ prefix
|
||||
("123_numbers", "r_123_numbers"), # Gets r_ prefix (starts with number)
|
||||
("special!@#chars", "special___chars"), # 3 special chars become 3 underscores
|
||||
("UPPERCASE", "uppercase"),
|
||||
("CamelCase", "camelcase"),
|
||||
("_underscore_start", "r__underscore_start"), # Gets r_ prefix (starts with underscore)
|
||||
]
|
||||
|
||||
|
||||
for input_name, expected_name in name_test_cases:
|
||||
result = processor.sanitize_name(input_name)
|
||||
assert result == expected_name
|
||||
assert result == expected_name, f"Expected {expected_name} but got {result} for input {input_name}"
|
||||
# Verify result is valid Cassandra identifier (starts with letter)
|
||||
if result: # Skip empty string case
|
||||
assert result[0].isalpha(), f"Result {result} should start with a letter"
|
||||
|
||||
def test_primary_key_structure_contract(self):
|
||||
"""Test that primary key structure follows Cassandra best practices"""
|
||||
|
|
@ -308,8 +252,8 @@ class TestObjectsCassandraContracts:
|
|||
|
||||
|
||||
@pytest.mark.contract
|
||||
class TestObjectsCassandraContractsBatch:
|
||||
"""Contract tests for Cassandra object storage batch processing"""
|
||||
class TestRowsCassandraContractsBatch:
|
||||
"""Contract tests for Cassandra row storage batch processing"""
|
||||
|
||||
def test_extracted_object_batch_input_contract(self):
|
||||
"""Test that batched ExtractedObject schema matches expected input format"""
|
||||
|
|
@ -1,26 +1,26 @@
|
|||
"""
|
||||
Contract tests for Objects GraphQL Query Service
|
||||
Contract tests for Rows GraphQL Query Service
|
||||
|
||||
These tests verify the message contracts and schema compatibility
|
||||
for the objects GraphQL query processor.
|
||||
for the rows GraphQL query processor.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import json
|
||||
from pulsar.schema import AvroSchema
|
||||
|
||||
from trustgraph.schema import ObjectsQueryRequest, ObjectsQueryResponse, GraphQLError
|
||||
from trustgraph.query.objects.cassandra.service import Processor
|
||||
from trustgraph.schema import RowsQueryRequest, RowsQueryResponse, GraphQLError
|
||||
from trustgraph.query.rows.cassandra.service import Processor
|
||||
|
||||
|
||||
@pytest.mark.contract
|
||||
class TestObjectsGraphQLQueryContracts:
|
||||
class TestRowsGraphQLQueryContracts:
|
||||
"""Contract tests for GraphQL query service messages"""
|
||||
|
||||
def test_objects_query_request_contract(self):
|
||||
"""Test ObjectsQueryRequest schema structure and required fields"""
|
||||
def test_rows_query_request_contract(self):
|
||||
"""Test RowsQueryRequest schema structure and required fields"""
|
||||
# Create test request with all required fields
|
||||
test_request = ObjectsQueryRequest(
|
||||
test_request = RowsQueryRequest(
|
||||
user="test_user",
|
||||
collection="test_collection",
|
||||
query='{ customers { id name email } }',
|
||||
|
|
@ -49,10 +49,10 @@ class TestObjectsGraphQLQueryContracts:
|
|||
assert test_request.variables["status"] == "active"
|
||||
assert test_request.operation_name == "GetCustomers"
|
||||
|
||||
def test_objects_query_request_minimal(self):
|
||||
"""Test ObjectsQueryRequest with minimal required fields"""
|
||||
def test_rows_query_request_minimal(self):
|
||||
"""Test RowsQueryRequest with minimal required fields"""
|
||||
# Create request with only essential fields
|
||||
minimal_request = ObjectsQueryRequest(
|
||||
minimal_request = RowsQueryRequest(
|
||||
user="user",
|
||||
collection="collection",
|
||||
query='{ test }',
|
||||
|
|
@ -91,10 +91,10 @@ class TestObjectsGraphQLQueryContracts:
|
|||
assert test_error.path == ["customers", "0", "nonexistent"]
|
||||
assert test_error.extensions["code"] == "FIELD_ERROR"
|
||||
|
||||
def test_objects_query_response_success_contract(self):
|
||||
"""Test ObjectsQueryResponse schema for successful queries"""
|
||||
def test_rows_query_response_success_contract(self):
|
||||
"""Test RowsQueryResponse schema for successful queries"""
|
||||
# Create successful response
|
||||
success_response = ObjectsQueryResponse(
|
||||
success_response = RowsQueryResponse(
|
||||
error=None,
|
||||
data='{"customers": [{"id": "1", "name": "John", "email": "john@example.com"}]}',
|
||||
errors=[],
|
||||
|
|
@ -119,11 +119,11 @@ class TestObjectsGraphQLQueryContracts:
|
|||
assert len(parsed_data["customers"]) == 1
|
||||
assert parsed_data["customers"][0]["id"] == "1"
|
||||
|
||||
def test_objects_query_response_error_contract(self):
|
||||
"""Test ObjectsQueryResponse schema for error cases"""
|
||||
def test_rows_query_response_error_contract(self):
|
||||
"""Test RowsQueryResponse schema for error cases"""
|
||||
# Create GraphQL errors - work around Pulsar Array(Record) validation bug
|
||||
# by creating a response without the problematic errors array first
|
||||
error_response = ObjectsQueryResponse(
|
||||
error_response = RowsQueryResponse(
|
||||
error=None, # System error is None - these are GraphQL errors
|
||||
data=None, # No data due to errors
|
||||
errors=[], # Empty errors array to avoid Pulsar bug
|
||||
|
|
@ -160,14 +160,14 @@ class TestObjectsGraphQLQueryContracts:
|
|||
assert validation_error.path == ["customers", "email"]
|
||||
assert validation_error.extensions["details"] == "Invalid email format"
|
||||
|
||||
def test_objects_query_response_system_error_contract(self):
|
||||
"""Test ObjectsQueryResponse schema for system errors"""
|
||||
def test_rows_query_response_system_error_contract(self):
|
||||
"""Test RowsQueryResponse schema for system errors"""
|
||||
from trustgraph.schema import Error
|
||||
|
||||
# Create system error response
|
||||
system_error_response = ObjectsQueryResponse(
|
||||
system_error_response = RowsQueryResponse(
|
||||
error=Error(
|
||||
type="objects-query-error",
|
||||
type="rows-query-error",
|
||||
message="Failed to connect to Cassandra cluster"
|
||||
),
|
||||
data=None,
|
||||
|
|
@ -177,7 +177,7 @@ class TestObjectsGraphQLQueryContracts:
|
|||
|
||||
# Verify system error structure
|
||||
assert system_error_response.error is not None
|
||||
assert system_error_response.error.type == "objects-query-error"
|
||||
assert system_error_response.error.type == "rows-query-error"
|
||||
assert "Cassandra" in system_error_response.error.message
|
||||
assert system_error_response.data is None
|
||||
assert len(system_error_response.errors) == 0
|
||||
|
|
@ -186,7 +186,7 @@ class TestObjectsGraphQLQueryContracts:
|
|||
def test_request_response_serialization_contract(self):
|
||||
"""Test that request/response can be serialized/deserialized correctly"""
|
||||
# Create original request
|
||||
original_request = ObjectsQueryRequest(
|
||||
original_request = RowsQueryRequest(
|
||||
user="serialization_test",
|
||||
collection="test_data",
|
||||
query='{ orders(limit: 5) { id total customer { name } } }',
|
||||
|
|
@ -195,7 +195,7 @@ class TestObjectsGraphQLQueryContracts:
|
|||
)
|
||||
|
||||
# Test request serialization using Pulsar schema
|
||||
request_schema = AvroSchema(ObjectsQueryRequest)
|
||||
request_schema = AvroSchema(RowsQueryRequest)
|
||||
|
||||
# Encode and decode request
|
||||
encoded_request = request_schema.encode(original_request)
|
||||
|
|
@ -209,7 +209,7 @@ class TestObjectsGraphQLQueryContracts:
|
|||
assert decoded_request.operation_name == original_request.operation_name
|
||||
|
||||
# Create original response - work around Pulsar Array(Record) bug
|
||||
original_response = ObjectsQueryResponse(
|
||||
original_response = RowsQueryResponse(
|
||||
error=None,
|
||||
data='{"orders": []}',
|
||||
errors=[], # Empty to avoid Pulsar validation bug
|
||||
|
|
@ -224,7 +224,7 @@ class TestObjectsGraphQLQueryContracts:
|
|||
)
|
||||
|
||||
# Test response serialization
|
||||
response_schema = AvroSchema(ObjectsQueryResponse)
|
||||
response_schema = AvroSchema(RowsQueryResponse)
|
||||
|
||||
# Encode and decode response
|
||||
encoded_response = response_schema.encode(original_response)
|
||||
|
|
@ -244,7 +244,7 @@ class TestObjectsGraphQLQueryContracts:
|
|||
def test_graphql_query_format_contract(self):
|
||||
"""Test supported GraphQL query formats"""
|
||||
# Test basic query
|
||||
basic_query = ObjectsQueryRequest(
|
||||
basic_query = RowsQueryRequest(
|
||||
user="test", collection="test", query='{ customers { id } }',
|
||||
variables={}, operation_name=""
|
||||
)
|
||||
|
|
@ -253,7 +253,7 @@ class TestObjectsGraphQLQueryContracts:
|
|||
assert basic_query.query.strip().endswith('}')
|
||||
|
||||
# Test query with variables
|
||||
parameterized_query = ObjectsQueryRequest(
|
||||
parameterized_query = RowsQueryRequest(
|
||||
user="test", collection="test",
|
||||
query='query GetCustomers($status: String, $limit: Int) { customers(status: $status, limit: $limit) { id name } }',
|
||||
variables={"status": "active", "limit": "10"},
|
||||
|
|
@ -265,7 +265,7 @@ class TestObjectsGraphQLQueryContracts:
|
|||
assert parameterized_query.operation_name == "GetCustomers"
|
||||
|
||||
# Test complex nested query
|
||||
nested_query = ObjectsQueryRequest(
|
||||
nested_query = RowsQueryRequest(
|
||||
user="test", collection="test",
|
||||
query='''
|
||||
{
|
||||
|
|
@ -296,7 +296,7 @@ class TestObjectsGraphQLQueryContracts:
|
|||
# Note: Current schema uses Map(String()) which only supports string values
|
||||
# This test verifies the current contract, though ideally we'd support all JSON types
|
||||
|
||||
variables_test = ObjectsQueryRequest(
|
||||
variables_test = RowsQueryRequest(
|
||||
user="test", collection="test", query='{ test }',
|
||||
variables={
|
||||
"string_var": "test_value",
|
||||
|
|
@ -319,7 +319,7 @@ class TestObjectsGraphQLQueryContracts:
|
|||
def test_cassandra_context_fields_contract(self):
|
||||
"""Test that request contains necessary fields for Cassandra operations"""
|
||||
# Verify request has fields needed for Cassandra keyspace/table targeting
|
||||
request = ObjectsQueryRequest(
|
||||
request = RowsQueryRequest(
|
||||
user="keyspace_name", # Maps to Cassandra keyspace
|
||||
collection="partition_collection", # Used in partition key
|
||||
query='{ objects { id } }',
|
||||
|
|
@ -338,7 +338,7 @@ class TestObjectsGraphQLQueryContracts:
|
|||
def test_graphql_extensions_contract(self):
|
||||
"""Test GraphQL extensions field format and usage"""
|
||||
# Extensions should support query metadata
|
||||
response_with_extensions = ObjectsQueryResponse(
|
||||
response_with_extensions = RowsQueryResponse(
|
||||
error=None,
|
||||
data='{"test": "data"}',
|
||||
errors=[],
|
||||
|
|
@ -404,7 +404,7 @@ class TestObjectsGraphQLQueryContracts:
|
|||
'''
|
||||
|
||||
# Request to execute specific operation
|
||||
multi_op_request = ObjectsQueryRequest(
|
||||
multi_op_request = RowsQueryRequest(
|
||||
user="test", collection="test",
|
||||
query=multi_op_query,
|
||||
variables={},
|
||||
|
|
@ -417,7 +417,7 @@ class TestObjectsGraphQLQueryContracts:
|
|||
assert "GetOrders" in multi_op_request.query
|
||||
|
||||
# Test single operation (operation_name optional)
|
||||
single_op_request = ObjectsQueryRequest(
|
||||
single_op_request = RowsQueryRequest(
|
||||
user="test", collection="test",
|
||||
query='{ customers { id } }',
|
||||
variables={}, operation_name=""
|
||||
Loading…
Add table
Add a link
Reference in a new issue