mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 16:36:21 +02:00
parent
ebca467ed8
commit
0b7620bc04
12 changed files with 946 additions and 107 deletions
|
|
@ -30,11 +30,11 @@ class TestObjectsCassandraContracts:
|
|||
test_object = ExtractedObject(
|
||||
metadata=test_metadata,
|
||||
schema_name="customer_records",
|
||||
values={
|
||||
values=[{
|
||||
"customer_id": "CUST123",
|
||||
"name": "Test Customer",
|
||||
"email": "test@example.com"
|
||||
},
|
||||
}],
|
||||
confidence=0.95,
|
||||
source_span="Customer data from document..."
|
||||
)
|
||||
|
|
@ -54,7 +54,7 @@ class TestObjectsCassandraContracts:
|
|||
|
||||
# Verify types
|
||||
assert isinstance(test_object.schema_name, str)
|
||||
assert isinstance(test_object.values, dict)
|
||||
assert isinstance(test_object.values, list)
|
||||
assert isinstance(test_object.confidence, float)
|
||||
assert isinstance(test_object.source_span, str)
|
||||
|
||||
|
|
@ -200,7 +200,7 @@ class TestObjectsCassandraContracts:
|
|||
metadata=[]
|
||||
),
|
||||
schema_name="test_schema",
|
||||
values={"field1": "value1", "field2": "123"},
|
||||
values=[{"field1": "value1", "field2": "123"}],
|
||||
confidence=0.85,
|
||||
source_span="Test span"
|
||||
)
|
||||
|
|
@ -292,7 +292,7 @@ class TestObjectsCassandraContracts:
|
|||
metadata=[{"key": "value"}]
|
||||
),
|
||||
schema_name="table789", # -> table name
|
||||
values={"field": "value"},
|
||||
values=[{"field": "value"}],
|
||||
confidence=0.9,
|
||||
source_span="Source"
|
||||
)
|
||||
|
|
@ -303,4 +303,215 @@ class TestObjectsCassandraContracts:
|
|||
# - metadata.collection -> Part of primary key
|
||||
assert test_obj.metadata.user # Required for keyspace
|
||||
assert test_obj.schema_name # Required for table
|
||||
assert test_obj.metadata.collection # Required for partition key
|
||||
assert test_obj.metadata.collection # Required for partition key
|
||||
|
||||
|
||||
@pytest.mark.contract
class TestObjectsCassandraContractsBatch:
    """Contract checks for ExtractedObject messages whose ``values`` is a batch.

    Every test here builds an ExtractedObject with ``values`` given as a list
    of per-row dicts (three rows, zero rows, or exactly one row) and asserts
    on the resulting message shape.
    """

    def test_extracted_object_batch_input_contract(self):
        """A three-row batch exposes ``values`` as a list of per-row dicts."""
        customer_rows = [
            {
                "customer_id": "CUST123",
                "name": "Test Customer 1",
                "email": "test1@example.com",
            },
            {
                "customer_id": "CUST124",
                "name": "Test Customer 2",
                "email": "test2@example.com",
            },
            {
                "customer_id": "CUST125",
                "name": "Test Customer 3",
                "email": "test3@example.com",
            },
        ]

        batch_object = ExtractedObject(
            metadata=Metadata(
                id="batch-doc-001",
                user="test_user",
                collection="test_collection",
                metadata=[],
            ),
            schema_name="customer_records",
            values=customer_rows,
            confidence=0.88,
            source_span="Multiple customer data from document...",
        )

        # Shape of the batch container itself.
        assert hasattr(batch_object, 'values')
        rows = batch_object.values
        assert isinstance(rows, list)
        assert len(rows) == 3

        # Shape and content of every row; ordinal is 1-based so it lines up
        # with the "Test Customer N" names and the CUST123..CUST125 ids.
        for ordinal, row in enumerate(rows, start=1):
            assert isinstance(row, dict)
            for required_key in ("customer_id", "name", "email"):
                assert required_key in row
            assert row["customer_id"] == f"CUST12{ordinal + 2}"
            assert f"Test Customer {ordinal}" in row["name"]

    def test_extracted_object_empty_batch_contract(self):
        """An ExtractedObject may carry an empty ``values`` list."""
        empty_batch_object = ExtractedObject(
            metadata=Metadata(
                id="empty-batch-001",
                user="test_user",
                collection="test_collection",
                metadata=[],
            ),
            schema_name="empty_schema",
            values=[],  # no rows at all
            confidence=1.0,
            source_span="No objects found in document",
        )

        assert hasattr(empty_batch_object, 'values')
        rows = empty_batch_object.values
        assert isinstance(rows, list)
        assert len(rows) == 0
        assert empty_batch_object.confidence == 1.0

    def test_extracted_object_single_item_batch_contract(self):
        """A lone object is expressed as a one-element list (backward compatibility)."""
        only_customer = {
            "customer_id": "CUST999",
            "name": "Single Customer",
            "email": "single@example.com",
        }

        single_batch_object = ExtractedObject(
            metadata=Metadata(
                id="single-batch-001",
                user="test_user",
                collection="test_collection",
                metadata=[],
            ),
            schema_name="customer_records",
            values=[only_customer],  # single row still wrapped in a list
            confidence=0.95,
            source_span="Single customer data from document...",
        )

        rows = single_batch_object.values
        assert isinstance(rows, list)
        assert len(rows) == 1

        first_row = rows[0]
        assert isinstance(first_row, dict)
        assert first_row["customer_id"] == "CUST999"

    def test_extracted_object_batch_serialization_contract(self):
        """A batched ExtractedObject round-trips through AvroSchema encode/decode."""
        original = ExtractedObject(
            metadata=Metadata(
                id="batch-serial-001",
                user="test_user",
                collection="test_coll",
                metadata=[],
            ),
            schema_name="test_schema",
            values=[
                {"field1": "value1", "field2": "123"},
                {"field1": "value2", "field2": "456"},
                {"field1": "value3", "field2": "789"},
            ],
            confidence=0.92,
            source_span="Batch test span",
        )

        # Encode with the schema, then immediately decode the payload again.
        codec = AvroSchema(ExtractedObject)
        decoded = codec.decode(codec.encode(original))

        # Envelope fields must be unchanged by the round trip.
        for attr in ("id", "user", "collection"):
            assert getattr(decoded.metadata, attr) == getattr(original.metadata, attr)
        assert decoded.schema_name == original.schema_name
        assert len(decoded.values) == len(original.values)
        assert len(decoded.values) == 3

        # Row-by-row comparison; field2 walks 123, 456, 789 (step 333).
        for idx, (after, before) in enumerate(zip(decoded.values, original.values)):
            assert after == before
            assert after["field1"] == f"value{idx + 1}"
            assert after["field2"] == str(123 + 333 * idx)

        assert decoded.confidence == original.confidence
        assert decoded.source_span == original.source_span

    def test_batch_processing_field_validation_contract(self):
        """Illustrate consistent versus inconsistent field sets within a batch.

        Only literal data is inspected here; the expectation that all rows of
        a batch share one field structure is a contract the application itself
        should enforce.
        """
        # Every row carries exactly the keys id / name / value.
        valid_batch_values = [
            {"id": "1", "name": "Item 1", "value": "100"},
            {"id": "2", "name": "Item 2", "value": "200"},
            {"id": "3", "name": "Item 3", "value": "300"},
        ]
        valid_shapes = {frozenset(row) for row in valid_batch_values}
        assert len(valid_shapes) == 1, "All batch items should have consistent fields"

        # Rows disagree on their keys (this would be caught by application logic).
        invalid_batch_values = [
            {"id": "1", "name": "Item 1", "value": "100"},
            {"id": "2", "name": "Item 2"},  # 'value' is missing
            {"id": "3", "name": "Item 3", "value": "300", "extra": "field"},  # one key too many
        ]
        invalid_shapes = {frozenset(row) for row in invalid_batch_values}
        assert len(invalid_shapes) != 1, "Invalid batch should have inconsistent fields"

    def test_batch_storage_partition_key_contract(self):
        """Rows of one batch share user/collection and have distinct ids.

        Stated expectations for Cassandra storage of a batch:
          1. all rows belong to the same collection (partition key component),
          2. primary keys are unique within the batch,
          3. all rows are stored in the same keyspace (user).
        """
        batch_object = ExtractedObject(
            metadata=Metadata(
                id="partition-test-001",
                user="consistent_user",  # one keyspace for the whole batch
                collection="consistent_collection",  # one partition for the whole batch
                metadata=[],
            ),
            schema_name="partition_test",
            values=[
                {"id": "pk1", "data": "data1"},
                {"id": "pk2", "data": "data2"},
                {"id": "pk3", "data": "data3"},
            ],
            confidence=0.95,
            source_span="Partition consistency test",
        )

        shared = batch_object.metadata
        assert shared.user  # needed for the keyspace
        assert shared.collection  # needed for the partition key

        # No id may occur twice inside the batch.
        ids = tuple(row["id"] for row in batch_object.values)
        assert len(set(ids)) == len(ids), "Primary keys must be unique within batch"

        # Keyspace and partition are common to every row because they come
        # from the single metadata.user / metadata.collection on the message.
|
||||
Loading…
Add table
Add a link
Reference in a new issue