Object batching (#499)

* Object batching

* Update tests
Commit authored by cybermaggedon on 2025-09-05 15:59:06 +01:00, committed via GitHub.
parent ebca467ed8
commit 0b7620bc04
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
12 changed files with 946 additions and 107 deletions

View file

@@ -30,11 +30,11 @@ class TestObjectsCassandraContracts:
test_object = ExtractedObject(
metadata=test_metadata,
schema_name="customer_records",
values={
values=[{
"customer_id": "CUST123",
"name": "Test Customer",
"email": "test@example.com"
},
}],
confidence=0.95,
source_span="Customer data from document..."
)
@@ -54,7 +54,7 @@ class TestObjectsCassandraContracts:
# Verify types
assert isinstance(test_object.schema_name, str)
assert isinstance(test_object.values, dict)
assert isinstance(test_object.values, list)
assert isinstance(test_object.confidence, float)
assert isinstance(test_object.source_span, str)
@@ -200,7 +200,7 @@ class TestObjectsCassandraContracts:
metadata=[]
),
schema_name="test_schema",
values={"field1": "value1", "field2": "123"},
values=[{"field1": "value1", "field2": "123"}],
confidence=0.85,
source_span="Test span"
)
@@ -292,7 +292,7 @@ class TestObjectsCassandraContracts:
metadata=[{"key": "value"}]
),
schema_name="table789", # -> table name
values={"field": "value"},
values=[{"field": "value"}],
confidence=0.9,
source_span="Source"
)
@@ -303,4 +303,215 @@ class TestObjectsCassandraContracts:
# - metadata.collection -> Part of primary key
assert test_obj.metadata.user # Required for keyspace
assert test_obj.schema_name # Required for table
assert test_obj.metadata.collection # Required for partition key
assert test_obj.metadata.collection # Required for partition key
@pytest.mark.contract
class TestObjectsCassandraContractsBatch:
    """Contract tests for Cassandra object storage batch processing"""

    def test_extracted_object_batch_input_contract(self):
        """Test that batched ExtractedObject schema matches expected input format"""
        # One metadata record is shared by every item in the batch
        meta = Metadata(
            id="batch-doc-001",
            user="test_user",
            collection="test_collection",
            metadata=[],
        )

        obj = ExtractedObject(
            metadata=meta,
            schema_name="customer_records",
            values=[
                {
                    "customer_id": "CUST123",
                    "name": "Test Customer 1",
                    "email": "test1@example.com",
                },
                {
                    "customer_id": "CUST124",
                    "name": "Test Customer 2",
                    "email": "test2@example.com",
                },
                {
                    "customer_id": "CUST125",
                    "name": "Test Customer 3",
                    "email": "test3@example.com",
                },
            ],
            confidence=0.88,
            source_span="Multiple customer data from document...",
        )

        # The batch is carried as a list of dicts on .values
        assert hasattr(obj, 'values')
        assert isinstance(obj.values, list)
        assert len(obj.values) == 3

        # Every element must be a dict exposing the expected fields
        for idx, item in enumerate(obj.values):
            assert isinstance(item, dict)
            assert "customer_id" in item
            assert "name" in item
            assert "email" in item
            assert item["customer_id"] == f"CUST12{3 + idx}"
            assert f"Test Customer {idx + 1}" in item["name"]

    def test_extracted_object_empty_batch_contract(self):
        """Test empty batch ExtractedObject contract"""
        meta = Metadata(
            id="empty-batch-001",
            user="test_user",
            collection="test_collection",
            metadata=[],
        )

        obj = ExtractedObject(
            metadata=meta,
            schema_name="empty_schema",
            values=[],  # an empty batch is a legal payload
            confidence=1.0,
            source_span="No objects found in document",
        )

        # An empty batch still presents a (zero-length) list
        assert hasattr(obj, 'values')
        assert isinstance(obj.values, list)
        assert not obj.values
        assert obj.confidence == 1.0

    def test_extracted_object_single_item_batch_contract(self):
        """Test single-item batch (backward compatibility) contract"""
        meta = Metadata(
            id="single-batch-001",
            user="test_user",
            collection="test_collection",
            metadata=[],
        )

        obj = ExtractedObject(
            metadata=meta,
            schema_name="customer_records",
            # A one-element list keeps the legacy single-object shape working
            values=[{
                "customer_id": "CUST999",
                "name": "Single Customer",
                "email": "single@example.com",
            }],
            confidence=0.95,
            source_span="Single customer data from document...",
        )

        # Still a list, containing exactly one dict
        assert isinstance(obj.values, list)
        assert len(obj.values) == 1
        only = obj.values[0]
        assert isinstance(only, dict)
        assert only["customer_id"] == "CUST999"

    def test_extracted_object_batch_serialization_contract(self):
        """Test that batched ExtractedObject can be serialized/deserialized correctly"""
        original = ExtractedObject(
            metadata=Metadata(
                id="batch-serial-001",
                user="test_user",
                collection="test_coll",
                metadata=[],
            ),
            schema_name="test_schema",
            values=[
                {"field1": "value1", "field2": "123"},
                {"field1": "value2", "field2": "456"},
                {"field1": "value3", "field2": "789"},
            ],
            confidence=0.92,
            source_span="Batch test span",
        )

        # Round-trip through the Avro schema used on the wire
        schema = AvroSchema(ExtractedObject)
        decoded = schema.decode(schema.encode(original))

        # Metadata and scalar fields must survive the round trip unchanged
        assert decoded.metadata.id == original.metadata.id
        assert decoded.metadata.user == original.metadata.user
        assert decoded.metadata.collection == original.metadata.collection
        assert decoded.schema_name == original.schema_name
        assert len(decoded.values) == len(original.values)
        assert len(decoded.values) == 3

        # Each batch element must come back equal, field by field
        for n, (got, want) in enumerate(zip(decoded.values, original.values)):
            assert got == want
            assert got["field1"] == f"value{n + 1}"
            assert got["field2"] == f"{123 + n * 333}"
        assert decoded.confidence == original.confidence
        assert decoded.source_span == original.source_span

    def test_batch_processing_field_validation_contract(self):
        """Test that batch processing validates field consistency"""
        # Contract: the application must enforce that every item in a batch
        # shares one field structure.

        # Valid batch - every item carries the same keys
        uniform = [
            {"id": "1", "name": "Item 1", "value": "100"},
            {"id": "2", "name": "Item 2", "value": "200"},
            {"id": "3", "name": "Item 3", "value": "300"},
        ]
        shapes = [set(row) for row in uniform]
        assert all(s == shapes[0] for s in shapes), "All batch items should have consistent fields"

        # Invalid batch - key sets differ; application logic is expected to
        # detect and reject this.
        malformed = [
            {"id": "1", "name": "Item 1", "value": "100"},
            {"id": "2", "name": "Item 2"},  # Missing 'value' field
            {"id": "3", "name": "Item 3", "value": "300", "extra": "field"},  # Extra field
        ]
        bad_shapes = [set(row) for row in malformed]
        assert not all(s == bad_shapes[0] for s in bad_shapes), "Invalid batch should have inconsistent fields"

    def test_batch_storage_partition_key_contract(self):
        """Test that batch objects maintain partition key consistency"""
        # Cassandra storage contract for a batch:
        #   1. one collection (partition key component) shared by all items
        #   2. unique primary keys within the batch
        #   3. one keyspace (user) shared by all items
        meta = Metadata(
            id="partition-test-001",
            user="consistent_user",              # Same keyspace
            collection="consistent_collection",  # Same partition
            metadata=[],
        )

        obj = ExtractedObject(
            metadata=meta,
            schema_name="partition_test",
            values=[
                {"id": "pk1", "data": "data1"},  # Unique primary key
                {"id": "pk2", "data": "data2"},  # Unique primary key
                {"id": "pk3", "data": "data3"},  # Unique primary key
            ],
            confidence=0.95,
            source_span="Partition consistency test",
        )

        # Keyspace and partition-key components must both be present
        assert obj.metadata.user  # Must have user for keyspace
        assert obj.metadata.collection  # Must have collection for partition key

        # No duplicate primary keys may appear within a single batch
        keys = [row["id"] for row in obj.values]
        assert len(keys) == len(set(keys)), "Primary keys must be unique within batch"
        # Sharing metadata.user / metadata.collection guarantees all items
        # land in the same keyspace and partition.