Object batching (#499)

* Object batching

* Update tests
This commit is contained in:
cybermaggedon 2025-09-05 15:59:06 +01:00 committed by GitHub
parent ebca467ed8
commit 0b7620bc04
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
12 changed files with 946 additions and 107 deletions

View file

@ -66,11 +66,11 @@ def sample_objects_message():
"collection": "testcollection"
},
"schema_name": "person",
"values": {
"values": [{
"name": "John Doe",
"age": "30",
"city": "New York"
},
}],
"confidence": 0.95,
"source_span": "John Doe, age 30, lives in New York"
}
@ -86,9 +86,9 @@ def minimal_objects_message():
"collection": "testcollection"
},
"schema_name": "simple_schema",
"values": {
"values": [{
"field1": "value1"
}
}]
}
@ -235,8 +235,8 @@ class TestObjectsImportMessageProcessing:
sent_object = call_args[0][1]
assert isinstance(sent_object, ExtractedObject)
assert sent_object.schema_name == "person"
assert sent_object.values["name"] == "John Doe"
assert sent_object.values["age"] == "30"
assert sent_object.values[0]["name"] == "John Doe"
assert sent_object.values[0]["age"] == "30"
assert sent_object.confidence == 0.95
assert sent_object.source_span == "John Doe, age 30, lives in New York"
@ -274,7 +274,7 @@ class TestObjectsImportMessageProcessing:
sent_object = mock_publisher_instance.send.call_args[0][1]
assert isinstance(sent_object, ExtractedObject)
assert sent_object.schema_name == "simple_schema"
assert sent_object.values["field1"] == "value1"
assert sent_object.values[0]["field1"] == "value1"
assert sent_object.confidence == 1.0 # Default value
assert sent_object.source_span == "" # Default value
assert len(sent_object.metadata.metadata) == 0 # Default empty list
@ -302,7 +302,7 @@ class TestObjectsImportMessageProcessing:
"collection": "testcollection"
},
"schema_name": "test_schema",
"values": {"key": "value"}
"values": [{"key": "value"}]
# No confidence or source_span
}
@ -374,6 +374,134 @@ class TestObjectsImportRunMethod:
assert objects_import.ws is None
class TestObjectsImportBatchProcessing:
"""Test ObjectsImport batch processing functionality."""
@pytest.fixture
def batch_objects_message(self):
"""Sample batch objects message data."""
return {
"metadata": {
"id": "batch-001",
"metadata": [
{
"s": {"v": "batch-001", "e": False},
"p": {"v": "source", "e": False},
"o": {"v": "test", "e": False}
}
],
"user": "testuser",
"collection": "testcollection"
},
"schema_name": "person",
"values": [
{
"name": "John Doe",
"age": "30",
"city": "New York"
},
{
"name": "Jane Smith",
"age": "25",
"city": "Boston"
},
{
"name": "Bob Johnson",
"age": "45",
"city": "Chicago"
}
],
"confidence": 0.85,
"source_span": "Multiple people found in document"
}
@patch('trustgraph.gateway.dispatch.objects_import.Publisher')
@pytest.mark.asyncio
async def test_receive_processes_batch_message_correctly(self, mock_publisher_class, mock_pulsar_client, mock_websocket, mock_running, batch_objects_message):
"""Test that receive() processes batch message correctly."""
mock_publisher_instance = Mock()
mock_publisher_instance.send = AsyncMock()
mock_publisher_class.return_value = mock_publisher_instance
objects_import = ObjectsImport(
ws=mock_websocket,
running=mock_running,
pulsar_client=mock_pulsar_client,
queue="test-queue"
)
# Create mock message
mock_msg = Mock()
mock_msg.json.return_value = batch_objects_message
await objects_import.receive(mock_msg)
# Verify publisher.send was called
mock_publisher_instance.send.assert_called_once()
# Get the call arguments
call_args = mock_publisher_instance.send.call_args
assert call_args[0][0] is None # First argument should be None
# Check the ExtractedObject that was sent
sent_object = call_args[0][1]
assert isinstance(sent_object, ExtractedObject)
assert sent_object.schema_name == "person"
# Check that all batch values are present
assert len(sent_object.values) == 3
assert sent_object.values[0]["name"] == "John Doe"
assert sent_object.values[0]["age"] == "30"
assert sent_object.values[0]["city"] == "New York"
assert sent_object.values[1]["name"] == "Jane Smith"
assert sent_object.values[1]["age"] == "25"
assert sent_object.values[1]["city"] == "Boston"
assert sent_object.values[2]["name"] == "Bob Johnson"
assert sent_object.values[2]["age"] == "45"
assert sent_object.values[2]["city"] == "Chicago"
assert sent_object.confidence == 0.85
assert sent_object.source_span == "Multiple people found in document"
@patch('trustgraph.gateway.dispatch.objects_import.Publisher')
@pytest.mark.asyncio
async def test_receive_handles_empty_batch(self, mock_publisher_class, mock_pulsar_client, mock_websocket, mock_running):
"""Test that receive() handles empty batch correctly."""
mock_publisher_instance = Mock()
mock_publisher_instance.send = AsyncMock()
mock_publisher_class.return_value = mock_publisher_instance
objects_import = ObjectsImport(
ws=mock_websocket,
running=mock_running,
pulsar_client=mock_pulsar_client,
queue="test-queue"
)
# Message with empty values array
empty_batch_message = {
"metadata": {
"id": "empty-batch-001",
"user": "testuser",
"collection": "testcollection"
},
"schema_name": "empty_schema",
"values": []
}
mock_msg = Mock()
mock_msg.json.return_value = empty_batch_message
await objects_import.receive(mock_msg)
# Should still send the message
mock_publisher_instance.send.assert_called_once()
sent_object = mock_publisher_instance.send.call_args[0][1]
assert len(sent_object.values) == 0
class TestObjectsImportErrorHandling:
"""Test error handling in ObjectsImport."""

View file

@ -317,12 +317,12 @@ class TestObjectExtractionBusinessLogic:
metadata=[]
)
values = {
values = [{
"customer_id": "CUST001",
"name": "John Doe",
"email": "john@example.com",
"status": "active"
}
}]
# Act
extracted_obj = ExtractedObject(
@ -335,7 +335,7 @@ class TestObjectExtractionBusinessLogic:
# Assert
assert extracted_obj.schema_name == "customer_records"
assert extracted_obj.values["customer_id"] == "CUST001"
assert extracted_obj.values[0]["customer_id"] == "CUST001"
assert extracted_obj.confidence == 0.95
assert "John Doe" in extracted_obj.source_span
assert extracted_obj.metadata.user == "test_user"

View file

@ -261,7 +261,7 @@ class TestObjectsCassandraStorageLogic:
metadata=[]
),
schema_name="test_schema",
values={"id": "123", "value": "456"},
values=[{"id": "123", "value": "456"}],
confidence=0.9,
source_span="test source"
)
@ -284,8 +284,8 @@ class TestObjectsCassandraStorageLogic:
assert "INSERT INTO test_user.o_test_schema" in insert_cql
assert "collection" in insert_cql
assert values[0] == "test_collection" # collection value
assert values[1] == "123" # id value
assert values[2] == 456 # converted integer value
assert values[1] == "123" # id value (from values[0])
assert values[2] == 456 # converted integer value (from values[0])
def test_secondary_index_creation(self):
"""Test that secondary indexes are created for indexed fields"""
@ -325,4 +325,201 @@ class TestObjectsCassandraStorageLogic:
index_calls = [call[0][0] for call in calls if "CREATE INDEX" in call[0][0]]
assert len(index_calls) == 2
assert any("o_products_category_idx" in call for call in index_calls)
assert any("o_products_price_idx" in call for call in index_calls)
assert any("o_products_price_idx" in call for call in index_calls)
class TestObjectsCassandraStorageBatchLogic:
"""Test batch processing logic in Cassandra storage"""
@pytest.mark.asyncio
async def test_batch_object_processing_logic(self):
"""Test processing of batch ExtractedObjects"""
processor = MagicMock()
processor.schemas = {
"batch_schema": RowSchema(
name="batch_schema",
description="Test batch schema",
fields=[
Field(name="id", type="string", size=50, primary=True),
Field(name="name", type="string", size=100),
Field(name="value", type="integer", size=4)
]
)
}
processor.ensure_table = MagicMock()
processor.sanitize_name = Processor.sanitize_name.__get__(processor, Processor)
processor.sanitize_table = Processor.sanitize_table.__get__(processor, Processor)
processor.convert_value = Processor.convert_value.__get__(processor, Processor)
processor.session = MagicMock()
processor.on_object = Processor.on_object.__get__(processor, Processor)
# Create batch object with multiple values
batch_obj = ExtractedObject(
metadata=Metadata(
id="batch-001",
user="test_user",
collection="batch_collection",
metadata=[]
),
schema_name="batch_schema",
values=[
{"id": "001", "name": "First", "value": "100"},
{"id": "002", "name": "Second", "value": "200"},
{"id": "003", "name": "Third", "value": "300"}
],
confidence=0.95,
source_span="batch source"
)
# Create mock message
msg = MagicMock()
msg.value.return_value = batch_obj
# Process batch object
await processor.on_object(msg, None, None)
# Verify table was ensured once
processor.ensure_table.assert_called_once_with("test_user", "batch_schema", processor.schemas["batch_schema"])
# Verify 3 separate insert calls (one per batch item)
assert processor.session.execute.call_count == 3
# Check each insert call
calls = processor.session.execute.call_args_list
for i, call in enumerate(calls):
insert_cql = call[0][0]
values = call[0][1]
assert "INSERT INTO test_user.o_batch_schema" in insert_cql
assert "collection" in insert_cql
# Check values for each batch item
assert values[0] == "batch_collection" # collection
assert values[1] == f"00{i+1}" # id from batch item i
assert values[2] == f"First" if i == 0 else f"Second" if i == 1 else f"Third" # name
assert values[3] == (i+1) * 100 # converted integer value
@pytest.mark.asyncio
async def test_empty_batch_processing_logic(self):
"""Test processing of empty batch ExtractedObjects"""
processor = MagicMock()
processor.schemas = {
"empty_schema": RowSchema(
name="empty_schema",
fields=[Field(name="id", type="string", size=50, primary=True)]
)
}
processor.ensure_table = MagicMock()
processor.sanitize_name = Processor.sanitize_name.__get__(processor, Processor)
processor.sanitize_table = Processor.sanitize_table.__get__(processor, Processor)
processor.convert_value = Processor.convert_value.__get__(processor, Processor)
processor.session = MagicMock()
processor.on_object = Processor.on_object.__get__(processor, Processor)
# Create empty batch object
empty_batch_obj = ExtractedObject(
metadata=Metadata(
id="empty-001",
user="test_user",
collection="empty_collection",
metadata=[]
),
schema_name="empty_schema",
values=[], # Empty batch
confidence=1.0,
source_span="empty source"
)
msg = MagicMock()
msg.value.return_value = empty_batch_obj
# Process empty batch object
await processor.on_object(msg, None, None)
# Verify table was ensured
processor.ensure_table.assert_called_once()
# Verify no insert calls for empty batch
processor.session.execute.assert_not_called()
@pytest.mark.asyncio
async def test_single_item_batch_processing_logic(self):
"""Test processing of single-item batch (backward compatibility)"""
processor = MagicMock()
processor.schemas = {
"single_schema": RowSchema(
name="single_schema",
fields=[
Field(name="id", type="string", size=50, primary=True),
Field(name="data", type="string", size=100)
]
)
}
processor.ensure_table = MagicMock()
processor.sanitize_name = Processor.sanitize_name.__get__(processor, Processor)
processor.sanitize_table = Processor.sanitize_table.__get__(processor, Processor)
processor.convert_value = Processor.convert_value.__get__(processor, Processor)
processor.session = MagicMock()
processor.on_object = Processor.on_object.__get__(processor, Processor)
# Create single-item batch object (backward compatibility case)
single_batch_obj = ExtractedObject(
metadata=Metadata(
id="single-001",
user="test_user",
collection="single_collection",
metadata=[]
),
schema_name="single_schema",
values=[{"id": "single-1", "data": "single data"}], # Array with one item
confidence=0.8,
source_span="single source"
)
msg = MagicMock()
msg.value.return_value = single_batch_obj
# Process single-item batch object
await processor.on_object(msg, None, None)
# Verify table was ensured
processor.ensure_table.assert_called_once()
# Verify exactly one insert call
processor.session.execute.assert_called_once()
insert_cql = processor.session.execute.call_args[0][0]
values = processor.session.execute.call_args[0][1]
assert "INSERT INTO test_user.o_single_schema" in insert_cql
assert values[0] == "single_collection" # collection
assert values[1] == "single-1" # id value
assert values[2] == "single data" # data value
def test_batch_value_conversion_logic(self):
"""Test value conversion works correctly for batch items"""
processor = MagicMock()
processor.convert_value = Processor.convert_value.__get__(processor, Processor)
# Test various conversion scenarios that would occur in batch processing
test_cases = [
# Integer conversions for batch items
("123", "integer", 123),
("456", "integer", 456),
("789", "integer", 789),
# Float conversions for batch items
("12.5", "float", 12.5),
("34.7", "float", 34.7),
# Boolean conversions for batch items
("true", "boolean", True),
("false", "boolean", False),
("1", "boolean", True),
("0", "boolean", False),
# String conversions for batch items
(123, "string", "123"),
(45.6, "string", "45.6"),
]
for input_val, field_type, expected_output in test_cases:
result = processor.convert_value(input_val, field_type)
assert result == expected_output, f"Failed for {input_val} -> {field_type}: got {result}, expected {expected_output}"