trustgraph/tests/integration/test_rows_graphql_query_integration.py
cybermaggedon 1809c1f56d
Structured data 2 (#645)
* Structured data refactor - multi-index tables, remove need for manual mods to the Cassandra tables

* Tech spec updated to track implementation
2026-02-23 15:56:29 +00:00

624 lines
No EOL
22 KiB
Python

"""
Integration tests for Rows GraphQL Query Service
These tests verify end-to-end functionality including:
- Real Cassandra database operations
- Full GraphQL query execution
- Schema generation and configuration handling
- Message processing with actual Pulsar schemas
"""
import pytest
import json
import asyncio
from unittest.mock import MagicMock, AsyncMock
# Check if Docker/testcontainers is available.
#
# The probe runs at import time so that the ``skipif`` markers below can be
# evaluated during collection, before any fixture is set up.
try:
    from testcontainers.cassandra import CassandraContainer
    import docker

    # Test Docker connection: an importable SDK does not guarantee that a
    # daemon is actually reachable, so ping it.
    docker.from_env().ping()
    DOCKER_AVAILABLE = True
except Exception:
    # Deliberately broad: a missing package and an unreachable daemon raise
    # unrelated exception types, and either one simply means "skip".
    DOCKER_AVAILABLE = False
    # Keep the name defined so later references do not raise NameError.
    CassandraContainer = None
from trustgraph.query.rows.cassandra.service import Processor
from trustgraph.schema import RowsQueryRequest, RowsQueryResponse, GraphQLError
from trustgraph.schema import RowSchema, Field, ExtractedObject, Metadata
@pytest.mark.integration
@pytest.mark.skipif(not DOCKER_AVAILABLE, reason="Docker/testcontainers not available")
class TestObjectsGraphQLQueryIntegration:
    """Integration tests with real Cassandra database.

    One Cassandra container is started per class (``cassandra_container``);
    each test gets its own ``Processor`` (``processor``) and loads the schema
    configuration it needs. Tests that write rows use distinct ``collection``
    values so they do not observe each other's data.
    """

    @pytest.fixture(scope="class")
    def cassandra_container(self):
        """Start Cassandra container for testing.

        Yields:
            The running ``CassandraContainer``; it is stopped when the
            ``with`` block exits after the last test of the class.
        """
        if not DOCKER_AVAILABLE:
            pytest.skip("Docker/testcontainers not available")

        with CassandraContainer("cassandra:3.11") as cassandra:
            # Intended as a readiness wait.
            # NOTE(review): confirm get_connection_url() actually blocks until
            # Cassandra accepts connections rather than just building a URL.
            cassandra.get_connection_url()
            yield cassandra

    @pytest.fixture
    def processor(self, cassandra_container):
        """Create processor with real Cassandra connection.

        Yields:
            A ``Processor`` configured with the container's host. On teardown
            any cluster the test opened is shut down so connections do not
            accumulate across tests.
        """
        # Extract host and port from container
        host = cassandra_container.get_container_host_ip()
        # NOTE(review): the mapped port is looked up but never handed to the
        # Processor below - verify the processor really reaches the container
        # when the host port differs from 9042.
        port = cassandra_container.get_exposed_port(9042)

        # Create processor
        processor = Processor(
            id="test-graphql-query",
            graph_host=host,
            # Note: testcontainer typically doesn't require auth
            graph_username=None,
            graph_password=None,
            config_type="schema",
        )

        # Override connection parameters for test container
        processor.graph_host = host
        processor.cluster = None
        processor.session = None

        yield processor

        # Teardown: release driver resources opened via connect_cassandra().
        # Presumably ``cluster`` is a cassandra-driver Cluster; the callable
        # check keeps teardown harmless if it is something else.
        cluster = getattr(processor, "cluster", None)
        shutdown = getattr(cluster, "shutdown", None)
        if callable(shutdown):
            shutdown()

    @pytest.fixture
    def sample_schema_config(self):
        """Sample schema configuration for testing.

        Returns:
            A config dict whose ``"schema"`` entry maps schema names
            (``customer``, ``order``) to JSON-encoded schema definitions.
        """
        return {
            "schema": {
                "customer": json.dumps({
                    "name": "customer",
                    "description": "Customer records",
                    "fields": [
                        {
                            "name": "customer_id",
                            "type": "string",
                            "primary_key": True,
                            "required": True,
                            "description": "Customer identifier"
                        },
                        {
                            "name": "name",
                            "type": "string",
                            "required": True,
                            "indexed": True,
                            "description": "Customer name"
                        },
                        {
                            "name": "email",
                            "type": "string",
                            "required": True,
                            "indexed": True,
                            "description": "Customer email"
                        },
                        {
                            "name": "status",
                            "type": "string",
                            "required": False,
                            "indexed": True,
                            "enum": ["active", "inactive", "pending"],
                            "description": "Customer status"
                        },
                        {
                            "name": "created_date",
                            "type": "timestamp",
                            "required": False,
                            "description": "Registration date"
                        }
                    ]
                }),
                "order": json.dumps({
                    "name": "order",
                    "description": "Order records",
                    "fields": [
                        {
                            "name": "order_id",
                            "type": "string",
                            "primary_key": True,
                            "required": True
                        },
                        {
                            "name": "customer_id",
                            "type": "string",
                            "required": True,
                            "indexed": True,
                            "description": "Related customer"
                        },
                        {
                            "name": "total",
                            "type": "float",
                            "required": True,
                            "description": "Order total amount"
                        },
                        {
                            "name": "status",
                            "type": "string",
                            "indexed": True,
                            "enum": ["pending", "processing", "shipped", "delivered"],
                            "description": "Order status"
                        }
                    ]
                })
            }
        }

    @pytest.mark.asyncio
    async def test_schema_configuration_and_generation(self, processor, sample_schema_config):
        """Test schema configuration loading and GraphQL schema generation"""
        # Load schema configuration
        await processor.on_schema_config(sample_schema_config, version=1)

        # Verify schemas were loaded
        assert len(processor.schemas) == 2
        assert "customer" in processor.schemas
        assert "order" in processor.schemas

        # Verify customer schema
        customer_schema = processor.schemas["customer"]
        assert customer_schema.name == "customer"
        assert len(customer_schema.fields) == 5

        # Find primary key field
        pk_field = next((f for f in customer_schema.fields if f.primary), None)
        assert pk_field is not None
        assert pk_field.name == "customer_id"

        # Verify GraphQL schema was generated
        assert processor.graphql_schema is not None
        assert len(processor.graphql_types) == 2
        assert "customer" in processor.graphql_types
        assert "order" in processor.graphql_types

    @pytest.mark.asyncio
    async def test_cassandra_connection_and_table_creation(self, processor, sample_schema_config):
        """Test Cassandra connection and dynamic table creation"""
        # Load schema configuration
        await processor.on_schema_config(sample_schema_config, version=1)

        # Connect to Cassandra
        processor.connect_cassandra()
        assert processor.session is not None

        # Create test keyspace and table
        keyspace = "test_user"
        schema_name = "customer"
        schema = processor.schemas[schema_name]

        # Ensure table creation
        processor.ensure_table(keyspace, schema_name, schema)

        # Verify keyspace and table tracking
        assert keyspace in processor.known_keyspaces
        assert keyspace in processor.known_tables

        # Verify table was created by querying Cassandra system tables
        safe_keyspace = processor.sanitize_name(keyspace)
        safe_table = processor.sanitize_table(schema_name)

        # Check if table exists
        table_query = """
            SELECT table_name FROM system_schema.tables
            WHERE keyspace_name = %s AND table_name = %s
        """
        result = processor.session.execute(table_query, (safe_keyspace, safe_table))
        rows = list(result)
        assert len(rows) == 1
        assert rows[0].table_name == safe_table

    @pytest.mark.asyncio
    async def test_data_insertion_and_graphql_query(self, processor, sample_schema_config):
        """Test inserting data and querying via GraphQL"""
        # Load schema and connect
        await processor.on_schema_config(sample_schema_config, version=1)
        processor.connect_cassandra()

        # Setup test data
        keyspace = "test_user"
        collection = "integration_test"
        schema_name = "customer"
        schema = processor.schemas[schema_name]

        # Ensure table exists
        processor.ensure_table(keyspace, schema_name, schema)

        # Insert test data directly (simulating what storage processor would do)
        safe_keyspace = processor.sanitize_name(keyspace)
        safe_table = processor.sanitize_table(schema_name)
        insert_query = f"""
            INSERT INTO {safe_keyspace}.{safe_table}
            (collection, customer_id, name, email, status, created_date)
            VALUES (%s, %s, %s, %s, %s, %s)
        """
        test_customers = [
            (collection, "CUST001", "John Doe", "john@example.com", "active", "2024-01-15"),
            (collection, "CUST002", "Jane Smith", "jane@example.com", "active", "2024-01-16"),
            (collection, "CUST003", "Bob Wilson", "bob@example.com", "inactive", "2024-01-17")
        ]
        for customer_data in test_customers:
            processor.session.execute(insert_query, customer_data)

        # Test GraphQL query execution
        graphql_query = '''
            {
                customer_objects(collection: "integration_test") {
                    customer_id
                    name
                    email
                    status
                }
            }
        '''
        result = await processor.execute_graphql_query(
            query=graphql_query,
            variables={},
            operation_name=None,
            user=keyspace,
            collection=collection
        )

        # Verify query results
        assert "data" in result
        assert "customer_objects" in result["data"]
        customers = result["data"]["customer_objects"]
        assert len(customers) == 3

        # Verify customer data
        customer_ids = [c["customer_id"] for c in customers]
        assert "CUST001" in customer_ids
        assert "CUST002" in customer_ids
        assert "CUST003" in customer_ids

        # Find specific customer and verify fields
        john = next(c for c in customers if c["customer_id"] == "CUST001")
        assert john["name"] == "John Doe"
        assert john["email"] == "john@example.com"
        assert john["status"] == "active"

    @pytest.mark.asyncio
    async def test_graphql_query_with_filters(self, processor, sample_schema_config):
        """Test GraphQL queries with filtering on indexed fields"""
        # Setup: same schema/table preparation as the insertion test, but a
        # dedicated collection so only this test's rows are visible.
        await processor.on_schema_config(sample_schema_config, version=1)
        processor.connect_cassandra()

        keyspace = "test_user"
        collection = "filter_test"
        schema_name = "customer"
        schema = processor.schemas[schema_name]
        processor.ensure_table(keyspace, schema_name, schema)

        # Insert test data
        safe_keyspace = processor.sanitize_name(keyspace)
        safe_table = processor.sanitize_table(schema_name)
        insert_query = f"""
            INSERT INTO {safe_keyspace}.{safe_table}
            (collection, customer_id, name, email, status)
            VALUES (%s, %s, %s, %s, %s)
        """
        test_data = [
            (collection, "A001", "Active User 1", "active1@test.com", "active"),
            (collection, "A002", "Active User 2", "active2@test.com", "active"),
            (collection, "I001", "Inactive User", "inactive@test.com", "inactive")
        ]
        for data in test_data:
            processor.session.execute(insert_query, data)

        # Query with status filter (indexed field)
        filtered_query = '''
            {
                customer_objects(collection: "filter_test", status: "active") {
                    customer_id
                    name
                    status
                }
            }
        '''
        result = await processor.execute_graphql_query(
            query=filtered_query,
            variables={},
            operation_name=None,
            user=keyspace,
            collection=collection
        )

        # Verify filtered results
        assert "data" in result
        customers = result["data"]["customer_objects"]
        assert len(customers) == 2  # Only active customers
        for customer in customers:
            assert customer["status"] == "active"
            assert customer["customer_id"] in ["A001", "A002"]

    @pytest.mark.asyncio
    async def test_graphql_error_handling(self, processor, sample_schema_config):
        """Test GraphQL error handling for invalid queries"""
        # Setup
        await processor.on_schema_config(sample_schema_config, version=1)

        # Test invalid field query
        invalid_query = '''
            {
                customer_objects {
                    customer_id
                    nonexistent_field
                }
            }
        '''
        result = await processor.execute_graphql_query(
            query=invalid_query,
            variables={},
            operation_name=None,
            user="test_user",
            collection="test_collection"
        )

        # Verify error response
        assert "errors" in result
        assert len(result["errors"]) > 0
        error = result["errors"][0]
        assert "message" in error

        # GraphQL error should mention the invalid field
        assert "nonexistent_field" in error["message"] or "Cannot query field" in error["message"]

    @pytest.mark.asyncio
    async def test_message_processing_integration(self, processor, sample_schema_config):
        """Test full message processing workflow"""
        # Setup
        await processor.on_schema_config(sample_schema_config, version=1)
        processor.connect_cassandra()

        # Create mock message
        request = RowsQueryRequest(
            user="msg_test_user",
            collection="msg_test_collection",
            query='{ customer_objects { customer_id name } }',
            variables={},
            operation_name=""
        )
        mock_msg = MagicMock()
        mock_msg.value.return_value = request
        mock_msg.properties.return_value = {"id": "integration-test-123"}

        # Mock flow for response: calling the flow returns the producer whose
        # send() call is inspected below.
        mock_response_producer = AsyncMock()
        mock_flow = MagicMock()
        mock_flow.return_value = mock_response_producer

        # Process message
        await processor.on_message(mock_msg, None, mock_flow)

        # Verify response was sent
        mock_response_producer.send.assert_called_once()

        # Verify response structure
        sent_response = mock_response_producer.send.call_args[0][0]
        assert isinstance(sent_response, RowsQueryResponse)

        # Should have no system error (even if no data)
        assert sent_response.error is None

        # Data should be JSON string (even if empty result)
        assert sent_response.data is not None
        assert isinstance(sent_response.data, str)

        # Should be able to parse as JSON
        parsed_data = json.loads(sent_response.data)
        assert isinstance(parsed_data, dict)

    @pytest.mark.asyncio
    async def test_concurrent_queries(self, processor, sample_schema_config):
        """Test handling multiple concurrent GraphQL queries"""
        # Setup
        await processor.on_schema_config(sample_schema_config, version=1)
        processor.connect_cassandra()

        # Create multiple query tasks
        queries = [
            '{ customer_objects { customer_id } }',
            '{ order_objects { order_id } }',
            '{ customer_objects { name email } }',
            '{ order_objects { total status } }'
        ]

        # One un-awaited coroutine per query, each with its own user/collection
        pending = [
            processor.execute_graphql_query(
                query=query,
                variables={},
                operation_name=None,
                user=f"concurrent_user_{i}",
                collection=f"concurrent_collection_{i}"
            )
            for i, query in enumerate(queries)
        ]

        # Wait for all queries to complete
        results = await asyncio.gather(*pending, return_exceptions=True)

        # Verify all queries completed without exceptions. gather() can hand
        # back BaseException subclasses such as CancelledError, so check the
        # base type to get the descriptive failure message for those too.
        for i, result in enumerate(results):
            assert not isinstance(result, BaseException), f"Query {i} failed: {result}"
            assert "data" in result or "errors" in result

    @pytest.mark.asyncio
    async def test_schema_update_handling(self, processor):
        """Test handling of schema configuration updates"""
        # Load initial schema
        initial_config = {
            "schema": {
                "simple": json.dumps({
                    "name": "simple",
                    "fields": [{"name": "id", "type": "string", "primary_key": True}]
                })
            }
        }
        await processor.on_schema_config(initial_config, version=1)
        assert len(processor.schemas) == 1
        assert "simple" in processor.schemas

        # Update with additional schema
        updated_config = {
            "schema": {
                "simple": json.dumps({
                    "name": "simple",
                    "fields": [
                        {"name": "id", "type": "string", "primary_key": True},
                        {"name": "name", "type": "string"}  # New field
                    ]
                }),
                "complex": json.dumps({
                    "name": "complex",
                    "fields": [
                        {"name": "id", "type": "string", "primary_key": True},
                        {"name": "data", "type": "string"}
                    ]
                })
            }
        }
        await processor.on_schema_config(updated_config, version=2)

        # Verify updated schemas
        assert len(processor.schemas) == 2
        assert "simple" in processor.schemas
        assert "complex" in processor.schemas

        # Verify simple schema was updated
        simple_schema = processor.schemas["simple"]
        assert len(simple_schema.fields) == 2

        # Verify GraphQL schema was regenerated
        assert len(processor.graphql_types) == 2

    @pytest.mark.asyncio
    async def test_large_result_set_handling(self, processor, sample_schema_config):
        """Test handling of large query result sets"""
        # Setup
        await processor.on_schema_config(sample_schema_config, version=1)
        processor.connect_cassandra()

        keyspace = "large_test_user"
        collection = "large_collection"
        schema_name = "customer"
        schema = processor.schemas[schema_name]
        processor.ensure_table(keyspace, schema_name, schema)

        # Insert larger dataset
        safe_keyspace = processor.sanitize_name(keyspace)
        safe_table = processor.sanitize_table(schema_name)
        insert_query = f"""
            INSERT INTO {safe_keyspace}.{safe_table}
            (collection, customer_id, name, email, status)
            VALUES (%s, %s, %s, %s, %s)
        """

        # Insert 50 records
        for i in range(50):
            processor.session.execute(insert_query, (
                collection,
                f"CUST{i:03d}",
                f"Customer {i}",
                f"customer{i}@test.com",
                "active" if i % 2 == 0 else "inactive"
            ))

        # Query with limit
        limited_query = '''
            {
                customer_objects(collection: "large_collection", limit: 10) {
                    customer_id
                    name
                }
            }
        '''
        result = await processor.execute_graphql_query(
            query=limited_query,
            variables={},
            operation_name=None,
            user=keyspace,
            collection=collection
        )

        # Verify limited results
        assert "data" in result
        customers = result["data"]["customer_objects"]
        assert len(customers) <= 10  # Should be limited
@pytest.mark.integration
@pytest.mark.skipif(not DOCKER_AVAILABLE, reason="Docker/testcontainers not available")
class TestObjectsGraphQLQueryPerformance:
    """Performance-focused integration tests"""

    @pytest.fixture(scope="class")
    def cassandra_container(self):
        """Start Cassandra container for the tests in this class.

        A fixture defined as a method is only visible to tests of the class
        that defines it, so the ``cassandra_container`` fixture on
        ``TestObjectsGraphQLQueryIntegration`` cannot be requested from here;
        this class therefore provides its own.

        Yields:
            The running ``CassandraContainer``; it is stopped when the
            ``with`` block exits.
        """
        if not DOCKER_AVAILABLE:
            pytest.skip("Docker/testcontainers not available")

        with CassandraContainer("cassandra:3.11") as cassandra:
            # Intended as a readiness wait.
            # NOTE(review): confirm get_connection_url() actually blocks until
            # Cassandra accepts connections.
            cassandra.get_connection_url()
            yield cassandra

    @pytest.mark.asyncio
    async def test_query_execution_timing(self, cassandra_container):
        """Test query execution performance and timeout handling"""
        import time

        # Create a processor pointed at the test container (no explicit
        # timeout is configured here).
        host = cassandra_container.get_container_host_ip()
        processor = Processor(
            id="perf-test-graphql-query",
            graph_host=host,
            config_type="schema"
        )

        # Load minimal schema
        schema_config = {
            "schema": {
                "perf_test": json.dumps({
                    "name": "perf_test",
                    "fields": [{"name": "id", "type": "string", "primary_key": True}]
                })
            }
        }
        await processor.on_schema_config(schema_config, version=1)

        # Measure query execution time with a monotonic clock so that system
        # clock adjustments cannot distort the interval.
        start_time = time.perf_counter()
        result = await processor.execute_graphql_query(
            query='{ perf_test_objects { id } }',
            variables={},
            operation_name=None,
            user="perf_user",
            collection="perf_collection"
        )
        execution_time = time.perf_counter() - start_time

        # Verify reasonable execution time (should be under 1 second for empty result)
        assert execution_time < 1.0

        # Verify result structure
        assert "data" in result or "errors" in result