# trustgraph/tests/integration/test_rows_graphql_query_integration.py

"""
Integration tests for Rows GraphQL Query Service

These tests verify end-to-end functionality including:
- Real Cassandra database operations
- Full GraphQL query execution
- Schema generation and configuration handling
- Message processing with actual Pulsar schemas
"""

import pytest
import json
import asyncio
from unittest.mock import MagicMock, AsyncMock

# Probe for Docker/testcontainers at import time so the test classes below can
# be skipped (via DOCKER_AVAILABLE) instead of erroring on machines without them.
try:
    from testcontainers.cassandra import CassandraContainer
    import docker
    # Test Docker connection
    docker.from_env().ping()
    DOCKER_AVAILABLE = True
except Exception:
    # Any failure in the probe above is treated as "Docker not available";
    # CassandraContainer is set to None so the name still exists at module level.
    DOCKER_AVAILABLE = False
    CassandraContainer = None

from trustgraph.query.rows.cassandra.service import Processor
from trustgraph.schema import RowsQueryRequest, RowsQueryResponse, GraphQLError
from trustgraph.schema import RowSchema, Field, ExtractedObject, Metadata


@pytest.mark.integration
@pytest.mark.skipif(not DOCKER_AVAILABLE, reason="Docker/testcontainers not available")
class TestObjectsGraphQLQueryIntegration:
    """Integration tests with real Cassandra database"""

    @pytest.fixture(scope="class")
    def cassandra_container(self):
        """Yield a started Cassandra 3.11 test container.

        Skips the requesting test when Docker/testcontainers could not be
        set up at import time. The container is stopped when the ``with``
        block exits, i.e. once the fixture is finalized.
        """
        if not DOCKER_AVAILABLE:
            pytest.skip("Docker/testcontainers not available")

        with CassandraContainer("cassandra:3.11") as running_container:
            # Ask for the connection URL before handing the container out;
            # presumably this acts as a readiness check -- TODO confirm.
            running_container.get_connection_url()
            yield running_container

    @pytest.fixture
    def processor(self, cassandra_container):
        """Provide a Processor pointed at the Cassandra test container.

        The processor is yielded with ``cluster`` and ``session`` reset to
        ``None`` so each test decides when to connect. After the test, any
        session/cluster handle found on the processor is shut down so driver
        connections do not accumulate across tests.
        """
        # Extract host and port from container
        host = cassandra_container.get_container_host_ip()
        # NOTE(review): the mapped port is looked up but never handed to the
        # Processor; if the container maps 9042 to a different host port the
        # processor may not reach it -- confirm how Processor takes a port.
        port = cassandra_container.get_exposed_port(9042)

        # Create processor
        processor = Processor(
            id="test-graphql-query",
            graph_host=host,
            # Note: testcontainer typically doesn't require auth
            graph_username=None,
            graph_password=None,
            config_type="schema"
        )

        # Override connection parameters for test container
        processor.graph_host = host
        processor.cluster = None
        processor.session = None

        yield processor

        # Teardown: release whatever the test opened. Session first, then the
        # cluster; both stay None when the test never connected.
        for attr_name in ("session", "cluster"):
            handle = getattr(processor, attr_name, None)
            if handle is not None:
                handle.shutdown()

    @pytest.fixture
    def sample_schema_config(self):
        """Build the two-schema configuration used by the tests.

        Returns a dict with a single ``"schema"`` key mapping the schema
        names ``customer`` and ``order`` to their JSON-encoded definitions.
        """
        customer_fields = [
            {
                "name": "customer_id",
                "type": "string",
                "primary_key": True,
                "required": True,
                "description": "Customer identifier"
            },
            {
                "name": "name",
                "type": "string",
                "required": True,
                "indexed": True,
                "description": "Customer name"
            },
            {
                "name": "email",
                "type": "string",
                "required": True,
                "indexed": True,
                "description": "Customer email"
            },
            {
                "name": "status",
                "type": "string",
                "required": False,
                "indexed": True,
                "enum": ["active", "inactive", "pending"],
                "description": "Customer status"
            },
            {
                "name": "created_date",
                "type": "timestamp",
                "required": False,
                "description": "Registration date"
            },
        ]

        order_fields = [
            {
                "name": "order_id",
                "type": "string",
                "primary_key": True,
                "required": True
            },
            {
                "name": "customer_id",
                "type": "string",
                "required": True,
                "indexed": True,
                "description": "Related customer"
            },
            {
                "name": "total",
                "type": "float",
                "required": True,
                "description": "Order total amount"
            },
            {
                "name": "status",
                "type": "string",
                "indexed": True,
                "enum": ["pending", "processing", "shipped", "delivered"],
                "description": "Order status"
            },
        ]

        customer_definition = {
            "name": "customer",
            "description": "Customer records",
            "fields": customer_fields,
        }
        order_definition = {
            "name": "order",
            "description": "Order records",
            "fields": order_fields,
        }

        # Each definition is stored as a JSON string, keyed by schema name
        return {
            "schema": {
                "customer": json.dumps(customer_definition),
                "order": json.dumps(order_definition),
            }
        }

    @pytest.mark.asyncio
    async def test_schema_configuration_and_generation(self, processor, sample_schema_config):
        """Loading the sample config populates schemas and GraphQL types."""
        await processor.on_schema_config(sample_schema_config, version=1)

        expected_names = ("customer", "order")

        # Both schemas must be registered on the processor
        loaded_schemas = processor.schemas
        assert len(loaded_schemas) == 2
        for schema_name in expected_names:
            assert schema_name in loaded_schemas

        # The customer schema keeps its name and all five fields
        customer = loaded_schemas["customer"]
        assert customer.name == "customer"
        assert len(customer.fields) == 5

        # The first field flagged as primary must be customer_id
        primary_fields = [field for field in customer.fields if field.primary]
        assert primary_fields
        assert primary_fields[0].name == "customer_id"

        # A GraphQL schema and one GraphQL type per schema were generated
        assert processor.graphql_schema is not None
        generated_types = processor.graphql_types
        assert len(generated_types) == 2
        for schema_name in expected_names:
            assert schema_name in generated_types

    @pytest.mark.asyncio
    async def test_cassandra_connection_and_table_creation(self, processor, sample_schema_config):
        """Test Cassandra connection and dynamic table creation

        Loads the sample schemas, connects, asks the processor to ensure the
        ``customer`` table exists, then confirms the table is listed in
        Cassandra's ``system_schema.tables``.
        """
        # Load schema configuration
        await processor.on_schema_config(sample_schema_config, version=1)

        # Connect to Cassandra
        processor.connect_cassandra()
        assert processor.session is not None

        # Target keyspace and schema for the table under test
        keyspace = "test_user"
        schema_name = "customer"
        schema = processor.schemas[schema_name]

        # Ensure table creation
        processor.ensure_table(keyspace, schema_name, schema)

        # Verify keyspace and table tracking
        assert keyspace in processor.known_keyspaces
        assert keyspace in processor.known_tables

        # Verify table was created by querying Cassandra system tables,
        # using the same sanitized names the processor uses
        safe_keyspace = processor.sanitize_name(keyspace)
        safe_table = processor.sanitize_table(schema_name)

        # Check if table exists
        table_query = """
        SELECT table_name FROM system_schema.tables
        WHERE keyspace_name = %s AND table_name = %s
        """
        result = processor.session.execute(table_query, (safe_keyspace, safe_table))
        rows = list(result)
        assert len(rows) == 1
        assert rows[0].table_name == safe_table

    @pytest.mark.asyncio
    async def test_data_insertion_and_graphql_query(self, processor, sample_schema_config):
        """Rows written straight to Cassandra come back through GraphQL."""
        await processor.on_schema_config(sample_schema_config, version=1)
        processor.connect_cassandra()

        keyspace = "test_user"
        collection = "integration_test"
        schema_name = "customer"

        # Make sure the backing table exists before writing to it
        processor.ensure_table(keyspace, schema_name, processor.schemas[schema_name])

        safe_keyspace = processor.sanitize_name(keyspace)
        safe_table = processor.sanitize_table(schema_name)

        # Write rows directly, standing in for the storage processor
        insert_query = f"""
        INSERT INTO {safe_keyspace}.{safe_table}
        (collection, customer_id, name, email, status, created_date)
        VALUES (%s, %s, %s, %s, %s, %s)
        """

        seed_rows = [
            (collection, "CUST001", "John Doe", "john@example.com", "active", "2024-01-15"),
            (collection, "CUST002", "Jane Smith", "jane@example.com", "active", "2024-01-16"),
            (collection, "CUST003", "Bob Wilson", "bob@example.com", "inactive", "2024-01-17")
        ]
        session = processor.session
        for seed_row in seed_rows:
            session.execute(insert_query, seed_row)

        graphql_query = '''
        {
            customer_objects(collection: "integration_test") {
                customer_id
                name
                email
                status
            }
        }
        '''

        result = await processor.execute_graphql_query(
            query=graphql_query,
            variables={},
            operation_name=None,
            user=keyspace,
            collection=collection
        )

        # The response carries exactly the three seeded customers
        assert "data" in result
        assert "customer_objects" in result["data"]

        customers = result["data"]["customer_objects"]
        assert len(customers) == 3

        customers_by_id = {entry["customer_id"]: entry for entry in customers}
        for expected_id in ("CUST001", "CUST002", "CUST003"):
            assert expected_id in customers_by_id

        # Spot-check one record field by field
        john = customers_by_id["CUST001"]
        assert john["name"] == "John Doe"
        assert john["email"] == "john@example.com"
        assert john["status"] == "active"

    @pytest.mark.asyncio
    async def test_graphql_query_with_filters(self, processor, sample_schema_config):
        """Filtering on the indexed ``status`` field returns only matches."""
        await processor.on_schema_config(sample_schema_config, version=1)
        processor.connect_cassandra()

        keyspace = "test_user"
        collection = "filter_test"
        schema_name = "customer"

        processor.ensure_table(keyspace, schema_name, processor.schemas[schema_name])

        safe_keyspace = processor.sanitize_name(keyspace)
        safe_table = processor.sanitize_table(schema_name)

        insert_query = f"""
        INSERT INTO {safe_keyspace}.{safe_table}
        (collection, customer_id, name, email, status)
        VALUES (%s, %s, %s, %s, %s)
        """

        # Two active rows and one inactive row in the same collection
        seed_rows = [
            (collection, "A001", "Active User 1", "active1@test.com", "active"),
            (collection, "A002", "Active User 2", "active2@test.com", "active"),
            (collection, "I001", "Inactive User", "inactive@test.com", "inactive")
        ]
        session = processor.session
        for seed_row in seed_rows:
            session.execute(insert_query, seed_row)

        filtered_query = '''
        {
            customer_objects(collection: "filter_test", status: "active") {
                customer_id
                name
                status
            }
        }
        '''

        result = await processor.execute_graphql_query(
            query=filtered_query,
            variables={},
            operation_name=None,
            user=keyspace,
            collection=collection
        )

        assert "data" in result
        matches = result["data"]["customer_objects"]
        assert len(matches) == 2  # Only active customers

        active_ids = ("A001", "A002")
        for match in matches:
            assert match["status"] == "active"
            assert match["customer_id"] in active_ids

    @pytest.mark.asyncio
    async def test_graphql_error_handling(self, processor, sample_schema_config):
        """Selecting an unknown field yields a GraphQL error entry."""
        await processor.on_schema_config(sample_schema_config, version=1)

        # Query selecting a field that the customer schema does not define
        invalid_query = '''
        {
            customer_objects {
                customer_id
                nonexistent_field
            }
        }
        '''

        result = await processor.execute_graphql_query(
            query=invalid_query,
            variables={},
            operation_name=None,
            user="test_user",
            collection="test_collection"
        )

        # At least one error must be reported
        assert "errors" in result
        reported_errors = result["errors"]
        assert len(reported_errors) > 0

        first_error = reported_errors[0]
        assert "message" in first_error

        # The message should mention the bad field or the standard wording
        message = first_error["message"]
        accepted_markers = ("nonexistent_field", "Cannot query field")
        assert any(marker in message for marker in accepted_markers)

    @pytest.mark.asyncio
    async def test_message_processing_integration(self, processor, sample_schema_config):
        """A request message produces one JSON-bearing response message."""
        await processor.on_schema_config(sample_schema_config, version=1)
        processor.connect_cassandra()

        request = RowsQueryRequest(
            user="msg_test_user",
            collection="msg_test_collection",
            query='{ customer_objects { customer_id name } }',
            variables={},
            operation_name=""
        )

        # Stand-in for the incoming message wrapper
        incoming = MagicMock()
        incoming.value.return_value = request
        incoming.properties.return_value = {"id": "integration-test-123"}

        # Calling the flow returns the async producer used for the reply
        response_producer = AsyncMock()
        flow = MagicMock(return_value=response_producer)

        await processor.on_message(incoming, None, flow)

        # Exactly one response must have been sent
        response_producer.send.assert_called_once()

        sent_args, _sent_kwargs = response_producer.send.call_args
        reply = sent_args[0]
        assert isinstance(reply, RowsQueryResponse)

        # No system error is expected, even when no rows match
        assert reply.error is None

        # The payload is a JSON string that decodes to an object
        assert reply.data is not None
        assert isinstance(reply.data, str)

        decoded = json.loads(reply.data)
        assert isinstance(decoded, dict)

    @pytest.mark.asyncio
    async def test_concurrent_queries(self, processor, sample_schema_config):
        """Several GraphQL queries awaited together all finish cleanly."""
        await processor.on_schema_config(sample_schema_config, version=1)
        processor.connect_cassandra()

        queries = [
            '{ customer_objects { customer_id } }',
            '{ order_objects { order_id } }',
            '{ customer_objects { name email } }',
            '{ order_objects { total status } }'
        ]

        # One pending query per entry, each with its own user and collection
        pending = [
            processor.execute_graphql_query(
                query=query_text,
                variables={},
                operation_name=None,
                user=f"concurrent_user_{index}",
                collection=f"concurrent_collection_{index}"
            )
            for index, query_text in enumerate(queries)
        ]

        # Exceptions are collected rather than raised so each can be reported
        outcomes = await asyncio.gather(*pending, return_exceptions=True)

        for index, outcome in enumerate(outcomes):
            assert not isinstance(outcome, Exception), f"Query {index} failed: {outcome}"
            assert "data" in outcome or "errors" in outcome

    @pytest.mark.asyncio
    async def test_schema_update_handling(self, processor):
        """A second config version replaces and extends the first."""
        id_field = {"name": "id", "type": "string", "primary_key": True}

        # Version 1: a single schema with just the key field
        simple_v1 = {"name": "simple", "fields": [id_field]}
        initial_config = {"schema": {"simple": json.dumps(simple_v1)}}

        await processor.on_schema_config(initial_config, version=1)
        assert len(processor.schemas) == 1
        assert "simple" in processor.schemas

        # Version 2: "simple" gains a field and "complex" is introduced
        simple_v2 = {
            "name": "simple",
            "fields": [
                id_field,
                {"name": "name", "type": "string"}  # New field
            ]
        }
        complex_v2 = {
            "name": "complex",
            "fields": [
                id_field,
                {"name": "data", "type": "string"}
            ]
        }
        updated_config = {
            "schema": {
                "simple": json.dumps(simple_v2),
                "complex": json.dumps(complex_v2)
            }
        }

        await processor.on_schema_config(updated_config, version=2)

        # Both schemas are now registered
        schemas_after_update = processor.schemas
        assert len(schemas_after_update) == 2
        for schema_name in ("simple", "complex"):
            assert schema_name in schemas_after_update

        # "simple" reflects the added field
        assert len(schemas_after_update["simple"].fields) == 2

        # GraphQL types were regenerated to match
        assert len(processor.graphql_types) == 2

    @pytest.mark.asyncio
    async def test_large_result_set_handling(self, processor, sample_schema_config):
        """Test handling of large query result sets

        Inserts 50 customer rows into one collection and queries them with
        ``limit: 10``. Exactly 10 rows are expected back, which checks both
        that the limit is applied and that the query returns data at all.
        """
        # Setup
        await processor.on_schema_config(sample_schema_config, version=1)
        processor.connect_cassandra()

        keyspace = "large_test_user"
        collection = "large_collection"
        schema_name = "customer"
        schema = processor.schemas[schema_name]

        processor.ensure_table(keyspace, schema_name, schema)

        # Insert larger dataset
        safe_keyspace = processor.sanitize_name(keyspace)
        safe_table = processor.sanitize_table(schema_name)

        insert_query = f"""
        INSERT INTO {safe_keyspace}.{safe_table}
        (collection, customer_id, name, email, status)
        VALUES (%s, %s, %s, %s, %s)
        """

        # Insert 50 records, alternating active/inactive status
        for i in range(50):
            processor.session.execute(insert_query, (
                collection,
                f"CUST{i:03d}",
                f"Customer {i}",
                f"customer{i}@test.com",
                "active" if i % 2 == 0 else "inactive"
            ))

        # Query with limit
        limited_query = '''
        {
            customer_objects(collection: "large_collection", limit: 10) {
                customer_id
                name
            }
        }
        '''

        result = await processor.execute_graphql_query(
            query=limited_query,
            variables={},
            operation_name=None,
            user=keyspace,
            collection=collection
        )

        # Verify limited results: 50 rows exist, so the limit of 10 must be
        # reached exactly. An upper bound alone would also accept an empty
        # result and hide a broken query.
        assert "data" in result
        customers = result["data"]["customer_objects"]
        assert len(customers) == 10


@pytest.mark.integration
@pytest.mark.skipif(not DOCKER_AVAILABLE, reason="Docker/testcontainers not available")
class TestObjectsGraphQLQueryPerformance:
    """Performance-focused integration tests"""

    @pytest.fixture(scope="class")
    def cassandra_container(self):
        """Start a Cassandra container for the tests in this class.

        Defined here because pytest only resolves a class-level fixture for
        tests inside that same class, so a fixture that lives on another
        test class cannot be requested from this one.
        """
        if not DOCKER_AVAILABLE:
            pytest.skip("Docker/testcontainers not available")

        with CassandraContainer("cassandra:3.11") as cassandra:
            # Same start-up call the integration class makes before yielding
            cassandra.get_connection_url()
            yield cassandra

    @pytest.mark.asyncio
    async def test_query_execution_timing(self, cassandra_container):
        """Test that a trivial query against an empty table completes quickly"""
        import time

        # Create a processor aimed at the test container (default settings)
        host = cassandra_container.get_container_host_ip()

        processor = Processor(
            id="perf-test-graphql-query",
            graph_host=host,
            config_type="schema"
        )

        # Load minimal schema
        schema_config = {
            "schema": {
                "perf_test": json.dumps({
                    "name": "perf_test",
                    "fields": [{"name": "id", "type": "string", "primary_key": True}]
                })
            }
        }

        await processor.on_schema_config(schema_config, version=1)

        # Measure query execution time with a monotonic clock, so wall-clock
        # adjustments during the test cannot distort the measurement
        start_time = time.perf_counter()

        result = await processor.execute_graphql_query(
            query='{ perf_test_objects { id } }',
            variables={},
            operation_name=None,
            user="perf_user",
            collection="perf_collection"
        )

        execution_time = time.perf_counter() - start_time

        # Verify reasonable execution time (should be under 1 second for empty result)
        assert execution_time < 1.0

        # Verify result structure
        assert "data" in result or "errors" in result