# trustgraph/ai-context/trustgraph-templates/tests/validators/trustgraph.py

"""
TrustGraph configuration semantic validation.
"""

import json
from typing import Dict, Any, List, Tuple, Set


def validate_service_references(config: List[Dict[str, Any]]) -> List[str]:
    """
    Validate that configured services reference valid modules.

    Every entry of the service list must be a dictionary carrying a
    non-empty string 'name'. Names that are not in the known-module set
    are tolerated and produce no error, since they may be new modules.

    Args:
        config: Parsed configuration in input format (list of service
            dictionaries). Any other top-level type produces no errors
            here; validate_required_structure reports top-level shape.

    Returns:
        List of error messages (empty if valid)
    """
    errors: List[str] = []

    # Only the list (input) format has per-service entries. Iterating a
    # dict would walk its keys and flag each one as "not a dictionary".
    if not isinstance(config, list):
        return errors

    # Build set of known module names (this would need to be comprehensive)
    known_modules = {
        'pulsar', 'triple-store-cassandra', 'object-store-cassandra',
        'vector-store-qdrant', 'vector-store-milvus', 'vector-store-pinecone',
        'graph-rag', 'text-completion',
        'embeddings-hf', 'embeddings-fastembed', 'embeddings-openai',
        'openai', 'anthropic', 'ollama', 'bedrock', 'vertexai',
        'trustgraph-base', 'grafana', 'prometheus',
        'override-recursive-chunker', 'override-text-splitter',
        'neo4j', 'astra'
    }

    for idx, service in enumerate(config):
        if not isinstance(service, dict):
            errors.append(f"Configuration item {idx}: not a dictionary")
            continue

        name = service.get('name')
        if not name:
            errors.append(f"Configuration item {idx}: missing 'name' field")
        elif not isinstance(name, str):
            # Checked before the set lookup: an unhashable name (list or
            # dict) would otherwise raise TypeError in the membership test.
            errors.append(f"Configuration item {idx}: 'name' must be a string")
        elif name not in known_modules:
            # Unknown names might be intentional for new modules, so they
            # are deliberately not reported as errors.
            pass

    return errors


def validate_parameter_types(config: List[Dict[str, Any]]) -> List[str]:
    """
    Validate that module parameters are reasonable.

    A parameter is checked when its name contains one of the substrings
    'chunk-size', 'chunk-overlap', 'max-output-tokens' or 'temperature'.
    A name containing several of them is checked against each matching
    rule. Booleans are never accepted as numeric values.

    Args:
        config: Parsed configuration in input format (list of service
            dictionaries). Non-list input and non-dictionary entries are
            skipped without error.

    Returns:
        List of error messages (empty if valid)
    """
    errors: List[str] = []

    if not isinstance(config, list):
        return errors

    def is_number(value: Any) -> bool:
        # bool is a subclass of int, so it has to be excluded explicitly;
        # a JSON true/false is not a meaningful numeric setting.
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    def is_integer(value: Any) -> bool:
        return isinstance(value, int) and not isinstance(value, bool)

    # (substring of the parameter name, validity predicate, message suffix).
    # Comparisons are written in the positive form so that NaN, which fails
    # every comparison, is rejected as well.
    rules = (
        ('chunk-size',
         lambda v: is_number(v) and v > 0,
         'should be positive number'),
        ('chunk-overlap',
         lambda v: is_number(v) and v >= 0,
         'should be non-negative number'),
        ('max-output-tokens',
         lambda v: is_integer(v) and v > 0,
         'should be positive integer'),
        ('temperature',
         lambda v: is_number(v) and 0 <= v <= 2,
         'should be between 0 and 2'),
    )

    for idx, service in enumerate(config):
        if not isinstance(service, dict):
            continue

        # Fall back to a positional label so messages stay attributable
        # when the service has no name.
        name = service.get('name', f'item-{idx}')
        parameters = service.get('parameters', {})

        if not isinstance(parameters, dict):
            errors.append(f"Service '{name}': parameters must be a dictionary")
            continue

        for param_name, param_value in parameters.items():
            for fragment, is_valid, expectation in rules:
                if fragment in param_name and not is_valid(param_value):
                    errors.append(
                        f"Service '{name}': parameter '{param_name}' {expectation}"
                    )

    return errors


def validate_storage_consistency(config: List[Dict[str, Any]]) -> List[str]:
    """
    Validate that graph/object/vector stores are configured consistently.

    When a service named exactly 'graph-rag' is present, at least one
    service name must contain each of 'triple-store', 'object-store' and
    'vector-store'.

    Args:
        config: Parsed configuration in input format (list of service
            dictionaries). Non-list input yields no errors.

    Returns:
        List of error messages (empty if valid)
    """
    errors: List[str] = []

    if not isinstance(config, list):
        return errors

    # Keep only string names: an entry without 'name' would contribute
    # None, and the substring tests below would raise TypeError on it.
    service_names = [
        name
        for name in (s.get('name') for s in config if isinstance(s, dict))
        if isinstance(name, str)
    ]

    # The store requirements only apply when graph-rag is in use.
    if 'graph-rag' not in service_names:
        return errors

    for store_kind in ('triple-store', 'object-store', 'vector-store'):
        if not any(store_kind in name for name in service_names):
            errors.append(
                f"Configuration uses 'graph-rag' but no {store_kind} is configured"
            )

    return errors


def validate_llm_configuration(config: List[Dict[str, Any]]) -> List[str]:
    """
    Validate LLM configuration is present and reasonable.

    Requires at least one service whose name is a known LLM provider and
    at least one service whose name contains 'embeddings'.

    Args:
        config: Parsed configuration in input format (list of service
            dictionaries). Non-list input yields no errors.

    Returns:
        List of error messages (empty if valid)
    """
    errors: List[str] = []

    if not isinstance(config, list):
        return errors

    # Keep only string names: entries without 'name' would contribute None
    # (breaking the substring test) and unhashable names would break the
    # set membership test.
    service_names = [
        name
        for name in (s.get('name') for s in config if isinstance(s, dict))
        if isinstance(name, str)
    ]

    # Check for at least one LLM provider
    llm_providers = {'openai', 'anthropic', 'ollama', 'bedrock', 'vertexai', 'vllm', 'llamacpp'}
    if not any(name in llm_providers for name in service_names):
        # Sorted so the message is identical on every run; set iteration
        # order for strings is not stable across interpreter invocations.
        errors.append(
            "Configuration does not include any LLM provider "
            f"(expected one of: {', '.join(sorted(llm_providers))})"
        )

    # Check for embeddings
    if not any('embeddings' in name for name in service_names):
        errors.append(
            "Configuration does not include any embeddings provider"
        )

    return errors


def validate_required_structure(config: Any) -> List[str]:
    """
    Check the top-level shape of a configuration.

    Two shapes are accepted: a dict (output format), which only has to be
    non-empty, and a list of services (input format), in which every item
    must be a dictionary containing both 'name' and 'parameters'.

    Returns:
        List of error messages (empty if valid)
    """
    empty_message = "Configuration is empty"

    # Output format: nothing is inspected beyond emptiness.
    if isinstance(config, dict):
        return [] if config else [empty_message]

    # Anything that is neither dict nor list is rejected outright.
    if not isinstance(config, list):
        return ["Configuration must be a list or dict"]

    # Input format: an empty list is reported, then each item is checked.
    problems = [] if config else [empty_message]

    for position, entry in enumerate(config):
        if not isinstance(entry, dict):
            problems.append(f"Configuration item {position}: must be a dictionary")
            continue

        problems.extend(
            f"Configuration item {position}: missing required field '{field}'"
            for field in ('name', 'parameters')
            if field not in entry
        )

    return problems


def parse_trustgraph_config(json_content: str):
    """
    Decode a TrustGraph configuration from its JSON text.

    Args:
        json_content: JSON document as a string

    Returns:
        The decoded value (dict or list depending on the configuration
        format the document uses)
    """
    parsed = json.loads(json_content)
    return parsed


def validate_trustgraph_config(json_content: str) -> Tuple[bool, List[str]]:
    """
    Comprehensive validation of TrustGraph configuration.

    The JSON is parsed, its top-level structure is checked, and, when the
    configuration is a list of services, the service-level validators
    are run as well.

    Args:
        json_content: JSON string of TrustGraph configuration

    Returns:
        Tuple of (is_valid, list_of_errors)
    """
    try:
        config = parse_trustgraph_config(json_content)
    except json.JSONDecodeError as e:
        return False, [f"JSON parsing error: {e}"]

    errors: List[str] = list(validate_required_structure(config))

    # The remaining validators walk a list of service dictionaries. Other
    # parsed values (a dict in output format, or a scalar/null that the
    # structure check has already rejected) must not reach them: iterating
    # those either raises TypeError or yields misleading per-item errors.
    if isinstance(config, list):
        service_validators = (
            validate_service_references,
            validate_parameter_types,
            validate_storage_consistency,
            validate_llm_configuration,
        )
        for validator in service_validators:
            errors.extend(validator(config))

    return not errors, errors