plano/tests/e2e/test_model_alias_routing.py

import anthropic
import openai
import os
import logging
import pytest
import sys

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger(__name__)

LLM_GATEWAY_ENDPOINT = os.getenv(
    "LLM_GATEWAY_ENDPOINT", "http://localhost:12000/v1/chat/completions"
)

# =============================================================================
# MODEL ALIAS TESTS
# =============================================================================


def test_assistant_message_with_null_content_and_tool_calls():
    """Test that assistant messages with null content and tool_calls are properly handled"""
    logger.info(
        "Testing assistant message with null content and tool_calls (multi-turn conversation)"
    )

    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    client = openai.OpenAI(
        api_key="test-key",
        base_url=f"{base_url}/v1",
    )

    # Simulate a multi-turn conversation where:
    # 1. User asks a question
    # 2. Assistant makes a tool call (with null content)
    # 3. Tool responds
    # 4. Assistant should provide final answer
    completion = client.chat.completions.create(
        model="gpt-4o",
        max_tokens=500,
        messages=[
            {
                "role": "system",
                "content": "You are a weather assistant. Use the get_weather tool to fetch weather information.",
            },
            {"role": "user", "content": "What's the weather in Seattle?"},
            {
                "role": "assistant",
                "content": None,  # This is the key test - null content with tool_calls
                "tool_calls": [
                    {
                        "id": "call_test123",
                        "type": "function",
                        "function": {
                            "name": "get_weather",
                            "arguments": '{"city": "Seattle"}',
                        },
                    }
                ],
            },
            {
                "role": "tool",
                "tool_call_id": "call_test123",
                "content": '{"location": "Seattle", "temperature": "10°C", "condition": "Partly cloudy"}',
            },
        ],
        tools=[
            {
                "type": "function",
                "function": {
                    "name": "get_weather",
                    "description": "Get weather information for a city",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "city": {"type": "string", "description": "City name"}
                        },
                        "required": ["city"],
                    },
                },
            }
        ],
    )

    response_content = completion.choices[0].message.content
    logger.info(f"Response after tool call: {response_content}")

    # The assistant should provide a final response using the tool result
    assert response_content is not None
    assert len(response_content) > 0
    logger.info(
        "✓ Assistant message with null content and tool_calls handled correctly"
    )


def test_openai_client_with_alias_arch_summarize_v1():
    """Test OpenAI client using model alias 'arch.summarize.v1' which should resolve to '4o-mini'"""
    logger.info("Testing OpenAI client with alias 'arch.summarize.v1' -> '4o-mini'")

    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    client = openai.OpenAI(
        api_key="test-key",
        base_url=f"{base_url}/v1",
    )

    completion = client.chat.completions.create(
        model="arch.summarize.v1",  # This should resolve to 5o-mini
        max_completion_tokens=500,  # Increased token limit to avoid truncation and because the 5o-mini uses reasoning tokens
        messages=[
            {
                "role": "user",
                "content": "Hello, please respond with exactly: Hello from alias arch.summarize.v1!",
            }
        ],
    )

    response_content = completion.choices[0].message.content
    logger.info(f"Response from arch.summarize.v1 alias: {response_content}")
    assert response_content == "Hello from alias arch.summarize.v1!"


def test_openai_client_with_alias_arch_v1():
    """Test OpenAI client using model alias 'arch.v1' which should resolve to 'o3'"""
    logger.info("Testing OpenAI client with alias 'arch.v1' -> 'o3'")

    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    client = openai.OpenAI(
        api_key="test-key",
        base_url=f"{base_url}/v1",
    )

    completion = client.chat.completions.create(
        model="arch.v1",  # This should resolve to gpt-o3
        max_completion_tokens=500,
        messages=[
            {
                "role": "user",
                "content": "Hello, please respond with exactly: Hello from alias arch.v1!",
            }
        ],
    )

    response_content = completion.choices[0].message.content
    logger.info(f"Response from arch.v1 alias: {response_content}")
    assert response_content == "Hello from alias arch.v1!"


def test_anthropic_client_with_alias_arch_summarize_v1():
    """Test Anthropic client using model alias 'arch.summarize.v1' which should resolve to '4o-mini'"""
    logger.info("Testing Anthropic client with alias 'arch.summarize.v1' -> '4o-mini'")

    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    client = anthropic.Anthropic(api_key="test-key", base_url=base_url)

    message = client.messages.create(
        model="arch.summarize.v1",  # This should resolve to 5o-mini
        max_tokens=500,
        messages=[
            {
                "role": "user",
                "content": "Hello, please respond with exactly: Hello from alias arch.summarize.v1 via Anthropic!",
            }
        ],
    )

    response_content = "".join(b.text for b in message.content if b.type == "text")
    logger.info(
        f"Response from arch.summarize.v1 alias via Anthropic: {response_content}"
    )
    assert response_content == "Hello from alias arch.summarize.v1 via Anthropic!"


def test_anthropic_client_with_alias_arch_v1():
    """Test Anthropic client using model alias 'arch.v1' which should resolve to 'o3'"""
    logger.info("Testing Anthropic client with alias 'arch.v1' -> 'o3'")

    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    client = anthropic.Anthropic(api_key="test-key", base_url=base_url)

    message = client.messages.create(
        model="arch.v1",  # This should resolve to o3
        max_tokens=500,
        messages=[
            {
                "role": "user",
                "content": "Hello, please respond with exactly: Hello from alias arch.v1 via Anthropic!",
            }
        ],
    )

    response_content = "".join(b.text for b in message.content if b.type == "text")
    logger.info(f"Response from arch.v1 alias via Anthropic: {response_content}")
    assert response_content == "Hello from alias arch.v1 via Anthropic!"


def test_openai_client_with_alias_streaming():
    """Test OpenAI client using model alias with streaming"""
    logger.info(
        "Testing OpenAI client with alias 'arch.summarize.v1' streaming -> '4o-mini'"
    )

    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    client = openai.OpenAI(
        api_key="test-key",
        base_url=f"{base_url}/v1",
    )

    stream = client.chat.completions.create(
        model="arch.summarize.v1",  # This should resolve to 5o-mini
        max_completion_tokens=500,
        messages=[
            {
                "role": "user",
                "content": "Hello, please respond with exactly: Hello from streaming alias!",
            }
        ],
        stream=True,
    )

    content_chunks = []
    for chunk in stream:
        if chunk.choices[0].delta.content:
            content_chunks.append(chunk.choices[0].delta.content)

    full_content = "".join(content_chunks)
    logger.info(f"Streaming response from arch.summarize.v1 alias: {full_content}")
    assert full_content == "Hello from streaming alias!"


def test_anthropic_client_with_alias_streaming():
    """Test Anthropic client using model alias with streaming"""
    logger.info(
        "Testing Anthropic client with alias 'arch.summarize.v1' streaming -> '4o-mini'"
    )

    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    client = anthropic.Anthropic(api_key="test-key", base_url=base_url)

    with client.messages.stream(
        model="arch.summarize.v1",  # This should resolve to 5o-mini
        max_tokens=500,
        messages=[
            {
                "role": "user",
                "content": "Hello, please respond with exactly: Hello from streaming alias via Anthropic!",
            }
        ],
    ) as stream:
        pieces = [t for t in stream.text_stream]
        full_text = "".join(pieces)

    logger.info(
        f"Streaming response from arch.summarize.v1 alias via Anthropic: {full_text}"
    )
    assert full_text == "Hello from streaming alias via Anthropic!"


def test_400_error_handling_with_alias():
    """Test that 400 errors from upstream are properly returned by plano"""
    logger.info(
        "Testing 400 error handling with arch.summarize.v1 and invalid parameter"
    )

    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    client = openai.OpenAI(
        api_key="test-key",
        base_url=f"{base_url}/v1",
    )

    try:
        completion = client.chat.completions.create(
            model="arch.summarize.v1",  # This should resolve to gpt-5-mini-2025-08-07
            max_tokens=50,
            messages=[
                {
                    "role": "user",
                    "content": "Hello, this should trigger a 400 error due to invalid parameter name",
                }
            ],
        )
        # If we reach here, the request unexpectedly succeeded
        logger.error(
            f"Expected 400 error but got successful response: {completion.choices[0].message.content}"
        )
        assert False, "Expected 400 error but request succeeded"
    except openai.BadRequestError as e:
        # This is what we expect - a 400 Bad Request error
        logger.info(f"Correctly received 400 Bad Request error: {e}")
        assert e.status_code == 400, f"Expected status code 400, got {e.status_code}"
        logger.info("✓ 400 error handling working correctly")
    except Exception as e:
        # Any other exception is unexpected
        logger.error(
            f"Unexpected error type (should be BadRequestError): {type(e).__name__}: {e}"
        )
        assert False, f"Expected BadRequestError but got {type(e).__name__}: {e}"


def test_400_error_handling_unsupported_parameter():
    """Test that 400 errors for unsupported parameters are properly returned by archgw"""
    logger.info("Testing 400 error handling with unsupported max_tokens parameter")

    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    client = openai.OpenAI(
        api_key="test-key",
        base_url=f"{base_url}/v1",
    )

    try:
        # Use the deprecated max_tokens parameter which should trigger a 400 error
        completion = client.chat.completions.create(
            model="arch.summarize.v1",  # This should resolve to gpt-5-mini-2025-08-07
            max_tokens=150,  # This parameter is unsupported for newer models, should use max_completion_tokens
            messages=[
                {
                    "role": "user",
                    "content": "Hello, this should trigger a 400 error due to unsupported max_tokens parameter",
                }
            ],
        )
        # If we reach here, the request unexpectedly succeeded
        logger.error(
            f"Expected 400 error but got successful response: {completion.choices[0].message.content}"
        )
        assert False, "Expected 400 error but request succeeded"
    except openai.BadRequestError as e:
        # This is what we expect - a 400 Bad Request error
        logger.info(f"Correctly received 400 Bad Request error: {e}")
        assert e.status_code == 400, f"Expected status code 400, got {e.status_code}"
        assert "max_tokens" in str(e), "Expected error message to mention max_tokens"
        logger.info("✓ 400 error handling for unsupported parameters working correctly")
    except Exception as e:
        # Any other exception is unexpected
        logger.error(
            f"Unexpected error type (should be BadRequestError): {type(e).__name__}: {e}"
        )
        assert False, f"Expected BadRequestError but got {type(e).__name__}: {e}"


def test_nonexistent_alias():
    """Test that using a non-existent alias falls back to treating it as a direct model name"""
    logger.info(
        "Testing non-existent alias 'nonexistent.alias' should be treated as direct model"
    )

    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    client = openai.OpenAI(
        api_key="test-key",
        base_url=f"{base_url}/v1",
    )

    try:
        completion = client.chat.completions.create(
            model="nonexistent.alias",  # This alias doesn't exist
            max_completion_tokens=50,
            messages=[
                {
                    "role": "user",
                    "content": "Hello, this should fail or use as direct model name",
                }
            ],
        )
        logger.info("Non-existent alias was handled gracefully")
        # If it succeeds, it means the alias was passed through as a direct model name
        logger.info(f"Response: {completion.choices[0].message.content}")
    except Exception as e:
        logger.info(f"Non-existent alias resulted in error (expected): {e}")
        # This is also acceptable behavior


# =============================================================================
# DIRECT MODEL TESTS (for comparison)
# =============================================================================


def test_direct_model_4o_mini_openai():
    """Test OpenAI client using direct model name '4o-mini'"""
    logger.info("Testing OpenAI client with direct model '4o-mini'")

    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    client = openai.OpenAI(
        api_key="test-key",
        base_url=f"{base_url}/v1",
    )

    completion = client.chat.completions.create(
        model="gpt-4o-mini",  # Direct model name
        max_completion_tokens=50,
        messages=[
            {
                "role": "user",
                "content": "Hello, please respond with exactly: Hello from direct 4o-mini!",
            }
        ],
    )

    response_content = completion.choices[0].message.content
    logger.info(f"Response from direct 4o-mini: {response_content}")
    assert response_content == "Hello from direct 4o-mini!"


def test_direct_model_4o_mini_anthropic():
    """Test Anthropic client using direct model name '4o-mini'"""
    logger.info("Testing Anthropic client with direct model '4o-mini'")

    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    client = anthropic.Anthropic(api_key="test-key", base_url=base_url)

    message = client.messages.create(
        model="gpt-4o-mini",  # Direct model name
        max_tokens=50,
        messages=[
            {
                "role": "user",
                "content": "Hello, please respond with exactly: Hello from direct 4o-mini via Anthropic!",
            }
        ],
    )

    response_content = "".join(b.text for b in message.content if b.type == "text")
    logger.info(f"Response from direct 4o-mini via Anthropic: {response_content}")
    assert response_content == "Hello from direct 4o-mini via Anthropic!"


def test_anthropic_thinking_mode_streaming():
    # Anthropic base_url should be the root, not /v1/chat/completions
    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")

    client = anthropic.Anthropic(
        api_key=os.environ.get("ANTHROPIC_API_KEY", "test-key"),
        base_url=base_url,
    )

    thinking_block_started = False
    thinking_delta_seen = False
    text_delta_seen = False

    with client.messages.stream(
        model="claude-sonnet-4-20250514",
        max_tokens=2048,
        thinking={"type": "enabled", "budget_tokens": 1024},  # <- idiomatic
        messages=[{"role": "user", "content": "Explain briefly what 2+2 equals"}],
    ) as stream:
        for event in stream:
            # 1) detect when a thinking block starts
            if event.type == "content_block_start" and getattr(
                event, "content_block", None
            ):
                if getattr(event.content_block, "type", None) == "thinking":
                    thinking_block_started = True

            # 2) collect text vs thinking deltas
            if event.type == "content_block_delta" and getattr(event, "delta", None):
                if event.delta.type == "text_delta":
                    text_delta_seen = True
                elif event.delta.type == "thinking_delta":
                    # some SDKs expose .thinking, others .text for this delta; not needed here
                    thinking_delta_seen = True

        final = stream.get_final_message()

    # Basic integrity
    assert final is not None
    assert final.content and len(final.content) > 0

    # Normal text should have streamed
    assert text_delta_seen, "Expected normal text deltas in stream"

    # With thinking enabled, we expect a thinking block and at least one thinking delta
    assert thinking_block_started, "No thinking block started"
    assert thinking_delta_seen, "No thinking deltas observed"

    # Optional: double-check on the assembled message
    final_block_types = [blk.type for blk in final.content]
    assert "text" in final_block_types
    assert "thinking" in final_block_types


@pytest.mark.skip("unreliable - bedrock tests are flaky in CI")
def test_openai_client_with_coding_model_alias_and_tools():
    """Test OpenAI client using 'coding-model' alias (maps to Bedrock) with coding question and tools"""
    logger.info("Testing OpenAI client with 'coding-model' alias -> Bedrock with tools")

    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    client = openai.OpenAI(
        api_key="test-key",
        base_url=f"{base_url}/v1",
    )

    completion = client.chat.completions.create(
        model="coding-model",  # This should resolve to us.amazon.nova-premier-v1:0
        max_tokens=1000,
        messages=[
            {
                "role": "user",
                "content": "I need to write a Python function that calculates the factorial of a number. Can you help me write and run it?",
            }
        ],
        tools=[
            {
                "type": "function",
                "function": {
                    "name": "run_python_code",
                    "description": "Execute Python code and return the result",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "code": {
                                "type": "string",
                                "description": "Python code to execute",
                            }
                        },
                        "required": ["code"],
                    },
                },
            }
        ],
        tool_choice="auto",
    )

    response_content = completion.choices[0].message.content
    tool_calls = completion.choices[0].message.tool_calls
    # Should get either text response or tool calls for coding assistance
    assert response_content is not None or (
        tool_calls is not None and len(tool_calls) > 0
    )


@pytest.mark.skip("unreliable - bedrock tests are flaky in CI")
def test_anthropic_client_with_coding_model_alias_and_tools():
    """Test Anthropic client using 'coding-model' alias (maps to Bedrock) with coding question and tools"""
    logger.info(
        "Testing Anthropic client with 'coding-model' alias -> Bedrock with tools"
    )

    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    client = anthropic.Anthropic(api_key="test-key", base_url=base_url)

    message = client.messages.create(
        model="coding-model",  # This should resolve to us.amazon.nova-premier-v1:0
        max_tokens=1000,
        messages=[
            {
                "role": "user",
                "content": "I need to write a Python function that calculates the factorial of a number. Can you help me write and run it?",
            }
        ],
        tools=[
            {
                "name": "run_python_code",
                "description": "Execute Python code and return the result",
                "input_schema": {
                    "type": "object",
                    "properties": {
                        "code": {
                            "type": "string",
                            "description": "Python code to execute",
                        }
                    },
                    "required": ["code"],
                },
            }
        ],
        tool_choice={"type": "auto"},
    )

    text_content = "".join(b.text for b in message.content if b.type == "text")
    tool_use_blocks = [b for b in message.content if b.type == "tool_use"]

    logger.info(f"Response from coding-model alias via Anthropic: {text_content}")
    logger.info(f"Tool use blocks: {len(tool_use_blocks)}")

    # Should get either text response or tool use blocks for coding assistance
    assert text_content or len(tool_use_blocks) > 0


@pytest.mark.skip("unreliable - bedrock tests are flaky in CI")
def test_anthropic_client_with_coding_model_alias_and_tools_streaming():
    """Test Anthropic client using 'coding-model' alias (maps to Bedrock) with coding question and tools - streaming"""
    logger.info(
        "Testing Anthropic client with 'coding-model' alias -> Bedrock with tools (streaming)"
    )

    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    client = anthropic.Anthropic(api_key="test-key", base_url=base_url)

    text_chunks = []
    tool_use_blocks = []
    all_events = []  # Capture all events for debugging

    try:
        with client.messages.stream(
            model="coding-model",  # This should resolve to us.amazon.nova-premier-v1:0
            max_tokens=1000,
            messages=[
                {
                    "role": "user",
                    "content": "I need to write a Python function that calculates the factorial of a number. Can you help me write and run it?",
                }
            ],
            tools=[
                {
                    "name": "run_python_code",
                    "description": "Execute Python code and return the result",
                    "input_schema": {
                        "type": "object",
                        "properties": {
                            "code": {
                                "type": "string",
                                "description": "Python code to execute",
                            }
                        },
                        "required": ["code"],
                    },
                }
            ],
            tool_choice={"type": "auto"},
        ) as stream:
            for event in stream:
                # Extract index if available
                index = getattr(event, "index", None)

                # Log and capture all events for debugging
                all_events.append(
                    {"type": event.type, "index": index, "event": str(event)[:200]}
                )
                logger.info(f"Event #{len(all_events)}: {event.type} [index={index}]")

                # Collect text deltas
                if event.type == "content_block_delta" and hasattr(event, "delta"):
                    if event.delta.type == "text_delta":
                        text_chunks.append(event.delta.text)

                # Collect tool use blocks
                if event.type == "content_block_start" and hasattr(
                    event, "content_block"
                ):
                    if event.content_block.type == "tool_use":
                        tool_use_blocks.append(event.content_block)

            final_message = stream.get_final_message()
    except Exception as e:
        logger.error(f"Exception during streaming: {type(e).__name__}: {e}")
        logger.error(f"Events received before error: {len(all_events)}")
        logger.error(f"Text chunks collected: {len(text_chunks)}")
        logger.error(f"Tool use blocks collected: {len(tool_use_blocks)}")
        logger.error("\nLast 20 events before crash:")
        for evt in all_events[-20:]:
            logger.error(f"  {evt['type']:30s} index={evt['index']}")
        raise

    full_text = "".join(text_chunks)
    logger.info(f"Streaming response from coding-model with tools: {full_text}")
    logger.info(f"Total events received: {len(all_events)}")
    logger.info(
        f"Text chunks: {len(text_chunks)}, Tool use blocks: {len(tool_use_blocks)}"
    )

    # Should get either text response or tool use blocks for coding assistance
    # Modified assertion to be more lenient and provide better error messages
    assert (
        full_text or len(tool_use_blocks) > 0
    ), f"Expected text or tool use. Got text_len={len(full_text)}, tools={len(tool_use_blocks)}, events={len(all_events)}"

    # Verify final message structure
    assert final_message is not None, "Final message should not be None"
    assert (
        final_message.content and len(final_message.content) > 0
    ), f"Final message should have content. Got: {final_message.content if final_message else 'None'}"


@pytest.mark.skip("unreliable - bedrock tests are flaky in CI")
def test_anthropic_client_streaming_with_bedrock():
    """Test Anthropic client using 'coding-model' alias (maps to Bedrock) with streaming"""
    logger.info(
        "Testing Anthropic client with 'coding-model' alias -> Bedrock (streaming)"
    )

    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    client = anthropic.Anthropic(api_key="test-key", base_url=base_url)

    text_chunks = []

    with client.messages.stream(
        model="coding-model",  # This should resolve to us.amazon.nova-premier-v1:0
        max_tokens=500,
        messages=[
            {
                "role": "user",
                "content": "Write a short 4-line sonnet about coding.",
            }
        ],
    ) as stream:
        for event in stream:
            # Collect text deltas
            if event.type == "content_block_delta" and hasattr(event, "delta"):
                if event.delta.type == "text_delta":
                    text_chunks.append(event.delta.text)

        final_message = stream.get_final_message()

    full_text = "".join(text_chunks)
    logger.info(f"Response: {full_text}")

    # Should get a text response
    assert len(full_text) > 0, "Expected text response from streaming"

    # Verify final message structure
    assert final_message is not None
    assert final_message.content and len(final_message.content) > 0


@pytest.mark.skip("unreliable - bedrock tests are flaky in CI")
def test_openai_client_streaming_with_bedrock():
    """Test OpenAI client using 'coding-model' alias (maps to Bedrock) with streaming"""
    logger.info(
        "Testing OpenAI client with 'coding-model' alias -> Bedrock (streaming)"
    )

    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    client = openai.OpenAI(
        api_key="test-key",
        base_url=f"{base_url}/v1",
    )

    stream = client.chat.completions.create(
        model="coding-model",  # This should resolve to us.amazon.nova-premier-v1:0
        max_tokens=500,
        messages=[
            {
                "role": "user",
                "content": "Write a short 4-line sonnet about coding.",
            }
        ],
        stream=True,
    )

    content_chunks = []
    for chunk in stream:
        if chunk.choices and len(chunk.choices) > 0:
            delta = chunk.choices[0].delta
            if delta.content:
                content_chunks.append(delta.content)

    full_content = "".join(content_chunks)
    logger.info(f"Streaming response from coding-model: {full_content}")

    # Should get a text response
    assert len(full_content) > 0, "Expected text response from streaming"


@pytest.mark.skip("unreliable - bedrock tests are flaky in CI")
def test_openai_client_streaming_with_bedrock_and_tools():
    """Test OpenAI client using 'coding-model' alias (maps to Bedrock) with streaming and tools"""
    logger.info(
        "Testing OpenAI client with 'coding-model' alias -> Bedrock with tools (streaming)"
    )

    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    client = openai.OpenAI(
        api_key="test-key",
        base_url=f"{base_url}/v1",
    )

    stream = client.chat.completions.create(
        model="coding-model",  # This should resolve to us.amazon.nova-premier-v1:0
        max_tokens=1000,
        messages=[
            {
                "role": "user",
                "content": "I need to write a Python function that calculates the factorial of a number. Can you help me write and run it?. You should use the tool to run the code.",
            }
        ],
        tools=[
            {
                "type": "function",
                "function": {
                    "name": "run_python_code",
                    "description": "Execute Python code and return the result",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "code": {
                                "type": "string",
                                "description": "Python code to execute",
                            }
                        },
                        "required": ["code"],
                    },
                },
            }
        ],
        tool_choice="auto",
        stream=True,
    )

    content_chunks = []
    tool_calls = []
    chunk_count = 0

    for chunk in stream:
        chunk_count += 1
        if chunk.choices and len(chunk.choices) > 0:
            delta = chunk.choices[0].delta

            # Log what we see in each chunk
            has_content = delta.content is not None
            has_tool_calls = delta.tool_calls is not None

            if (
                chunk_count % 50 == 0 or has_tool_calls
            ):  # Log every 50th chunk or any chunk with tool calls
                logger.info(
                    f"Chunk {chunk_count}: content={has_content}, tool_calls={has_tool_calls}"
                )
                if has_tool_calls:
                    logger.info(f"  Tool calls in chunk: {delta.tool_calls}")

            # Collect text content
            if delta.content:
                content_chunks.append(delta.content)

            # Collect tool calls
            if delta.tool_calls:
                for tool_call in delta.tool_calls:
                    # Extend or create tool call entries
                    while len(tool_calls) <= tool_call.index:
                        tool_calls.append(
                            {
                                "id": "",
                                "type": "function",
                                "function": {"name": "", "arguments": ""},
                            }
                        )

                    if tool_call.id:
                        tool_calls[tool_call.index]["id"] = tool_call.id
                    if tool_call.function:
                        if tool_call.function.name:
                            tool_calls[tool_call.index]["function"][
                                "name"
                            ] = tool_call.function.name
                        if tool_call.function.arguments:
                            tool_calls[tool_call.index]["function"][
                                "arguments"
                            ] += tool_call.function.arguments

    full_content = "".join(content_chunks)
    logger.info(f"Streaming response from coding-model with tools: {full_content}")
    logger.info(f"Tool calls collected: {len(tool_calls)}")

    if tool_calls:
        for i, tc in enumerate(tool_calls):
            logger.info(f"  Tool call {i}: {tc['function']['name']}")

    # Should get either text response or tool calls for coding assistance
    assert (
        full_content or len(tool_calls) > 0
    ), f"Expected text or tool calls. Got text_len={len(full_content)}, tools={len(tool_calls)}"