import openai import pytest import os import logging import sys # Set up logging logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", handlers=[logging.StreamHandler(sys.stdout)], ) logger = logging.getLogger(__name__) LLM_GATEWAY_ENDPOINT = os.getenv( "LLM_GATEWAY_ENDPOINT", "http://localhost:12000/v1/chat/completions" ) # ----------------------- # v1/responses API tests # ----------------------- def test_openai_responses_api_non_streaming_passthrough(): """Build a v1/responses API request (pass-through) and ensure gateway accepts it""" base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "") client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1") # Simple responses API request using a direct model (pass-through) resp = client.responses.create( model="gpt-4o", input="Hello via responses passthrough" ) # Print the response content - handle both responses format and chat completions format print(f"\n{'='*80}") print(f"Model: {resp.model}") print(f"Output: {resp.output_text}") print(f"{'='*80}\n") # Minimal sanity checks assert resp is not None assert ( getattr(resp, "id", None) is not None or getattr(resp, "output", None) is not None ) def test_openai_responses_api_with_streaming_passthrough(): """Build a v1/responses API streaming request (pass-through) and ensure gateway accepts it""" base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "") client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1") # Simple streaming responses API request using a direct model (pass-through) stream = client.responses.create( model="gpt-4o", input="Write a short haiku about coding", stream=True, ) # Collect streamed content using the official Responses API streaming shape text_chunks = [] final_message = None for event in stream: # The Python SDK surfaces a high-level Responses streaming interface. # We rely on its typed helpers instead of digging into model_extra. if getattr(event, "type", None) == "response.output_text.delta" and getattr( event, "delta", None ): # Each delta contains a text fragment text_chunks.append(event.delta) # Track the final response message if provided by the SDK if getattr(event, "type", None) == "response.completed" and getattr( event, "response", None ): final_message = event.response full_content = "".join(text_chunks) # Print the streaming response print(f"\n{'='*80}") print( f"Model: {getattr(final_message, 'model', 'unknown') if final_message else 'unknown'}" ) print(f"Streamed Output: {full_content}") print(f"{'='*80}\n") assert len(text_chunks) > 0, "Should have received streaming text deltas" assert len(full_content) > 0, "Should have received content" def test_openai_responses_api_non_streaming_with_tools_passthrough(): """Responses API with a function/tool definition (pass-through)""" base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "") client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1", max_retries=0) # Define a simple tool/function for the Responses API tools = [ { "type": "function", "name": "echo_tool", "description": "Echo back the provided input", "parameters": { "type": "object", "properties": {"text": {"type": "string"}}, "required": ["text"], }, } ] resp = client.responses.create( model="openai/gpt-5-mini-2025-08-07", input="Call the echo tool", tools=tools, ) assert resp is not None assert ( getattr(resp, "id", None) is not None or getattr(resp, "output", None) is not None ) def test_openai_responses_api_with_streaming_with_tools_passthrough(): """Responses API with a function/tool definition (streaming, pass-through)""" base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "") client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1", max_retries=0) tools = [ { "type": "function", "name": "echo_tool", "description": "Echo back the provided input", "parameters": { "type": "object", "properties": {"text": {"type": "string"}}, "required": ["text"], }, } ] stream = client.responses.create( model="openai/gpt-5-mini-2025-08-07", input="Call the echo tool", tools=tools, stream=True, ) text_chunks = [] tool_calls = [] for event in stream: etype = getattr(event, "type", None) # Collect streamed text output if etype == "response.output_text.delta" and getattr(event, "delta", None): text_chunks.append(event.delta) # Collect streamed tool call arguments if etype == "response.function_call_arguments.delta" and getattr( event, "delta", None ): tool_calls.append(event.delta) full_text = "".join(text_chunks) print(f"\n{'='*80}") print("Responses tools streaming test") print(f"Streamed text: {full_text}") print(f"Tool call argument chunks: {len(tool_calls)}") print(f"{'='*80}\n") # We expect either streamed text output or streamed tool-call arguments assert ( full_text or tool_calls ), "Expected streamed text or tool call argument deltas from Responses tools stream" def test_openai_responses_api_non_streaming_upstream_chat_completions(): """Send a v1/responses request using the grok alias to verify translation/routing""" base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "") client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1") resp = client.responses.create( model="arch.grok.v1", input="Hello, translate this via grok alias" ) # Print the response content - handle both responses format and chat completions format print(f"\n{'='*80}") print(f"Model: {resp.model}") print(f"Output: {resp.output_text}") print(f"{'='*80}\n") assert resp is not None assert resp.id is not None def test_openai_responses_api_with_streaming_upstream_chat_completions(): """Build a v1/responses API streaming request (pass-through) and ensure gateway accepts it""" base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "") client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1") # Simple streaming responses API request using a direct model (pass-through) stream = client.responses.create( model="arch.grok.v1", input="Write a short haiku about coding", stream=True, ) # Collect streamed content using the official Responses API streaming shape text_chunks = [] final_message = None for event in stream: # The Python SDK surfaces a high-level Responses streaming interface. # We rely on its typed helpers instead of digging into model_extra. if getattr(event, "type", None) == "response.output_text.delta" and getattr( event, "delta", None ): # Each delta contains a text fragment text_chunks.append(event.delta) # Track the final response message if provided by the SDK if getattr(event, "type", None) == "response.completed" and getattr( event, "response", None ): final_message = event.response full_content = "".join(text_chunks) # Print the streaming response print(f"\n{'='*80}") print( f"Model: {getattr(final_message, 'model', 'unknown') if final_message else 'unknown'}" ) print(f"Streamed Output: {full_content}") print(f"{'='*80}\n") assert len(text_chunks) > 0, "Should have received streaming text deltas" assert len(full_content) > 0, "Should have received content" def test_openai_responses_api_non_streaming_with_tools_upstream_chat_completions(): """Responses API wioutputling routed to grok via alias""" base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "") client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1") tools = [ { "type": "function", "name": "echo_tool", "description": "Echo back the provided input", "parameters": { "type": "object", "properties": {"text": {"type": "string"}}, "required": ["text"], }, } ] resp = client.responses.create( model="arch.grok.v1", input="Call the echo tool", tools=tools, ) assert resp.id is not None print(f"\n{'='*80}") print(f"Model: {resp.model}") print(f"Output: {resp.output_text}") print(f"{'='*80}\n") def test_openai_responses_api_streaming_with_tools_upstream_chat_completions(): """Responses API with a function/tool definition (streaming, pass-through)""" base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "") client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1", max_retries=0) tools = [ { "type": "function", "name": "echo_tool", "description": "Echo back the provided input", "parameters": { "type": "object", "properties": {"text": {"type": "string"}}, "required": ["text"], }, } ] stream = client.responses.create( model="arch.grok.v1", input="Call the echo tool", tools=tools, stream=True, ) text_chunks = [] tool_calls = [] for event in stream: etype = getattr(event, "type", None) # Collect streamed text output if etype == "response.output_text.delta" and getattr(event, "delta", None): text_chunks.append(event.delta) # Collect streamed tool call arguments if etype == "response.function_call_arguments.delta" and getattr( event, "delta", None ): tool_calls.append(event.delta) full_text = "".join(text_chunks) print(f"\n{'='*80}") print("Responses tools streaming test") print(f"Streamed text: {full_text}") print(f"Tool call argument chunks: {len(tool_calls)}") print(f"{'='*80}\n") # We expect either streamed text output or streamed tool-call arguments assert ( full_text or tool_calls ), "Expected streamed text or tool call argument deltas from Responses tools stream" @pytest.mark.skip("unreliable - bedrock tests are flaky in CI") def test_openai_responses_api_non_streaming_upstream_bedrock(): """Send a v1/responses request using the coding-model alias to verify Bedrock translation/routing""" base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "") client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1") resp = client.responses.create( model="coding-model", input="Hello, translate this via coding-model alias to Bedrock", ) # Print the response content - handle both responses format and chat completions format print(f"\n{'='*80}") print(f"Model: {resp.model}") print(f"Output: {resp.output_text}") print(f"{'='*80}\n") assert resp is not None assert resp.id is not None @pytest.mark.skip("unreliable - bedrock tests are flaky in CI") def test_openai_responses_api_with_streaming_upstream_bedrock(): """Build a v1/responses API streaming request routed to Bedrock via coding-model alias""" base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "") client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1") # Simple streaming responses API request using coding-model alias stream = client.responses.create( model="coding-model", input="Write a short haiku about coding", stream=True, ) # Collect streamed content using the official Responses API streaming shape text_chunks = [] final_message = None for event in stream: # The Python SDK surfaces a high-level Responses streaming interface. # We rely on its typed helpers instead of digging into model_extra. if getattr(event, "type", None) == "response.output_text.delta" and getattr( event, "delta", None ): # Each delta contains a text fragment text_chunks.append(event.delta) # Track the final response message if provided by the SDK if getattr(event, "type", None) == "response.completed" and getattr( event, "response", None ): final_message = event.response full_content = "".join(text_chunks) # Print the streaming response print(f"\n{'='*80}") print( f"Model: {getattr(final_message, 'model', 'unknown') if final_message else 'unknown'}" ) print(f"Streamed Output: {full_content}") print(f"{'='*80}\n") assert len(text_chunks) > 0, "Should have received streaming text deltas" assert len(full_content) > 0, "Should have received content" @pytest.mark.skip("unreliable - bedrock tests are flaky in CI") def test_openai_responses_api_non_streaming_with_tools_upstream_bedrock(): """Responses API with tools routed to Bedrock via coding-model alias""" base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "") client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1") tools = [ { "type": "function", "name": "echo_tool", "description": "Echo back the provided input", "parameters": { "type": "object", "properties": {"text": {"type": "string"}}, "required": ["text"], }, } ] resp = client.responses.create( model="coding-model", input="Call the echo tool", tools=tools, ) assert resp.id is not None print(f"\n{'='*80}") print(f"Model: {resp.model}") print(f"Output: {resp.output_text}") print(f"{'='*80}\n") @pytest.mark.skip("unreliable - bedrock tests are flaky in CI") def test_openai_responses_api_streaming_with_tools_upstream_bedrock(): """Responses API with a function/tool definition streaming to Bedrock via coding-model alias""" base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "") client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1", max_retries=0) tools = [ { "type": "function", "name": "echo_tool", "description": "Echo back the provided input", "parameters": { "type": "object", "properties": {"text": {"type": "string"}}, "required": ["text"], }, } ] stream = client.responses.create( model="coding-model", input="Call the echo tool", tools=tools, stream=True, ) text_chunks = [] tool_calls = [] for event in stream: etype = getattr(event, "type", None) # Collect streamed text output if etype == "response.output_text.delta" and getattr(event, "delta", None): text_chunks.append(event.delta) # Collect streamed tool call arguments if etype == "response.function_call_arguments.delta" and getattr( event, "delta", None ): tool_calls.append(event.delta) full_text = "".join(text_chunks) print(f"\n{'='*80}") print("Responses tools streaming test (Bedrock)") print(f"Streamed text: {full_text}") print(f"Tool call argument chunks: {len(tool_calls)}") print(f"{'='*80}\n") # We expect either streamed text output or streamed tool-call arguments assert ( full_text or tool_calls ), "Expected streamed text or tool call argument deltas from Responses tools stream" def test_openai_responses_api_non_streaming_upstream_anthropic(): """Send a v1/responses request using the grok alias to verify translation/routing""" base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "") client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1") resp = client.responses.create( model="claude-sonnet-4-20250514", input="Hello, translate this via grok alias" ) # Print the response content - handle both responses format and chat completions format print(f"\n{'='*80}") print(f"Model: {resp.model}") print(f"Output: {resp.output_text}") print(f"{'='*80}\n") assert resp is not None assert resp.id is not None def test_openai_responses_api_with_streaming_upstream_anthropic(): """Build a v1/responses API streaming request (pass-through) and ensure gateway accepts it""" base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "") client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1") # Simple streaming responses API request using a direct model (pass-through) stream = client.responses.create( model="claude-sonnet-4-20250514", input="Write a short haiku about coding", stream=True, ) # Collect streamed content using the official Responses API streaming shape text_chunks = [] final_message = None for event in stream: # The Python SDK surfaces a high-level Responses streaming interface. # We rely on its typed helpers instead of digging into model_extra. if getattr(event, "type", None) == "response.output_text.delta" and getattr( event, "delta", None ): # Each delta contains a text fragment text_chunks.append(event.delta) # Track the final response message if provided by the SDK if getattr(event, "type", None) == "response.completed" and getattr( event, "response", None ): final_message = event.response full_content = "".join(text_chunks) # Print the streaming response print(f"\n{'='*80}") print( f"Model: {getattr(final_message, 'model', 'unknown') if final_message else 'unknown'}" ) print(f"Streamed Output: {full_content}") print(f"{'='*80}\n") assert len(text_chunks) > 0, "Should have received streaming text deltas" assert len(full_content) > 0, "Should have received content" def test_openai_responses_api_non_streaming_with_tools_upstream_anthropic(): """Responses API with tools routed to grok via alias""" base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "") client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1") tools = [ { "type": "function", "name": "echo_tool", "description": "Echo back the provided input: hello_world", "parameters": { "type": "object", "properties": {"text": {"type": "string"}}, "required": ["text"], }, } ] resp = client.responses.create( model="claude-sonnet-4-20250514", input="Call the echo tool", tools=tools, ) assert resp.id is not None print(f"\n{'='*80}") print(f"Model: {resp.model}") print(f"Output: {resp.output_text}") print(f"{'='*80}\n") def test_openai_responses_api_streaming_with_tools_upstream_anthropic(): """Responses API with a function/tool definition (streaming, pass-through)""" base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "") client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1", max_retries=0) tools = [ { "type": "function", "name": "echo_tool", "description": "Echo back the provided input: hello_world", "parameters": { "type": "object", "properties": {"text": {"type": "string"}}, "required": ["text"], }, } ] stream = client.responses.create( model="claude-sonnet-4-20250514", input="Call the echo tool with hello_world", tools=tools, stream=True, ) text_chunks = [] tool_calls = [] for event in stream: etype = getattr(event, "type", None) # Collect streamed text output if etype == "response.output_text.delta" and getattr(event, "delta", None): text_chunks.append(event.delta) # Collect streamed tool call arguments if etype == "response.function_call_arguments.delta" and getattr( event, "delta", None ): tool_calls.append(event.delta) full_text = "".join(text_chunks) print(f"\n{'='*80}") print("Responses tools streaming test") print(f"Streamed text: {full_text}") print(f"Tool call argument chunks: {len(tool_calls)}") print(f"{'='*80}\n") # We expect either streamed text output or streamed tool-call arguments assert ( full_text or tool_calls ), "Expected streamed text or tool call argument deltas from Responses tools stream" def test_openai_responses_api_mixed_content_types(): """Test Responses API with mixed content types (string and array) in input messages""" base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "") client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1") # This test mimics the request that was failing: # One message with string content, another with array content resp = client.responses.create( model="openai/gpt-5-mini-2025-08-07", input=[ { "role": "developer", "content": "Generate a very short chat title (2-5 words max) based on the user's message.\n" "Rules:\n" "- Maximum 30 characters\n" "- No quotes, colons, hashtags, or markdown\n" "- Just the topic/intent, not a full sentence\n" '- If the message is a greeting like "hi" or "hello", respond with just "New conversation"\n' '- Be concise: "Weather in NYC" not "User asking about the weather in New York City"', }, { "role": "user", "content": [ {"type": "input_text", "text": "What is the weather in Seattle"} ], }, ], ) # Print the response print(f"\n{'='*80}") print(f"Model: {resp.model}") print(f"Output: {resp.output_text}") print(f"{'='*80}\n") assert resp is not None assert resp.id is not None # Verify we got a reasonable title assert len(resp.output_text) > 0