plano/tests/e2e/test_openai_responses_api_client.py

831 lines
29 KiB
Python

import openai
import pytest
import os
import logging
import sys
# Set up logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger(__name__)
LLM_GATEWAY_ENDPOINT = os.getenv(
"LLM_GATEWAY_ENDPOINT", "http://localhost:12000/v1/chat/completions"
)
# -----------------------
# v1/responses API tests
# -----------------------
def test_openai_responses_api_non_streaming_passthrough():
"""Build a v1/responses API request (pass-through) and ensure gateway accepts it"""
base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1")
# Simple responses API request using a direct model (pass-through)
resp = client.responses.create(
model="gpt-4o", input="Hello via responses passthrough"
)
# Print the response content - handle both responses format and chat completions format
print(f"\n{'='*80}")
print(f"Model: {resp.model}")
print(f"Output: {resp.output_text}")
print(f"{'='*80}\n")
# Minimal sanity checks
assert resp is not None
assert (
getattr(resp, "id", None) is not None
or getattr(resp, "output", None) is not None
)
def test_openai_responses_api_with_streaming_passthrough():
"""Build a v1/responses API streaming request (pass-through) and ensure gateway accepts it"""
base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1")
# Simple streaming responses API request using a direct model (pass-through)
stream = client.responses.create(
model="gpt-4o",
input="Write a short haiku about coding",
stream=True,
)
# Collect streamed content using the official Responses API streaming shape
text_chunks = []
final_message = None
for event in stream:
# The Python SDK surfaces a high-level Responses streaming interface.
# We rely on its typed helpers instead of digging into model_extra.
if getattr(event, "type", None) == "response.output_text.delta" and getattr(
event, "delta", None
):
# Each delta contains a text fragment
text_chunks.append(event.delta)
# Track the final response message if provided by the SDK
if getattr(event, "type", None) == "response.completed" and getattr(
event, "response", None
):
final_message = event.response
full_content = "".join(text_chunks)
# Print the streaming response
print(f"\n{'='*80}")
print(
f"Model: {getattr(final_message, 'model', 'unknown') if final_message else 'unknown'}"
)
print(f"Streamed Output: {full_content}")
print(f"{'='*80}\n")
assert len(text_chunks) > 0, "Should have received streaming text deltas"
assert len(full_content) > 0, "Should have received content"
def test_openai_responses_api_non_streaming_with_tools_passthrough():
"""Responses API with a function/tool definition (pass-through)"""
base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1", max_retries=0)
# Define a simple tool/function for the Responses API
tools = [
{
"type": "function",
"name": "echo_tool",
"description": "Echo back the provided input",
"parameters": {
"type": "object",
"properties": {"text": {"type": "string"}},
"required": ["text"],
},
}
]
resp = client.responses.create(
model="gpt-5",
input="Call the echo tool",
tools=tools,
)
assert resp is not None
assert (
getattr(resp, "id", None) is not None
or getattr(resp, "output", None) is not None
)
def test_openai_responses_api_with_streaming_with_tools_passthrough():
"""Responses API with a function/tool definition (streaming, pass-through)"""
base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1", max_retries=0)
tools = [
{
"type": "function",
"name": "echo_tool",
"description": "Echo back the provided input",
"parameters": {
"type": "object",
"properties": {"text": {"type": "string"}},
"required": ["text"],
},
}
]
stream = client.responses.create(
model="gpt-5",
input="Call the echo tool",
tools=tools,
stream=True,
)
text_chunks = []
tool_calls = []
for event in stream:
etype = getattr(event, "type", None)
# Collect streamed text output
if etype == "response.output_text.delta" and getattr(event, "delta", None):
text_chunks.append(event.delta)
# Collect streamed tool call arguments
if etype == "response.function_call_arguments.delta" and getattr(
event, "delta", None
):
tool_calls.append(event.delta)
full_text = "".join(text_chunks)
print(f"\n{'='*80}")
print("Responses tools streaming test")
print(f"Streamed text: {full_text}")
print(f"Tool call argument chunks: {len(tool_calls)}")
print(f"{'='*80}\n")
# We expect either streamed text output or streamed tool-call arguments
assert (
full_text or tool_calls
), "Expected streamed text or tool call argument deltas from Responses tools stream"
def test_openai_responses_api_non_streaming_upstream_chat_completions():
"""Send a v1/responses request using the grok alias to verify translation/routing"""
base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1")
resp = client.responses.create(
model="arch.grok.v1", input="Hello, translate this via grok alias"
)
# Print the response content - handle both responses format and chat completions format
print(f"\n{'='*80}")
print(f"Model: {resp.model}")
print(f"Output: {resp.output_text}")
print(f"{'='*80}\n")
assert resp is not None
assert resp.id is not None
def test_openai_responses_api_with_streaming_upstream_chat_completions():
"""Build a v1/responses API streaming request (pass-through) and ensure gateway accepts it"""
base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1")
# Simple streaming responses API request using a direct model (pass-through)
stream = client.responses.create(
model="arch.grok.v1",
input="Write a short haiku about coding",
stream=True,
)
# Collect streamed content using the official Responses API streaming shape
text_chunks = []
final_message = None
for event in stream:
# The Python SDK surfaces a high-level Responses streaming interface.
# We rely on its typed helpers instead of digging into model_extra.
if getattr(event, "type", None) == "response.output_text.delta" and getattr(
event, "delta", None
):
# Each delta contains a text fragment
text_chunks.append(event.delta)
# Track the final response message if provided by the SDK
if getattr(event, "type", None) == "response.completed" and getattr(
event, "response", None
):
final_message = event.response
full_content = "".join(text_chunks)
# Print the streaming response
print(f"\n{'='*80}")
print(
f"Model: {getattr(final_message, 'model', 'unknown') if final_message else 'unknown'}"
)
print(f"Streamed Output: {full_content}")
print(f"{'='*80}\n")
assert len(text_chunks) > 0, "Should have received streaming text deltas"
assert len(full_content) > 0, "Should have received content"
def test_openai_responses_api_non_streaming_with_tools_upstream_chat_completions():
"""Responses API wioutputling routed to grok via alias"""
base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1")
tools = [
{
"type": "function",
"name": "echo_tool",
"description": "Echo back the provided input",
"parameters": {
"type": "object",
"properties": {"text": {"type": "string"}},
"required": ["text"],
},
}
]
resp = client.responses.create(
model="arch.grok.v1",
input="Call the echo tool",
tools=tools,
)
assert resp.id is not None
print(f"\n{'='*80}")
print(f"Model: {resp.model}")
print(f"Output: {resp.output_text}")
print(f"{'='*80}\n")
def test_openai_responses_api_streaming_with_tools_upstream_chat_completions():
"""Responses API with a function/tool definition (streaming, pass-through)"""
base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1", max_retries=0)
tools = [
{
"type": "function",
"name": "echo_tool",
"description": "Echo back the provided input",
"parameters": {
"type": "object",
"properties": {"text": {"type": "string"}},
"required": ["text"],
},
}
]
stream = client.responses.create(
model="arch.grok.v1",
input="Call the echo tool",
tools=tools,
stream=True,
)
text_chunks = []
tool_calls = []
for event in stream:
etype = getattr(event, "type", None)
# Collect streamed text output
if etype == "response.output_text.delta" and getattr(event, "delta", None):
text_chunks.append(event.delta)
# Collect streamed tool call arguments
if etype == "response.function_call_arguments.delta" and getattr(
event, "delta", None
):
tool_calls.append(event.delta)
full_text = "".join(text_chunks)
print(f"\n{'='*80}")
print("Responses tools streaming test")
print(f"Streamed text: {full_text}")
print(f"Tool call argument chunks: {len(tool_calls)}")
print(f"{'='*80}\n")
# We expect either streamed text output or streamed tool-call arguments
assert (
full_text or tool_calls
), "Expected streamed text or tool call argument deltas from Responses tools stream"
def test_openai_responses_api_non_streaming_upstream_bedrock():
"""Send a v1/responses request using the coding-model alias to verify Bedrock translation/routing"""
base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1")
resp = client.responses.create(
model="coding-model",
input="Hello, translate this via coding-model alias to Bedrock",
)
# Print the response content - handle both responses format and chat completions format
print(f"\n{'='*80}")
print(f"Model: {resp.model}")
print(f"Output: {resp.output_text}")
print(f"{'='*80}\n")
assert resp is not None
assert resp.id is not None
def test_openai_responses_api_with_streaming_upstream_bedrock():
"""Build a v1/responses API streaming request routed to Bedrock via coding-model alias"""
base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1")
# Simple streaming responses API request using coding-model alias
stream = client.responses.create(
model="coding-model",
input="Write a short haiku about coding",
stream=True,
)
# Collect streamed content using the official Responses API streaming shape
text_chunks = []
final_message = None
for event in stream:
# The Python SDK surfaces a high-level Responses streaming interface.
# We rely on its typed helpers instead of digging into model_extra.
if getattr(event, "type", None) == "response.output_text.delta" and getattr(
event, "delta", None
):
# Each delta contains a text fragment
text_chunks.append(event.delta)
# Track the final response message if provided by the SDK
if getattr(event, "type", None) == "response.completed" and getattr(
event, "response", None
):
final_message = event.response
full_content = "".join(text_chunks)
# Print the streaming response
print(f"\n{'='*80}")
print(
f"Model: {getattr(final_message, 'model', 'unknown') if final_message else 'unknown'}"
)
print(f"Streamed Output: {full_content}")
print(f"{'='*80}\n")
assert len(text_chunks) > 0, "Should have received streaming text deltas"
assert len(full_content) > 0, "Should have received content"
def test_openai_responses_api_non_streaming_with_tools_upstream_bedrock():
"""Responses API with tools routed to Bedrock via coding-model alias"""
base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1")
tools = [
{
"type": "function",
"name": "echo_tool",
"description": "Echo back the provided input",
"parameters": {
"type": "object",
"properties": {"text": {"type": "string"}},
"required": ["text"],
},
}
]
resp = client.responses.create(
model="coding-model",
input="Call the echo tool",
tools=tools,
)
assert resp.id is not None
print(f"\n{'='*80}")
print(f"Model: {resp.model}")
print(f"Output: {resp.output_text}")
print(f"{'='*80}\n")
def test_openai_responses_api_streaming_with_tools_upstream_bedrock():
"""Responses API with a function/tool definition streaming to Bedrock via coding-model alias"""
base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1", max_retries=0)
tools = [
{
"type": "function",
"name": "echo_tool",
"description": "Echo back the provided input",
"parameters": {
"type": "object",
"properties": {"text": {"type": "string"}},
"required": ["text"],
},
}
]
stream = client.responses.create(
model="coding-model",
input="Call the echo tool",
tools=tools,
stream=True,
)
text_chunks = []
tool_calls = []
for event in stream:
etype = getattr(event, "type", None)
# Collect streamed text output
if etype == "response.output_text.delta" and getattr(event, "delta", None):
text_chunks.append(event.delta)
# Collect streamed tool call arguments
if etype == "response.function_call_arguments.delta" and getattr(
event, "delta", None
):
tool_calls.append(event.delta)
full_text = "".join(text_chunks)
print(f"\n{'='*80}")
print("Responses tools streaming test (Bedrock)")
print(f"Streamed text: {full_text}")
print(f"Tool call argument chunks: {len(tool_calls)}")
print(f"{'='*80}\n")
# We expect either streamed text output or streamed tool-call arguments
assert (
full_text or tool_calls
), "Expected streamed text or tool call argument deltas from Responses tools stream"
def test_openai_responses_api_non_streaming_upstream_anthropic():
"""Send a v1/responses request using the grok alias to verify translation/routing"""
base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1")
resp = client.responses.create(
model="claude-sonnet-4-20250514", input="Hello, translate this via grok alias"
)
# Print the response content - handle both responses format and chat completions format
print(f"\n{'='*80}")
print(f"Model: {resp.model}")
print(f"Output: {resp.output_text}")
print(f"{'='*80}\n")
assert resp is not None
assert resp.id is not None
def test_openai_responses_api_with_streaming_upstream_anthropic():
"""Build a v1/responses API streaming request (pass-through) and ensure gateway accepts it"""
base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1")
# Simple streaming responses API request using a direct model (pass-through)
stream = client.responses.create(
model="claude-sonnet-4-20250514",
input="Write a short haiku about coding",
stream=True,
)
# Collect streamed content using the official Responses API streaming shape
text_chunks = []
final_message = None
for event in stream:
# The Python SDK surfaces a high-level Responses streaming interface.
# We rely on its typed helpers instead of digging into model_extra.
if getattr(event, "type", None) == "response.output_text.delta" and getattr(
event, "delta", None
):
# Each delta contains a text fragment
text_chunks.append(event.delta)
# Track the final response message if provided by the SDK
if getattr(event, "type", None) == "response.completed" and getattr(
event, "response", None
):
final_message = event.response
full_content = "".join(text_chunks)
# Print the streaming response
print(f"\n{'='*80}")
print(
f"Model: {getattr(final_message, 'model', 'unknown') if final_message else 'unknown'}"
)
print(f"Streamed Output: {full_content}")
print(f"{'='*80}\n")
assert len(text_chunks) > 0, "Should have received streaming text deltas"
assert len(full_content) > 0, "Should have received content"
def test_openai_responses_api_non_streaming_with_tools_upstream_anthropic():
"""Responses API with tools routed to grok via alias"""
base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1")
tools = [
{
"type": "function",
"name": "echo_tool",
"description": "Echo back the provided input: hello_world",
"parameters": {
"type": "object",
"properties": {"text": {"type": "string"}},
"required": ["text"],
},
}
]
resp = client.responses.create(
model="claude-sonnet-4-20250514",
input="Call the echo tool",
tools=tools,
)
assert resp.id is not None
print(f"\n{'='*80}")
print(f"Model: {resp.model}")
print(f"Output: {resp.output_text}")
print(f"{'='*80}\n")
def test_openai_responses_api_streaming_with_tools_upstream_anthropic():
"""Responses API with a function/tool definition (streaming, pass-through)"""
base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1", max_retries=0)
tools = [
{
"type": "function",
"name": "echo_tool",
"description": "Echo back the provided input: hello_world",
"parameters": {
"type": "object",
"properties": {"text": {"type": "string"}},
"required": ["text"],
},
}
]
stream = client.responses.create(
model="claude-sonnet-4-20250514",
input="Call the echo tool",
tools=tools,
stream=True,
)
text_chunks = []
tool_calls = []
for event in stream:
etype = getattr(event, "type", None)
# Collect streamed text output
if etype == "response.output_text.delta" and getattr(event, "delta", None):
text_chunks.append(event.delta)
# Collect streamed tool call arguments
if etype == "response.function_call_arguments.delta" and getattr(
event, "delta", None
):
tool_calls.append(event.delta)
full_text = "".join(text_chunks)
print(f"\n{'='*80}")
print("Responses tools streaming test")
print(f"Streamed text: {full_text}")
print(f"Tool call argument chunks: {len(tool_calls)}")
print(f"{'='*80}\n")
# We expect either streamed text output or streamed tool-call arguments
assert (
full_text or tool_calls
), "Expected streamed text or tool call argument deltas from Responses tools stream"
def test_conversation_state_management_two_turn():
"""
Test conversation state management across two turns:
1. Send initial message to non-OpenAI model via v1/responses
2. Capture response_id from first response
3. Send second message with previous_response_id
4. Verify model receives both messages in correct order
"""
base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1")
logger.info("\n" + "=" * 80)
logger.info("TEST: Conversation State Management - Two Turn Flow")
logger.info("=" * 80)
# Turn 1: Send initial message to Anthropic (non-OpenAI model)
logger.info("\n[TURN 1] Sending initial message...")
resp1 = client.responses.create(
model="claude-sonnet-4-20250514",
input="My name is Alice and I like pizza.",
)
# Extract response_id from first response
response_id_1 = resp1.id
logger.info(f"[TURN 1] Received response_id: {response_id_1}")
logger.info(f"[TURN 1] Model response: {resp1.output_text}")
assert response_id_1 is not None, "First response should have an id"
assert len(resp1.output_text) > 0, "First response should have content"
# Turn 2: Send follow-up message with previous_response_id
# Ask the model to list all messages to verify state was combined
logger.info(
f"\n[TURN 2] Sending follow-up with previous_response_id={response_id_1}"
)
resp2 = client.responses.create(
model="claude-sonnet-4-20250514",
input="Please list all the messages you have received in our conversation, numbering each one.",
previous_response_id=response_id_1,
)
response_id_2 = resp2.id
logger.info(f"[TURN 2] Received response_id: {response_id_2}")
logger.info(f"[TURN 2] Model response: {resp2.output_text}")
assert response_id_2 is not None, "Second response should have an id"
assert response_id_2 != response_id_1, "Second response should have different id"
# Verify the model received the conversation history
# The response should reference both the initial message and the follow-up
response_lower = resp2.output_text.lower()
# Check if the model acknowledges receiving multiple messages
# Different models might format this differently, so we check for various indicators
has_conversation_context = (
"alice" in response_lower
or "pizza" in response_lower # References the name from turn 1
or "two" in response_lower # References the preference from turn 1
or "2" in response_lower # Mentions number of messages
or "first" in response_lower # Numeric indicator
or "second" # References first message
in response_lower # References second message
)
logger.info(
f"\n[VALIDATION] Conversation context preserved: {has_conversation_context}"
)
logger.info(
f"[VALIDATION] Response contains conversation markers: {has_conversation_context}"
)
print(f"\n{'='*80}")
print("Conversation State Test Results:")
print(f"Turn 1 Response ID: {response_id_1}")
print(f"Turn 2 Response ID: {response_id_2}")
print(f"Turn 1 Output: {resp1.output_text[:100]}...")
print(f"Turn 2 Output: {resp2.output_text}")
print(f"Conversation Context Preserved: {has_conversation_context}")
print(f"{'='*80}\n")
assert has_conversation_context, (
f"Model should have received conversation history. "
f"Response: {resp2.output_text}"
)
def test_conversation_state_management_two_turn_streaming():
"""
Test conversation state management across two turns with streaming:
1. Send initial streaming message to non-OpenAI model via v1/responses
2. Capture response_id from first response
3. Send second streaming message with previous_response_id
4. Verify model receives both messages in correct order
"""
base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1")
logger.info("\n" + "=" * 80)
logger.info("TEST: Conversation State Management - Two Turn Streaming Flow")
logger.info("=" * 80)
# Turn 1: Send initial streaming message to Anthropic (non-OpenAI model)
logger.info("\n[TURN 1] Sending initial streaming message...")
stream1 = client.responses.create(
model="claude-sonnet-4-20250514",
input="My name is Alice and I like pizza.",
stream=True,
)
# Collect streamed content and capture response_id
text_chunks_1 = []
response_id_1 = None
for event in stream1:
if getattr(event, "type", None) == "response.output_text.delta" and getattr(
event, "delta", None
):
text_chunks_1.append(event.delta)
# Capture response_id from response.completed event
if getattr(event, "type", None) == "response.completed" and getattr(
event, "response", None
):
response_id_1 = event.response.id
output_1 = "".join(text_chunks_1)
logger.info(f"[TURN 1] Received response_id: {response_id_1}")
logger.info(f"[TURN 1] Model response: {output_1}")
assert response_id_1 is not None, "First response should have an id"
assert len(output_1) > 0, "First response should have content"
# Turn 2: Send follow-up streaming message with previous_response_id
logger.info(
f"\n[TURN 2] Sending follow-up streaming request with previous_response_id={response_id_1}"
)
stream2 = client.responses.create(
model="claude-sonnet-4-20250514",
input="Please list all the messages you have received in our conversation, numbering each one.",
previous_response_id=response_id_1,
stream=True,
)
# Collect streamed content from second response
text_chunks_2 = []
response_id_2 = None
for event in stream2:
if getattr(event, "type", None) == "response.output_text.delta" and getattr(
event, "delta", None
):
text_chunks_2.append(event.delta)
# Capture response_id from response.completed event
if getattr(event, "type", None) == "response.completed" and getattr(
event, "response", None
):
response_id_2 = event.response.id
output_2 = "".join(text_chunks_2)
logger.info(f"[TURN 2] Received response_id: {response_id_2}")
logger.info(f"[TURN 2] Model response: {output_2}")
assert response_id_2 is not None, "Second response should have an id"
assert response_id_2 != response_id_1, "Second response should have different id"
# Verify the model received the conversation history
response_lower = output_2.lower()
# Check if the model acknowledges receiving multiple messages
has_conversation_context = (
"alice" in response_lower
or "pizza" in response_lower # References the name from turn 1
or "two" in response_lower # References the preference from turn 1
or "2" in response_lower # Mentions number of messages
or "first" in response_lower # Numeric indicator
or "second" # References first message
in response_lower # References second message
)
logger.info(
f"\n[VALIDATION] Conversation context preserved: {has_conversation_context}"
)
logger.info(
f"[VALIDATION] Response contains conversation markers: {has_conversation_context}"
)
print(f"\n{'='*80}")
print("Streaming Conversation State Test Results:")
print(f"Turn 1 Response ID: {response_id_1}")
print(f"Turn 2 Response ID: {response_id_2}")
print(f"Turn 1 Output: {output_1[:100]}...")
print(f"Turn 2 Output: {output_2}")
print(f"Conversation Context Preserved: {has_conversation_context}")
print(f"{'='*80}\n")
assert has_conversation_context, (
f"Model should have received conversation history. " f"Response: {output_2}"
)