2025-12-03 14:58:26 -08:00
|
|
|
import openai
|
|
|
|
|
import pytest
|
|
|
|
|
import os
|
|
|
|
|
import logging
|
|
|
|
|
import sys
|
|
|
|
|
|
|
|
|
|
# Configure logging at import time: records at INFO and above are written to
# stdout, prefixed with timestamp, logger name and level.
logging.basicConfig(
    handlers=[logging.StreamHandler(sys.stdout)],
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-level logger shared by the tests in this file.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
# Chat-completions URL of the gateway under test. Override it with the
# LLM_GATEWAY_ENDPOINT environment variable; otherwise the localhost default
# below is used.
LLM_GATEWAY_ENDPOINT = os.environ.get(
    "LLM_GATEWAY_ENDPOINT",
    "http://localhost:12000/v1/chat/completions",
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# -----------------------
|
|
|
|
|
# v1/responses API tests
|
|
|
|
|
# -----------------------
|
|
|
|
|
def test_openai_responses_api_non_streaming_passthrough():
    """Send a non-streaming v1/responses request for ``gpt-4o`` and sanity-check the reply."""
    # Derive the gateway root from the chat-completions URL, then point the
    # SDK at its /v1 prefix.
    gateway_root = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    client = openai.OpenAI(api_key="test-key", base_url=gateway_root + "/v1")

    # Plain text request addressed to a concrete model name.
    resp = client.responses.create(
        model="gpt-4o",
        input="Hello via responses passthrough",
    )

    # Show the model name and text output between two 80-column rulers.
    ruler = "=" * 80
    print(f"\n{ruler}")
    print(f"Model: {resp.model}")
    print(f"Output: {resp.output_text}")
    print(f"{ruler}\n")

    # Minimal sanity checks: the response must expose an id or an output.
    assert resp is not None
    assert not (
        getattr(resp, "id", None) is None and getattr(resp, "output", None) is None
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_openai_responses_api_with_streaming_passthrough():
    """Stream a v1/responses request for ``gpt-4o`` and require text deltas."""
    gateway_root = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    client = openai.OpenAI(api_key="test-key", base_url=gateway_root + "/v1")

    # Streaming request addressed to a concrete model name.
    stream = client.responses.create(
        model="gpt-4o",
        input="Write a short haiku about coding",
        stream=True,
    )

    text_chunks = []
    final_message = None

    for event in stream:
        kind = getattr(event, "type", None)
        if kind == "response.output_text.delta":
            # Keep only non-empty text fragments.
            fragment = getattr(event, "delta", None)
            if fragment:
                text_chunks.append(fragment)
        elif kind == "response.completed":
            # The completed event carries the final response object.
            completed = getattr(event, "response", None)
            if completed:
                final_message = completed

    full_content = "".join(text_chunks)

    if final_message:
        model_name = getattr(final_message, "model", "unknown")
    else:
        model_name = "unknown"

    ruler = "=" * 80
    print(f"\n{ruler}")
    print(f"Model: {model_name}")
    print(f"Streamed Output: {full_content}")
    print(f"{ruler}\n")

    assert text_chunks, "Should have received streaming text deltas"
    assert full_content, "Should have received content"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_openai_responses_api_non_streaming_with_tools_passthrough():
    """Send a non-streaming v1/responses request for ``gpt-5`` that declares one function tool."""
    gateway_root = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    # Retries are disabled so a gateway failure surfaces on the first attempt.
    client = openai.OpenAI(
        api_key="test-key",
        base_url=gateway_root + "/v1",
        max_retries=0,
    )

    # JSON schema for the single required string argument of the tool.
    echo_parameters = {
        "type": "object",
        "properties": {"text": {"type": "string"}},
        "required": ["text"],
    }
    echo_tool = {
        "type": "function",
        "name": "echo_tool",
        "description": "Echo back the provided input",
        "parameters": echo_parameters,
    }

    resp = client.responses.create(
        model="gpt-5",
        input="Call the echo tool",
        tools=[echo_tool],
    )

    # Minimal sanity checks: the response must expose an id or an output.
    assert resp is not None
    assert not (
        getattr(resp, "id", None) is None and getattr(resp, "output", None) is None
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_openai_responses_api_with_streaming_with_tools_passthrough():
    """Stream a v1/responses request for ``gpt-5`` that declares one function tool."""
    gateway_root = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    # Retries are disabled so a gateway failure surfaces on the first attempt.
    client = openai.OpenAI(
        api_key="test-key",
        base_url=gateway_root + "/v1",
        max_retries=0,
    )

    echo_tool = {
        "type": "function",
        "name": "echo_tool",
        "description": "Echo back the provided input",
        "parameters": {
            "type": "object",
            "properties": {"text": {"type": "string"}},
            "required": ["text"],
        },
    }

    stream = client.responses.create(
        model="gpt-5",
        input="Call the echo tool",
        tools=[echo_tool],
        stream=True,
    )

    text_chunks = []
    tool_calls = []
    # Route each delta event type to the list that collects its fragments.
    sinks = {
        "response.output_text.delta": text_chunks,
        "response.function_call_arguments.delta": tool_calls,
    }

    for event in stream:
        sink = sinks.get(getattr(event, "type", None))
        if sink is None:
            continue
        fragment = getattr(event, "delta", None)
        if fragment:
            sink.append(fragment)

    full_text = "".join(text_chunks)

    ruler = "=" * 80
    print(f"\n{ruler}")
    print("Responses tools streaming test")
    print(f"Streamed text: {full_text}")
    print(f"Tool call argument chunks: {len(tool_calls)}")
    print(f"{ruler}\n")

    # Either streamed text or streamed tool-call arguments is acceptable.
    got_output = bool(full_text) or bool(tool_calls)
    assert (
        got_output
    ), "Expected streamed text or tool call argument deltas from Responses tools stream"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_openai_responses_api_non_streaming_upstream_chat_completions():
    """Send a non-streaming v1/responses request through the ``arch.grok.v1`` alias."""
    gateway_root = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    client = openai.OpenAI(api_key="test-key", base_url=gateway_root + "/v1")

    resp = client.responses.create(
        model="arch.grok.v1",
        input="Hello, translate this via grok alias",
    )

    # Show the model name and text output between two 80-column rulers.
    ruler = "=" * 80
    print(f"\n{ruler}")
    print(f"Model: {resp.model}")
    print(f"Output: {resp.output_text}")
    print(f"{ruler}\n")

    assert resp is not None
    assert resp.id is not None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_openai_responses_api_with_streaming_upstream_chat_completions():
    """Stream a v1/responses request through the ``arch.grok.v1`` alias and require text deltas.

    Unlike the pass-through tests, the request names a gateway alias rather
    than a concrete model.
    """
    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1")

    # Streaming request addressed to the grok alias.
    stream = client.responses.create(
        model="arch.grok.v1",
        input="Write a short haiku about coding",
        stream=True,
    )

    text_chunks = []
    final_message = None

    for event in stream:
        event_type = getattr(event, "type", None)
        if event_type == "response.output_text.delta":
            # Keep only non-empty text fragments.
            delta = getattr(event, "delta", None)
            if delta:
                text_chunks.append(delta)
        elif event_type == "response.completed":
            # The completed event carries the final response object.
            response = getattr(event, "response", None)
            if response:
                final_message = response

    full_content = "".join(text_chunks)
    model_name = (
        getattr(final_message, "model", "unknown") if final_message else "unknown"
    )

    print(f"\n{'='*80}")
    print(f"Model: {model_name}")
    print(f"Streamed Output: {full_content}")
    print(f"{'='*80}\n")

    assert len(text_chunks) > 0, "Should have received streaming text deltas"
    assert len(full_content) > 0, "Should have received content"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_openai_responses_api_non_streaming_with_tools_upstream_chat_completions():
    """Send a non-streaming v1/responses request with one function tool through the ``arch.grok.v1`` alias."""
    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1")

    # One function tool taking a single required string argument.
    tools = [
        {
            "type": "function",
            "name": "echo_tool",
            "description": "Echo back the provided input",
            "parameters": {
                "type": "object",
                "properties": {"text": {"type": "string"}},
                "required": ["text"],
            },
        }
    ]

    resp = client.responses.create(
        model="arch.grok.v1",
        input="Call the echo tool",
        tools=tools,
    )

    # Only the presence of a response id is required; the output is printed
    # for inspection.
    assert resp.id is not None

    print(f"\n{'='*80}")
    print(f"Model: {resp.model}")
    print(f"Output: {resp.output_text}")
    print(f"{'='*80}\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_openai_responses_api_streaming_with_tools_upstream_chat_completions():
    """Stream a v1/responses request with one function tool through the ``arch.grok.v1`` alias."""
    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    # Retries are disabled so a gateway failure surfaces on the first attempt.
    client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1", max_retries=0)

    # One function tool taking a single required string argument.
    tools = [
        {
            "type": "function",
            "name": "echo_tool",
            "description": "Echo back the provided input",
            "parameters": {
                "type": "object",
                "properties": {"text": {"type": "string"}},
                "required": ["text"],
            },
        }
    ]

    stream = client.responses.create(
        model="arch.grok.v1",
        input="Call the echo tool",
        tools=tools,
        stream=True,
    )

    text_chunks = []
    tool_calls = []

    for event in stream:
        etype = getattr(event, "type", None)
        delta = getattr(event, "delta", None)
        if not delta:
            # Events without a non-empty delta carry nothing to collect.
            continue
        if etype == "response.output_text.delta":
            text_chunks.append(delta)
        elif etype == "response.function_call_arguments.delta":
            tool_calls.append(delta)

    full_text = "".join(text_chunks)

    print(f"\n{'='*80}")
    print("Responses tools streaming test")
    print(f"Streamed text: {full_text}")
    print(f"Tool call argument chunks: {len(tool_calls)}")
    print(f"{'='*80}\n")

    # Either streamed text or streamed tool-call arguments is acceptable.
    assert (
        full_text or tool_calls
    ), "Expected streamed text or tool call argument deltas from Responses tools stream"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_openai_responses_api_non_streaming_upstream_bedrock():
    """Send a non-streaming v1/responses request through the ``coding-model`` alias (Bedrock route)."""
    gateway_root = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    client = openai.OpenAI(api_key="test-key", base_url=gateway_root + "/v1")

    request = {
        "model": "coding-model",
        "input": "Hello, translate this via coding-model alias to Bedrock",
    }
    resp = client.responses.create(**request)

    # Show the model name and text output between two 80-column rulers.
    ruler = "=" * 80
    print(f"\n{ruler}")
    print(f"Model: {resp.model}")
    print(f"Output: {resp.output_text}")
    print(f"{ruler}\n")

    assert resp is not None
    assert resp.id is not None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_openai_responses_api_with_streaming_upstream_bedrock():
    """Stream a v1/responses request through the ``coding-model`` alias (Bedrock route) and require text deltas."""
    gateway_root = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    client = openai.OpenAI(api_key="test-key", base_url=gateway_root + "/v1")

    # Streaming request addressed to the coding-model alias.
    stream = client.responses.create(
        model="coding-model",
        input="Write a short haiku about coding",
        stream=True,
    )

    text_chunks = []
    final_message = None

    for event in stream:
        kind = getattr(event, "type", None)
        if kind == "response.output_text.delta":
            # Keep only non-empty text fragments.
            fragment = getattr(event, "delta", None)
            if fragment:
                text_chunks.append(fragment)
        elif kind == "response.completed":
            # The completed event carries the final response object.
            completed = getattr(event, "response", None)
            if completed:
                final_message = completed

    full_content = "".join(text_chunks)

    model_name = "unknown"
    if final_message:
        model_name = getattr(final_message, "model", "unknown")

    ruler = "=" * 80
    print(f"\n{ruler}")
    print(f"Model: {model_name}")
    print(f"Streamed Output: {full_content}")
    print(f"{ruler}\n")

    assert text_chunks, "Should have received streaming text deltas"
    assert full_content, "Should have received content"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_openai_responses_api_non_streaming_with_tools_upstream_bedrock():
    """Send a non-streaming v1/responses request with one function tool through the ``coding-model`` alias."""
    gateway_root = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    client = openai.OpenAI(api_key="test-key", base_url=gateway_root + "/v1")

    # JSON schema for the single required string argument of the tool.
    echo_parameters = {
        "type": "object",
        "properties": {"text": {"type": "string"}},
        "required": ["text"],
    }
    echo_tool = {
        "type": "function",
        "name": "echo_tool",
        "description": "Echo back the provided input",
        "parameters": echo_parameters,
    }

    resp = client.responses.create(
        model="coding-model",
        input="Call the echo tool",
        tools=[echo_tool],
    )

    # Only the presence of a response id is required.
    assert resp.id is not None

    ruler = "=" * 80
    print(f"\n{ruler}")
    print(f"Model: {resp.model}")
    print(f"Output: {resp.output_text}")
    print(f"{ruler}\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_openai_responses_api_streaming_with_tools_upstream_bedrock():
    """Stream a v1/responses request with one function tool through the ``coding-model`` alias (Bedrock route)."""
    gateway_root = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    # Retries are disabled so a gateway failure surfaces on the first attempt.
    client = openai.OpenAI(
        api_key="test-key",
        base_url=gateway_root + "/v1",
        max_retries=0,
    )

    echo_tool = {
        "type": "function",
        "name": "echo_tool",
        "description": "Echo back the provided input",
        "parameters": {
            "type": "object",
            "properties": {"text": {"type": "string"}},
            "required": ["text"],
        },
    }

    stream = client.responses.create(
        model="coding-model",
        input="Call the echo tool",
        tools=[echo_tool],
        stream=True,
    )

    text_chunks = []
    tool_calls = []
    # Route each delta event type to the list that collects its fragments.
    sinks = {
        "response.output_text.delta": text_chunks,
        "response.function_call_arguments.delta": tool_calls,
    }

    for event in stream:
        sink = sinks.get(getattr(event, "type", None))
        if sink is None:
            continue
        fragment = getattr(event, "delta", None)
        if fragment:
            sink.append(fragment)

    full_text = "".join(text_chunks)

    ruler = "=" * 80
    print(f"\n{ruler}")
    print("Responses tools streaming test (Bedrock)")
    print(f"Streamed text: {full_text}")
    print(f"Tool call argument chunks: {len(tool_calls)}")
    print(f"{ruler}\n")

    # Either streamed text or streamed tool-call arguments is acceptable.
    got_output = bool(full_text) or bool(tool_calls)
    assert (
        got_output
    ), "Expected streamed text or tool call argument deltas from Responses tools stream"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_openai_responses_api_non_streaming_upstream_anthropic():
    """Send a non-streaming v1/responses request for ``claude-sonnet-4-20250514`` and require a response id."""
    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1")

    # NOTE(review): the prompt text still mentions the grok alias (copied from
    # the grok test); it is only message content and does not affect routing.
    resp = client.responses.create(
        model="claude-sonnet-4-20250514", input="Hello, translate this via grok alias"
    )

    # Show the model name and text output for manual inspection.
    print(f"\n{'='*80}")
    print(f"Model: {resp.model}")
    print(f"Output: {resp.output_text}")
    print(f"{'='*80}\n")

    assert resp is not None
    assert resp.id is not None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_openai_responses_api_with_streaming_upstream_anthropic():
    """Stream a v1/responses request for ``claude-sonnet-4-20250514`` and require text deltas."""
    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1")

    # Streaming request addressed to the Claude model.
    stream = client.responses.create(
        model="claude-sonnet-4-20250514",
        input="Write a short haiku about coding",
        stream=True,
    )

    text_chunks = []
    final_message = None

    for event in stream:
        event_type = getattr(event, "type", None)
        if event_type == "response.output_text.delta":
            # Keep only non-empty text fragments.
            delta = getattr(event, "delta", None)
            if delta:
                text_chunks.append(delta)
        elif event_type == "response.completed":
            # The completed event carries the final response object.
            response = getattr(event, "response", None)
            if response:
                final_message = response

    full_content = "".join(text_chunks)
    model_name = (
        getattr(final_message, "model", "unknown") if final_message else "unknown"
    )

    print(f"\n{'='*80}")
    print(f"Model: {model_name}")
    print(f"Streamed Output: {full_content}")
    print(f"{'='*80}\n")

    assert len(text_chunks) > 0, "Should have received streaming text deltas"
    assert len(full_content) > 0, "Should have received content"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_openai_responses_api_non_streaming_with_tools_upstream_anthropic():
    """Send a non-streaming v1/responses request with one function tool for ``claude-sonnet-4-20250514``."""
    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1")

    # One function tool taking a single required string argument.
    tools = [
        {
            "type": "function",
            "name": "echo_tool",
            "description": "Echo back the provided input: hello_world",
            "parameters": {
                "type": "object",
                "properties": {"text": {"type": "string"}},
                "required": ["text"],
            },
        }
    ]

    resp = client.responses.create(
        model="claude-sonnet-4-20250514",
        input="Call the echo tool",
        tools=tools,
    )

    # Only the presence of a response id is required; the output is printed
    # for inspection.
    assert resp.id is not None

    print(f"\n{'='*80}")
    print(f"Model: {resp.model}")
    print(f"Output: {resp.output_text}")
    print(f"{'='*80}\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_openai_responses_api_streaming_with_tools_upstream_anthropic():
    """Stream a v1/responses request with one function tool for ``claude-sonnet-4-20250514``."""
    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    # Retries are disabled so a gateway failure surfaces on the first attempt.
    client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1", max_retries=0)

    # One function tool taking a single required string argument.
    tools = [
        {
            "type": "function",
            "name": "echo_tool",
            "description": "Echo back the provided input: hello_world",
            "parameters": {
                "type": "object",
                "properties": {"text": {"type": "string"}},
                "required": ["text"],
            },
        }
    ]

    stream = client.responses.create(
        model="claude-sonnet-4-20250514",
        input="Call the echo tool",
        tools=tools,
        stream=True,
    )

    text_chunks = []
    tool_calls = []

    for event in stream:
        etype = getattr(event, "type", None)
        delta = getattr(event, "delta", None)
        if not delta:
            # Events without a non-empty delta carry nothing to collect.
            continue
        if etype == "response.output_text.delta":
            text_chunks.append(delta)
        elif etype == "response.function_call_arguments.delta":
            tool_calls.append(delta)

    full_text = "".join(text_chunks)

    print(f"\n{'='*80}")
    print("Responses tools streaming test")
    print(f"Streamed text: {full_text}")
    print(f"Tool call argument chunks: {len(tool_calls)}")
    print(f"{'='*80}\n")

    # Either streamed text or streamed tool-call arguments is acceptable.
    assert (
        full_text or tool_calls
    ), "Expected streamed text or tool call argument deltas from Responses tools stream"
|
2025-12-14 22:21:00 -08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_conversation_state_management_two_turn():
    """
    Test conversation state management across two turns:

    1. Send an initial message to ``claude-sonnet-4-20250514`` via v1/responses.
    2. Capture the response id of the first response.
    3. Send a second message that passes it as ``previous_response_id``.
    4. Verify the second reply mentions a fact that was stated only in turn 1.
    """
    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1")

    logger.info("\n" + "=" * 80)
    logger.info("TEST: Conversation State Management - Two Turn Flow")
    logger.info("=" * 80)

    # Turn 1: state two facts (a name and a food preference) to look for later.
    logger.info("\n[TURN 1] Sending initial message...")
    resp1 = client.responses.create(
        model="claude-sonnet-4-20250514",
        input="My name is Alice and I like pizza.",
    )

    response_id_1 = resp1.id
    logger.info("[TURN 1] Received response_id: %s", response_id_1)
    logger.info("[TURN 1] Model response: %s", resp1.output_text)

    assert response_id_1 is not None, "First response should have an id"
    assert len(resp1.output_text) > 0, "First response should have content"

    # Turn 2: the follow-up contains neither fact, so either one appearing in
    # the reply must come from the history linked by previous_response_id.
    logger.info(
        "\n[TURN 2] Sending follow-up with previous_response_id=%s", response_id_1
    )
    resp2 = client.responses.create(
        model="claude-sonnet-4-20250514",
        input="Please list all the messages you have received in our conversation, numbering each one.",
        previous_response_id=response_id_1,
    )

    response_id_2 = resp2.id
    logger.info("[TURN 2] Received response_id: %s", response_id_2)
    logger.info("[TURN 2] Model response: %s", resp2.output_text)

    assert response_id_2 is not None, "Second response should have an id"
    assert response_id_2 != response_id_1, "Second response should have different id"

    response_lower = resp2.output_text.lower()

    # Only turn-1 facts count as evidence. Generic markers such as "2",
    # "first" or "second" also appear in a numbered list produced without any
    # history, so they would let the test pass when state was lost.
    turn_one_facts = ("alice", "pizza")
    has_conversation_context = any(fact in response_lower for fact in turn_one_facts)

    logger.info(
        "\n[VALIDATION] Conversation context preserved: %s", has_conversation_context
    )

    print(f"\n{'='*80}")
    print("Conversation State Test Results:")
    print(f"Turn 1 Response ID: {response_id_1}")
    print(f"Turn 2 Response ID: {response_id_2}")
    print(f"Turn 1 Output: {resp1.output_text[:100]}...")
    print(f"Turn 2 Output: {resp2.output_text}")
    print(f"Conversation Context Preserved: {has_conversation_context}")
    print(f"{'='*80}\n")

    assert has_conversation_context, (
        "Model should have received conversation history. "
        f"Response: {resp2.output_text}"
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _collect_streamed_turn(stream):
    """Drain a Responses event stream and return ``(text, response_id)``.

    ``text`` is the concatenation of all non-empty ``response.output_text.delta``
    fragments. ``response_id`` is the id of the response attached to the
    ``response.completed`` event, or ``None`` if no such event was seen.
    """
    chunks = []
    response_id = None
    for event in stream:
        event_type = getattr(event, "type", None)
        if event_type == "response.output_text.delta":
            delta = getattr(event, "delta", None)
            if delta:
                chunks.append(delta)
        elif event_type == "response.completed":
            response = getattr(event, "response", None)
            if response:
                response_id = response.id
    return "".join(chunks), response_id


def test_conversation_state_management_two_turn_streaming():
    """
    Test conversation state management across two turns with streaming:

    1. Send an initial streaming message to ``claude-sonnet-4-20250514`` via v1/responses.
    2. Capture the response id from the first stream's ``response.completed`` event.
    3. Send a second streaming message that passes it as ``previous_response_id``.
    4. Verify the second reply mentions a fact that was stated only in turn 1.
    """
    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
    client = openai.OpenAI(api_key="test-key", base_url=f"{base_url}/v1")

    logger.info("\n" + "=" * 80)
    logger.info("TEST: Conversation State Management - Two Turn Streaming Flow")
    logger.info("=" * 80)

    # Turn 1: state two facts (a name and a food preference) to look for later.
    logger.info("\n[TURN 1] Sending initial streaming message...")
    stream1 = client.responses.create(
        model="claude-sonnet-4-20250514",
        input="My name is Alice and I like pizza.",
        stream=True,
    )

    output_1, response_id_1 = _collect_streamed_turn(stream1)
    logger.info("[TURN 1] Received response_id: %s", response_id_1)
    logger.info("[TURN 1] Model response: %s", output_1)

    assert response_id_1 is not None, "First response should have an id"
    assert len(output_1) > 0, "First response should have content"

    # Turn 2: the follow-up contains neither fact, so either one appearing in
    # the reply must come from the history linked by previous_response_id.
    logger.info(
        "\n[TURN 2] Sending follow-up streaming request with previous_response_id=%s",
        response_id_1,
    )
    stream2 = client.responses.create(
        model="claude-sonnet-4-20250514",
        input="Please list all the messages you have received in our conversation, numbering each one.",
        previous_response_id=response_id_1,
        stream=True,
    )

    output_2, response_id_2 = _collect_streamed_turn(stream2)
    logger.info("[TURN 2] Received response_id: %s", response_id_2)
    logger.info("[TURN 2] Model response: %s", output_2)

    assert response_id_2 is not None, "Second response should have an id"
    assert response_id_2 != response_id_1, "Second response should have different id"

    response_lower = output_2.lower()

    # Only turn-1 facts count as evidence. Generic markers such as "2",
    # "first" or "second" also appear in a numbered list produced without any
    # history, so they would let the test pass when state was lost.
    turn_one_facts = ("alice", "pizza")
    has_conversation_context = any(fact in response_lower for fact in turn_one_facts)

    logger.info(
        "\n[VALIDATION] Conversation context preserved: %s", has_conversation_context
    )

    print(f"\n{'='*80}")
    print("Streaming Conversation State Test Results:")
    print(f"Turn 1 Response ID: {response_id_1}")
    print(f"Turn 2 Response ID: {response_id_2}")
    print(f"Turn 1 Output: {output_1[:100]}...")
    print(f"Turn 2 Output: {output_2}")
    print(f"Conversation Context Preserved: {has_conversation_context}")
    print(f"{'='*80}\n")

    assert has_conversation_context, (
        "Model should have received conversation history. " f"Response: {output_2}"
    )
|