mirror of
https://github.com/katanemo/plano.git
synced 2026-06-05 14:45:15 +02:00
1436 lines
51 KiB
Python
1436 lines
51 KiB
Python
|
|
"""
|
||
|
|
Integration tests for retry-on-ratelimit feature (P0).
|
||
|
|
|
||
|
|
Tests IT-1 through IT-6, IT-12, IT-13 validate end-to-end retry behavior
|
||
|
|
through the real Plano gateway using Python mock HTTP servers as upstream providers.
|
||
|
|
|
||
|
|
Each test:
|
||
|
|
1. Starts mock upstream servers on ephemeral ports
|
||
|
|
2. Writes a YAML config pointing the gateway at those mock ports
|
||
|
|
3. Starts the gateway via `planoai up`
|
||
|
|
4. Sends requests and asserts on response status/body/timing
|
||
|
|
5. Tears down the gateway via `planoai down`
|
||
|
|
"""
|
||
|
|
|
||
|
|
import json
|
||
|
|
import logging
|
||
|
|
import os
|
||
|
|
import subprocess
|
||
|
|
import sys
|
||
|
|
import tempfile
|
||
|
|
import threading
|
||
|
|
import time
|
||
|
|
from http.server import HTTPServer, BaseHTTPRequestHandler
|
||
|
|
from typing import Optional
|
||
|
|
|
||
|
|
import pytest
|
||
|
|
import requests
|
||
|
|
|
||
|
|
logging.basicConfig(
|
||
|
|
level=logging.INFO,
|
||
|
|
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
||
|
|
handlers=[logging.StreamHandler(sys.stdout)],
|
||
|
|
)
|
||
|
|
logger = logging.getLogger(__name__)
|
||
|
|
|
||
|
|
GATEWAY_BASE_URL = "http://localhost:12000"
|
||
|
|
GATEWAY_CHAT_URL = f"{GATEWAY_BASE_URL}/v1/chat/completions"
|
||
|
|
CONFIGS_DIR = os.path.join(os.path.dirname(__file__), "configs")
|
||
|
|
|
||
|
|
# Standard OpenAI-compatible success response body
|
||
|
|
SUCCESS_RESPONSE = json.dumps({
|
||
|
|
"id": "chatcmpl-test-001",
|
||
|
|
"object": "chat.completion",
|
||
|
|
"created": 1700000000,
|
||
|
|
"model": "mock-model",
|
||
|
|
"choices": [
|
||
|
|
{
|
||
|
|
"index": 0,
|
||
|
|
"message": {
|
||
|
|
"role": "assistant",
|
||
|
|
"content": "Hello from mock provider!",
|
||
|
|
},
|
||
|
|
"finish_reason": "stop",
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15},
|
||
|
|
})
|
||
|
|
|
||
|
|
# Standard chat request body
|
||
|
|
CHAT_REQUEST_BODY = {
|
||
|
|
"model": "openai/gpt-4o",
|
||
|
|
"messages": [{"role": "user", "content": "Hello"}],
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Mock upstream server infrastructure
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
class MockUpstreamHandler(BaseHTTPRequestHandler):
|
||
|
|
"""
|
||
|
|
Configurable mock HTTP handler that returns responses from a per-server queue.
|
||
|
|
|
||
|
|
Each server instance has a response_queue (list of tuples):
|
||
|
|
(status_code, headers_dict, body_string)
|
||
|
|
|
||
|
|
Responses are consumed in order. When the queue is exhausted, the last
|
||
|
|
response is repeated. The handler also records all received requests for
|
||
|
|
later assertion.
|
||
|
|
"""
|
||
|
|
|
||
|
|
# These are set per-server-instance via the factory function below.
|
||
|
|
response_queue: list = []
|
||
|
|
received_requests: list = []
|
||
|
|
call_count: int = 0
|
||
|
|
lock: threading.Lock = threading.Lock()
|
||
|
|
|
||
|
|
def do_POST(self):
|
||
|
|
content_length = int(self.headers.get("Content-Length", 0))
|
||
|
|
body = self.rfile.read(content_length) if content_length > 0 else b""
|
||
|
|
|
||
|
|
with self.__class__.lock:
|
||
|
|
self.__class__.call_count += 1
|
||
|
|
self.__class__.received_requests.append({
|
||
|
|
"path": self.path,
|
||
|
|
"headers": dict(self.headers),
|
||
|
|
"body": body.decode("utf-8", errors="replace"),
|
||
|
|
})
|
||
|
|
idx = min(
|
||
|
|
self.__class__.call_count - 1,
|
||
|
|
len(self.__class__.response_queue) - 1,
|
||
|
|
)
|
||
|
|
status_code, headers, response_body = self.__class__.response_queue[idx]
|
||
|
|
|
||
|
|
self.send_response(status_code)
|
||
|
|
for key, value in headers.items():
|
||
|
|
self.send_header(key, value)
|
||
|
|
self.send_header("Content-Type", "application/json")
|
||
|
|
self.end_headers()
|
||
|
|
if isinstance(response_body, str):
|
||
|
|
response_body = response_body.encode("utf-8")
|
||
|
|
self.wfile.write(response_body)
|
||
|
|
|
||
|
|
def do_GET(self):
|
||
|
|
"""Handle health checks or other GET requests."""
|
||
|
|
self.send_response(200)
|
||
|
|
self.send_header("Content-Type", "application/json")
|
||
|
|
self.end_headers()
|
||
|
|
self.wfile.write(b'{"status": "ok"}')
|
||
|
|
|
||
|
|
def log_message(self, format, *args):
|
||
|
|
"""Suppress default request logging to reduce noise."""
|
||
|
|
pass
|
||
|
|
|
||
|
|
|
||
|
|
def create_mock_handler_class(response_queue: list) -> type:
|
||
|
|
"""
|
||
|
|
Create a new handler class with its own response queue and state.
|
||
|
|
This avoids shared state between different mock servers.
|
||
|
|
"""
|
||
|
|
class Handler(MockUpstreamHandler):
|
||
|
|
pass
|
||
|
|
|
||
|
|
Handler.response_queue = list(response_queue)
|
||
|
|
Handler.received_requests = []
|
||
|
|
Handler.call_count = 0
|
||
|
|
Handler.lock = threading.Lock()
|
||
|
|
return Handler
|
||
|
|
|
||
|
|
|
||
|
|
class MockServer:
|
||
|
|
"""Manages a mock HTTP server running in a background thread."""
|
||
|
|
|
||
|
|
def __init__(self, response_queue: list):
|
||
|
|
self.handler_class = create_mock_handler_class(response_queue)
|
||
|
|
self.server = HTTPServer(("0.0.0.0", 0), self.handler_class)
|
||
|
|
self.port = self.server.server_address[1]
|
||
|
|
self.thread = threading.Thread(target=self.server.serve_forever, daemon=True)
|
||
|
|
|
||
|
|
def start(self):
|
||
|
|
self.thread.start()
|
||
|
|
logger.info(f"Mock server started on port {self.port}")
|
||
|
|
|
||
|
|
def stop(self):
|
||
|
|
self.server.shutdown()
|
||
|
|
self.thread.join(timeout=5)
|
||
|
|
logger.info(f"Mock server stopped on port {self.port}")
|
||
|
|
|
||
|
|
@property
|
||
|
|
def call_count(self) -> int:
|
||
|
|
return self.handler_class.call_count
|
||
|
|
|
||
|
|
@property
|
||
|
|
def received_requests(self) -> list:
|
||
|
|
return self.handler_class.received_requests
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Gateway lifecycle helpers
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
def write_config(template_name: str, substitutions: dict) -> str:
|
||
|
|
"""
|
||
|
|
Read a config template from configs/ dir, apply port substitutions,
|
||
|
|
and write to a temp file. Returns the path to the temp config file.
|
||
|
|
"""
|
||
|
|
template_path = os.path.join(CONFIGS_DIR, template_name)
|
||
|
|
with open(template_path, "r") as f:
|
||
|
|
content = f.read()
|
||
|
|
|
||
|
|
for key, value in substitutions.items():
|
||
|
|
content = content.replace(f"${{{key}}}", str(value))
|
||
|
|
|
||
|
|
# Write to a temp file in the e2e directory so planoai can find it
|
||
|
|
fd, config_path = tempfile.mkstemp(suffix=".yaml", prefix="retry_test_")
|
||
|
|
with os.fdopen(fd, "w") as f:
|
||
|
|
f.write(content)
|
||
|
|
|
||
|
|
logger.info(f"Wrote test config to {config_path}")
|
||
|
|
return config_path
|
||
|
|
|
||
|
|
|
||
|
|
def gateway_up(config_path: str, timeout: int = 30):
|
||
|
|
"""Start the Plano gateway with the given config. Waits for health."""
|
||
|
|
logger.info(f"Starting gateway with config: {config_path}")
|
||
|
|
subprocess.run(
|
||
|
|
["planoai", "down", "--docker"],
|
||
|
|
capture_output=True,
|
||
|
|
timeout=30,
|
||
|
|
)
|
||
|
|
result = subprocess.run(
|
||
|
|
["planoai", "up", "--docker", config_path],
|
||
|
|
capture_output=True,
|
||
|
|
text=True,
|
||
|
|
timeout=60,
|
||
|
|
)
|
||
|
|
if result.returncode != 0:
|
||
|
|
logger.error(f"planoai up failed: {result.stderr}")
|
||
|
|
raise RuntimeError(f"planoai up failed: {result.stderr}")
|
||
|
|
|
||
|
|
# Wait for gateway to be healthy
|
||
|
|
start = time.time()
|
||
|
|
while time.time() - start < timeout:
|
||
|
|
try:
|
||
|
|
resp = requests.get(f"{GATEWAY_BASE_URL}/healthz", timeout=2)
|
||
|
|
if resp.status_code == 200:
|
||
|
|
logger.info("Gateway is healthy")
|
||
|
|
return
|
||
|
|
except requests.ConnectionError:
|
||
|
|
pass
|
||
|
|
time.sleep(1)
|
||
|
|
|
||
|
|
raise RuntimeError(f"Gateway did not become healthy within {timeout}s")
|
||
|
|
|
||
|
|
|
||
|
|
def gateway_down():
|
||
|
|
"""Stop the Plano gateway."""
|
||
|
|
logger.info("Stopping gateway")
|
||
|
|
subprocess.run(
|
||
|
|
["planoai", "down", "--docker"],
|
||
|
|
capture_output=True,
|
||
|
|
timeout=30,
|
||
|
|
)
|
||
|
|
|
||
|
|
|
||
|
|
def make_error_response(status_code: int, message: str = "error") -> str:
|
||
|
|
"""Create a JSON error response body."""
|
||
|
|
return json.dumps({
|
||
|
|
"error": {
|
||
|
|
"message": message,
|
||
|
|
"type": "server_error",
|
||
|
|
"code": str(status_code),
|
||
|
|
}
|
||
|
|
})
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Streaming helpers
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
STREAMING_SUCCESS_CHUNKS = [
|
||
|
|
'data: {"id":"chatcmpl-stream-001","object":"chat.completion.chunk","created":1700000000,"model":"mock-model","choices":[{"index":0,"delta":{"role":"assistant","content":"Hello"},"finish_reason":null}]}\n\n',
|
||
|
|
'data: {"id":"chatcmpl-stream-001","object":"chat.completion.chunk","created":1700000000,"model":"mock-model","choices":[{"index":0,"delta":{"content":" from"},"finish_reason":null}]}\n\n',
|
||
|
|
'data: {"id":"chatcmpl-stream-001","object":"chat.completion.chunk","created":1700000000,"model":"mock-model","choices":[{"index":0,"delta":{"content":" stream!"},"finish_reason":null}]}\n\n',
|
||
|
|
'data: {"id":"chatcmpl-stream-001","object":"chat.completion.chunk","created":1700000000,"model":"mock-model","choices":[{"index":0,"delta":{},"finish_reason":"stop"}]}\n\n',
|
||
|
|
"data: [DONE]\n\n",
|
||
|
|
]
|
||
|
|
|
||
|
|
|
||
|
|
class StreamingMockHandler(MockUpstreamHandler):
|
||
|
|
"""Handler that returns SSE streaming responses."""
|
||
|
|
pass
|
||
|
|
|
||
|
|
|
||
|
|
def create_streaming_handler_class(
|
||
|
|
response_queue: list,
|
||
|
|
streaming_chunks: Optional[list] = None,
|
||
|
|
) -> type:
|
||
|
|
"""
|
||
|
|
Create a handler class that can return streaming SSE responses.
|
||
|
|
|
||
|
|
response_queue entries can include a special "STREAM" body marker
|
||
|
|
to trigger streaming mode with the provided chunks.
|
||
|
|
"""
|
||
|
|
chunks = streaming_chunks or STREAMING_SUCCESS_CHUNKS
|
||
|
|
|
||
|
|
class Handler(StreamingMockHandler):
|
||
|
|
pass
|
||
|
|
|
||
|
|
Handler.response_queue = list(response_queue)
|
||
|
|
Handler.received_requests = []
|
||
|
|
Handler.call_count = 0
|
||
|
|
Handler.lock = threading.Lock()
|
||
|
|
|
||
|
|
original_do_post = Handler.do_POST
|
||
|
|
|
||
|
|
def streaming_do_post(self):
|
||
|
|
content_length = int(self.headers.get("Content-Length", 0))
|
||
|
|
body = self.rfile.read(content_length) if content_length > 0 else b""
|
||
|
|
|
||
|
|
with Handler.lock:
|
||
|
|
Handler.call_count += 1
|
||
|
|
Handler.received_requests.append({
|
||
|
|
"path": self.path,
|
||
|
|
"headers": dict(self.headers),
|
||
|
|
"body": body.decode("utf-8", errors="replace"),
|
||
|
|
})
|
||
|
|
idx = min(Handler.call_count - 1, len(Handler.response_queue) - 1)
|
||
|
|
status_code, headers, response_body = Handler.response_queue[idx]
|
||
|
|
|
||
|
|
if response_body == "STREAM":
|
||
|
|
self.send_response(status_code)
|
||
|
|
for key, value in headers.items():
|
||
|
|
self.send_header(key, value)
|
||
|
|
self.send_header("Content-Type", "text/event-stream")
|
||
|
|
self.send_header("Transfer-Encoding", "chunked")
|
||
|
|
self.end_headers()
|
||
|
|
for chunk in chunks:
|
||
|
|
self.wfile.write(chunk.encode("utf-8"))
|
||
|
|
self.wfile.flush()
|
||
|
|
time.sleep(0.05)
|
||
|
|
else:
|
||
|
|
self.send_response(status_code)
|
||
|
|
for key, value in headers.items():
|
||
|
|
self.send_header(key, value)
|
||
|
|
self.send_header("Content-Type", "application/json")
|
||
|
|
self.end_headers()
|
||
|
|
if isinstance(response_body, str):
|
||
|
|
response_body = response_body.encode("utf-8")
|
||
|
|
self.wfile.write(response_body)
|
||
|
|
|
||
|
|
Handler.do_POST = streaming_do_post
|
||
|
|
return Handler
|
||
|
|
|
||
|
|
|
||
|
|
class StreamingMockServer:
|
||
|
|
"""Mock server that supports streaming responses."""
|
||
|
|
|
||
|
|
def __init__(self, response_queue: list, streaming_chunks: Optional[list] = None):
|
||
|
|
self.handler_class = create_streaming_handler_class(
|
||
|
|
response_queue, streaming_chunks
|
||
|
|
)
|
||
|
|
self.server = HTTPServer(("0.0.0.0", 0), self.handler_class)
|
||
|
|
self.port = self.server.server_address[1]
|
||
|
|
self.thread = threading.Thread(target=self.server.serve_forever, daemon=True)
|
||
|
|
|
||
|
|
def start(self):
|
||
|
|
self.thread.start()
|
||
|
|
logger.info(f"Streaming mock server started on port {self.port}")
|
||
|
|
|
||
|
|
def stop(self):
|
||
|
|
self.server.shutdown()
|
||
|
|
self.thread.join(timeout=5)
|
||
|
|
|
||
|
|
@property
|
||
|
|
def call_count(self) -> int:
|
||
|
|
return self.handler_class.call_count
|
||
|
|
|
||
|
|
@property
|
||
|
|
def received_requests(self) -> list:
|
||
|
|
return self.handler_class.received_requests
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Body-echo handler for IT-13
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
def create_echo_handler_class(response_queue: list) -> type:
|
||
|
|
"""
|
||
|
|
Create a handler that echoes the received request body back in the
|
||
|
|
response, wrapped in a valid chat completion response.
|
||
|
|
The response_queue controls status codes — when the status is 200,
|
||
|
|
the handler echoes the body; otherwise it returns the queued response.
|
||
|
|
"""
|
||
|
|
|
||
|
|
class Handler(MockUpstreamHandler):
|
||
|
|
pass
|
||
|
|
|
||
|
|
Handler.response_queue = list(response_queue)
|
||
|
|
Handler.received_requests = []
|
||
|
|
Handler.call_count = 0
|
||
|
|
Handler.lock = threading.Lock()
|
||
|
|
|
||
|
|
def echo_do_post(self):
|
||
|
|
content_length = int(self.headers.get("Content-Length", 0))
|
||
|
|
body = self.rfile.read(content_length) if content_length > 0 else b""
|
||
|
|
|
||
|
|
with Handler.lock:
|
||
|
|
Handler.call_count += 1
|
||
|
|
Handler.received_requests.append({
|
||
|
|
"path": self.path,
|
||
|
|
"headers": dict(self.headers),
|
||
|
|
"body": body.decode("utf-8", errors="replace"),
|
||
|
|
})
|
||
|
|
idx = min(Handler.call_count - 1, len(Handler.response_queue) - 1)
|
||
|
|
status_code, headers, response_body = Handler.response_queue[idx]
|
||
|
|
|
||
|
|
if status_code == 200:
|
||
|
|
# Echo the received body inside a chat completion response
|
||
|
|
echo_response = json.dumps({
|
||
|
|
"id": "chatcmpl-echo-001",
|
||
|
|
"object": "chat.completion",
|
||
|
|
"created": 1700000000,
|
||
|
|
"model": "echo-model",
|
||
|
|
"choices": [
|
||
|
|
{
|
||
|
|
"index": 0,
|
||
|
|
"message": {
|
||
|
|
"role": "assistant",
|
||
|
|
"content": body.decode("utf-8", errors="replace"),
|
||
|
|
},
|
||
|
|
"finish_reason": "stop",
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"usage": {
|
||
|
|
"prompt_tokens": 10,
|
||
|
|
"completion_tokens": 5,
|
||
|
|
"total_tokens": 15,
|
||
|
|
},
|
||
|
|
})
|
||
|
|
self.send_response(200)
|
||
|
|
self.send_header("Content-Type", "application/json")
|
||
|
|
self.end_headers()
|
||
|
|
self.wfile.write(echo_response.encode("utf-8"))
|
||
|
|
else:
|
||
|
|
self.send_response(status_code)
|
||
|
|
for key, value in headers.items():
|
||
|
|
self.send_header(key, value)
|
||
|
|
self.send_header("Content-Type", "application/json")
|
||
|
|
self.end_headers()
|
||
|
|
if isinstance(response_body, str):
|
||
|
|
response_body = response_body.encode("utf-8")
|
||
|
|
self.wfile.write(response_body)
|
||
|
|
|
||
|
|
Handler.do_POST = echo_do_post
|
||
|
|
return Handler
|
||
|
|
|
||
|
|
|
||
|
|
class EchoMockServer:
|
||
|
|
"""Mock server that echoes request body on 200 responses."""
|
||
|
|
|
||
|
|
def __init__(self, response_queue: list):
|
||
|
|
self.handler_class = create_echo_handler_class(response_queue)
|
||
|
|
self.server = HTTPServer(("0.0.0.0", 0), self.handler_class)
|
||
|
|
self.port = self.server.server_address[1]
|
||
|
|
self.thread = threading.Thread(target=self.server.serve_forever, daemon=True)
|
||
|
|
|
||
|
|
def start(self):
|
||
|
|
self.thread.start()
|
||
|
|
logger.info(f"Echo mock server started on port {self.port}")
|
||
|
|
|
||
|
|
def stop(self):
|
||
|
|
self.server.shutdown()
|
||
|
|
self.thread.join(timeout=5)
|
||
|
|
|
||
|
|
@property
|
||
|
|
def call_count(self) -> int:
|
||
|
|
return self.handler_class.call_count
|
||
|
|
|
||
|
|
@property
|
||
|
|
def received_requests(self) -> list:
|
||
|
|
return self.handler_class.received_requests
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Delayed-response handler for IT-10 (timeout triggers retry)
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
def create_delayed_handler_class(response_queue: list, delay_seconds: float) -> type:
|
||
|
|
"""
|
||
|
|
Create a handler class that delays its response by *delay_seconds* before
|
||
|
|
sending the queued response. Used to simulate upstream timeouts.
|
||
|
|
"""
|
||
|
|
|
||
|
|
class Handler(MockUpstreamHandler):
|
||
|
|
pass
|
||
|
|
|
||
|
|
Handler.response_queue = list(response_queue)
|
||
|
|
Handler.received_requests = []
|
||
|
|
Handler.call_count = 0
|
||
|
|
Handler.lock = threading.Lock()
|
||
|
|
|
||
|
|
def delayed_do_post(self):
|
||
|
|
content_length = int(self.headers.get("Content-Length", 0))
|
||
|
|
body = self.rfile.read(content_length) if content_length > 0 else b""
|
||
|
|
|
||
|
|
with Handler.lock:
|
||
|
|
Handler.call_count += 1
|
||
|
|
Handler.received_requests.append({
|
||
|
|
"path": self.path,
|
||
|
|
"headers": dict(self.headers),
|
||
|
|
"body": body.decode("utf-8", errors="replace"),
|
||
|
|
})
|
||
|
|
idx = min(Handler.call_count - 1, len(Handler.response_queue) - 1)
|
||
|
|
status_code, headers, response_body = Handler.response_queue[idx]
|
||
|
|
|
||
|
|
# Delay before responding — gateway should time out before this completes
|
||
|
|
time.sleep(delay_seconds)
|
||
|
|
|
||
|
|
self.send_response(status_code)
|
||
|
|
for key, value in headers.items():
|
||
|
|
self.send_header(key, value)
|
||
|
|
self.send_header("Content-Type", "application/json")
|
||
|
|
self.end_headers()
|
||
|
|
if isinstance(response_body, str):
|
||
|
|
response_body = response_body.encode("utf-8")
|
||
|
|
self.wfile.write(response_body)
|
||
|
|
|
||
|
|
Handler.do_POST = delayed_do_post
|
||
|
|
return Handler
|
||
|
|
|
||
|
|
|
||
|
|
class DelayedMockServer:
|
||
|
|
"""Mock server that delays responses to simulate slow upstreams / timeouts."""
|
||
|
|
|
||
|
|
def __init__(self, response_queue: list, delay_seconds: float):
|
||
|
|
self.handler_class = create_delayed_handler_class(
|
||
|
|
response_queue, delay_seconds
|
||
|
|
)
|
||
|
|
self.server = HTTPServer(("0.0.0.0", 0), self.handler_class)
|
||
|
|
self.port = self.server.server_address[1]
|
||
|
|
self.thread = threading.Thread(target=self.server.serve_forever, daemon=True)
|
||
|
|
|
||
|
|
def start(self):
|
||
|
|
self.thread.start()
|
||
|
|
logger.info(f"Delayed mock server started on port {self.port} ")
|
||
|
|
|
||
|
|
def stop(self):
|
||
|
|
self.server.shutdown()
|
||
|
|
self.thread.join(timeout=5)
|
||
|
|
|
||
|
|
@property
|
||
|
|
def call_count(self) -> int:
|
||
|
|
return self.handler_class.call_count
|
||
|
|
|
||
|
|
@property
|
||
|
|
def received_requests(self) -> list:
|
||
|
|
return self.handler_class.received_requests
|
||
|
|
|
||
|
|
|
||
|
|
# ===========================================================================
|
||
|
|
# Integration Tests
|
||
|
|
# ===========================================================================
|
||
|
|
|
||
|
|
|
||
|
|
class TestRetryIntegration:
|
||
|
|
"""
|
||
|
|
P0 integration tests for retry-on-ratelimit feature.
|
||
|
|
|
||
|
|
These tests require the full gateway infrastructure (Docker, planoai CLI).
|
||
|
|
Each test starts mock servers, configures the gateway, sends requests,
|
||
|
|
and validates retry behavior end-to-end.
|
||
|
|
"""
|
||
|
|
|
||
|
|
def test_it1_basic_retry_on_429(self):
|
||
|
|
"""
|
||
|
|
IT-1: Basic retry on 429.
|
||
|
|
|
||
|
|
Primary mock returns 429, secondary returns 200.
|
||
|
|
Assert client gets 200 from the secondary provider.
|
||
|
|
"""
|
||
|
|
# Setup mock servers
|
||
|
|
primary = MockServer([
|
||
|
|
(429, {}, make_error_response(429, "Rate limit exceeded")),
|
||
|
|
])
|
||
|
|
secondary = MockServer([
|
||
|
|
(200, {}, SUCCESS_RESPONSE),
|
||
|
|
])
|
||
|
|
primary.start()
|
||
|
|
secondary.start()
|
||
|
|
config_path = None
|
||
|
|
|
||
|
|
try:
|
||
|
|
# Write config with actual ports
|
||
|
|
config_path = write_config("retry_it1_basic_429.yaml", {
|
||
|
|
"MOCK_PRIMARY_PORT": primary.port,
|
||
|
|
"MOCK_SECONDARY_PORT": secondary.port,
|
||
|
|
})
|
||
|
|
|
||
|
|
# Start gateway
|
||
|
|
gateway_up(config_path)
|
||
|
|
|
||
|
|
# Send request
|
||
|
|
resp = requests.post(
|
||
|
|
GATEWAY_CHAT_URL,
|
||
|
|
json=CHAT_REQUEST_BODY,
|
||
|
|
headers={"Authorization": "Bearer test-key"},
|
||
|
|
timeout=30,
|
||
|
|
)
|
||
|
|
|
||
|
|
# Assert: client gets 200 from secondary
|
||
|
|
assert resp.status_code == 200, (
|
||
|
|
f"Expected 200 but got {resp.status_code}: {resp.text}"
|
||
|
|
)
|
||
|
|
body = resp.json()
|
||
|
|
assert "choices" in body
|
||
|
|
assert body["choices"][0]["message"]["content"] == "Hello from mock provider!"
|
||
|
|
|
||
|
|
# Assert: primary was called (got 429), secondary was called (returned 200)
|
||
|
|
assert primary.call_count >= 1, "Primary should have been called"
|
||
|
|
assert secondary.call_count >= 1, "Secondary should have been called"
|
||
|
|
|
||
|
|
finally:
|
||
|
|
gateway_down()
|
||
|
|
primary.stop()
|
||
|
|
secondary.stop()
|
||
|
|
if config_path and os.path.exists(config_path):
|
||
|
|
os.unlink(config_path)
|
||
|
|
|
||
|
|
def test_it2_retry_on_503_different_provider(self):
|
||
|
|
"""
|
||
|
|
IT-2: Retry on 503 with different_provider strategy.
|
||
|
|
|
||
|
|
Primary returns 503, secondary returns 200.
|
||
|
|
Assert client gets 200 from the secondary provider.
|
||
|
|
"""
|
||
|
|
primary = MockServer([
|
||
|
|
(503, {}, make_error_response(503, "Service Unavailable")),
|
||
|
|
])
|
||
|
|
secondary = MockServer([
|
||
|
|
(200, {}, SUCCESS_RESPONSE),
|
||
|
|
])
|
||
|
|
primary.start()
|
||
|
|
secondary.start()
|
||
|
|
config_path = None
|
||
|
|
|
||
|
|
try:
|
||
|
|
config_path = write_config("retry_it2_503_different_provider.yaml", {
|
||
|
|
"MOCK_PRIMARY_PORT": primary.port,
|
||
|
|
"MOCK_SECONDARY_PORT": secondary.port,
|
||
|
|
})
|
||
|
|
gateway_up(config_path)
|
||
|
|
|
||
|
|
resp = requests.post(
|
||
|
|
GATEWAY_CHAT_URL,
|
||
|
|
json=CHAT_REQUEST_BODY,
|
||
|
|
headers={"Authorization": "Bearer test-key"},
|
||
|
|
timeout=30,
|
||
|
|
)
|
||
|
|
|
||
|
|
assert resp.status_code == 200, (
|
||
|
|
f"Expected 200 but got {resp.status_code}: {resp.text}"
|
||
|
|
)
|
||
|
|
body = resp.json()
|
||
|
|
assert "choices" in body
|
||
|
|
assert primary.call_count >= 1
|
||
|
|
assert secondary.call_count >= 1
|
||
|
|
|
||
|
|
finally:
|
||
|
|
gateway_down()
|
||
|
|
primary.stop()
|
||
|
|
secondary.stop()
|
||
|
|
if config_path and os.path.exists(config_path):
|
||
|
|
os.unlink(config_path)
|
||
|
|
|
||
|
|
def test_it3_all_retries_exhausted(self):
|
||
|
|
"""
|
||
|
|
IT-3: All retries exhausted.
|
||
|
|
|
||
|
|
All mock providers return 429.
|
||
|
|
Assert client gets an error response with attempts list and total_attempts.
|
||
|
|
"""
|
||
|
|
primary = MockServer([
|
||
|
|
(429, {}, make_error_response(429, "Rate limit exceeded")),
|
||
|
|
])
|
||
|
|
secondary = MockServer([
|
||
|
|
(429, {}, make_error_response(429, "Rate limit exceeded")),
|
||
|
|
])
|
||
|
|
primary.start()
|
||
|
|
secondary.start()
|
||
|
|
config_path = None
|
||
|
|
|
||
|
|
try:
|
||
|
|
config_path = write_config("retry_it3_all_exhausted.yaml", {
|
||
|
|
"MOCK_PRIMARY_PORT": primary.port,
|
||
|
|
"MOCK_SECONDARY_PORT": secondary.port,
|
||
|
|
})
|
||
|
|
gateway_up(config_path)
|
||
|
|
|
||
|
|
resp = requests.post(
|
||
|
|
GATEWAY_CHAT_URL,
|
||
|
|
json=CHAT_REQUEST_BODY,
|
||
|
|
headers={"Authorization": "Bearer test-key"},
|
||
|
|
timeout=30,
|
||
|
|
)
|
||
|
|
|
||
|
|
# Should get an error response (429 or the gateway's retry_exhausted error)
|
||
|
|
assert resp.status_code >= 400, (
|
||
|
|
f"Expected error status but got {resp.status_code}"
|
||
|
|
)
|
||
|
|
body = resp.json()
|
||
|
|
|
||
|
|
# The error response should contain retry attempt details
|
||
|
|
error = body.get("error", {})
|
||
|
|
assert error.get("type") == "retry_exhausted", (
|
||
|
|
f"Expected retry_exhausted error type, got: {error}"
|
||
|
|
)
|
||
|
|
assert "attempts" in error, "Error should contain attempts list"
|
||
|
|
assert "total_attempts" in error, "Error should contain total_attempts"
|
||
|
|
assert error["total_attempts"] >= 2, (
|
||
|
|
f"Expected at least 2 total attempts, got {error['total_attempts']}"
|
||
|
|
)
|
||
|
|
|
||
|
|
finally:
|
||
|
|
gateway_down()
|
||
|
|
primary.stop()
|
||
|
|
secondary.stop()
|
||
|
|
if config_path and os.path.exists(config_path):
|
||
|
|
os.unlink(config_path)
|
||
|
|
|
||
|
|
def test_it4_no_retry_policy_no_retry(self):
|
||
|
|
"""
|
||
|
|
IT-4: No retry_policy → no retry.
|
||
|
|
|
||
|
|
Primary returns 429 with no retry_policy configured.
|
||
|
|
Assert client gets 429 directly (no retry to secondary).
|
||
|
|
"""
|
||
|
|
primary = MockServer([
|
||
|
|
(429, {}, make_error_response(429, "Rate limit exceeded")),
|
||
|
|
])
|
||
|
|
secondary = MockServer([
|
||
|
|
(200, {}, SUCCESS_RESPONSE),
|
||
|
|
])
|
||
|
|
primary.start()
|
||
|
|
secondary.start()
|
||
|
|
config_path = None
|
||
|
|
|
||
|
|
try:
|
||
|
|
config_path = write_config("retry_it4_no_retry_policy.yaml", {
|
||
|
|
"MOCK_PRIMARY_PORT": primary.port,
|
||
|
|
"MOCK_SECONDARY_PORT": secondary.port,
|
||
|
|
})
|
||
|
|
gateway_up(config_path)
|
||
|
|
|
||
|
|
resp = requests.post(
|
||
|
|
GATEWAY_CHAT_URL,
|
||
|
|
json=CHAT_REQUEST_BODY,
|
||
|
|
headers={"Authorization": "Bearer test-key"},
|
||
|
|
timeout=30,
|
||
|
|
)
|
||
|
|
|
||
|
|
# Should get 429 directly — no retry
|
||
|
|
assert resp.status_code == 429, (
|
||
|
|
f"Expected 429 but got {resp.status_code}: {resp.text}"
|
||
|
|
)
|
||
|
|
|
||
|
|
# Secondary should NOT have been called
|
||
|
|
assert secondary.call_count == 0, (
|
||
|
|
f"Secondary should not be called without retry_policy, "
|
||
|
|
f"but was called {secondary.call_count} times"
|
||
|
|
)
|
||
|
|
|
||
|
|
finally:
|
||
|
|
gateway_down()
|
||
|
|
primary.stop()
|
||
|
|
secondary.stop()
|
||
|
|
if config_path and os.path.exists(config_path):
|
||
|
|
os.unlink(config_path)
|
||
|
|
|
||
|
|
def test_it5_max_attempts_respected(self):
|
||
|
|
"""
|
||
|
|
IT-5: max_attempts respected.
|
||
|
|
|
||
|
|
Primary returns 429, max_attempts: 1.
|
||
|
|
Assert only 1 retry attempt is made, then error is returned.
|
||
|
|
The secondary also returns 429 to ensure we see the exhaustion.
|
||
|
|
"""
|
||
|
|
primary = MockServer([
|
||
|
|
(429, {}, make_error_response(429, "Rate limit exceeded")),
|
||
|
|
])
|
||
|
|
secondary = MockServer([
|
||
|
|
(429, {}, make_error_response(429, "Rate limit exceeded")),
|
||
|
|
])
|
||
|
|
tertiary = MockServer([
|
||
|
|
(200, {}, SUCCESS_RESPONSE),
|
||
|
|
])
|
||
|
|
primary.start()
|
||
|
|
secondary.start()
|
||
|
|
tertiary.start()
|
||
|
|
config_path = None
|
||
|
|
|
||
|
|
try:
|
||
|
|
config_path = write_config("retry_it5_max_attempts.yaml", {
|
||
|
|
"MOCK_PRIMARY_PORT": primary.port,
|
||
|
|
"MOCK_SECONDARY_PORT": secondary.port,
|
||
|
|
"MOCK_TERTIARY_PORT": tertiary.port,
|
||
|
|
})
|
||
|
|
gateway_up(config_path)
|
||
|
|
|
||
|
|
resp = requests.post(
|
||
|
|
GATEWAY_CHAT_URL,
|
||
|
|
json=CHAT_REQUEST_BODY,
|
||
|
|
headers={"Authorization": "Bearer test-key"},
|
||
|
|
timeout=30,
|
||
|
|
)
|
||
|
|
|
||
|
|
# With max_attempts: 1, only 1 retry should happen after the initial failure.
|
||
|
|
# Primary fails (429) → 1 retry to secondary (429) → exhausted.
|
||
|
|
# Tertiary should NOT be reached.
|
||
|
|
assert resp.status_code >= 400, (
|
||
|
|
f"Expected error status but got {resp.status_code}"
|
||
|
|
)
|
||
|
|
|
||
|
|
assert tertiary.call_count == 0, (
|
||
|
|
f"Tertiary should not be called with max_attempts=1, "
|
||
|
|
f"but was called {tertiary.call_count} times"
|
||
|
|
)
|
||
|
|
|
||
|
|
# Total calls: primary (1) + secondary (1 retry) = 2
|
||
|
|
total_calls = primary.call_count + secondary.call_count
|
||
|
|
assert total_calls <= 2, (
|
||
|
|
f"Expected at most 2 total calls (1 original + 1 retry), "
|
||
|
|
f"got {total_calls}"
|
||
|
|
)
|
||
|
|
|
||
|
|
finally:
|
||
|
|
gateway_down()
|
||
|
|
primary.stop()
|
||
|
|
secondary.stop()
|
||
|
|
tertiary.stop()
|
||
|
|
if config_path and os.path.exists(config_path):
|
||
|
|
os.unlink(config_path)
|
||
|
|
|
||
|
|
def test_it6_backoff_delay_observed(self):
|
||
|
|
"""
|
||
|
|
IT-6: Backoff delay observed.
|
||
|
|
|
||
|
|
Configure same_model strategy with backoff (base_ms: 500, jitter: false).
|
||
|
|
Primary returns 429 twice, then 200 on third attempt.
|
||
|
|
Assert total response time includes backoff delays.
|
||
|
|
|
||
|
|
With base_ms=500 and no jitter:
|
||
|
|
- Attempt 1: fail (429)
|
||
|
|
- Backoff: 500ms (500 * 2^0)
|
||
|
|
- Attempt 2: fail (429)
|
||
|
|
- Backoff: 1000ms (500 * 2^1)
|
||
|
|
- Attempt 3: success (200)
|
||
|
|
Total backoff >= 1500ms (500 + 1000)
|
||
|
|
"""
|
||
|
|
primary = MockServer([
|
||
|
|
(429, {}, make_error_response(429, "Rate limit exceeded")),
|
||
|
|
(429, {}, make_error_response(429, "Rate limit exceeded")),
|
||
|
|
(200, {}, SUCCESS_RESPONSE),
|
||
|
|
])
|
||
|
|
primary.start()
|
||
|
|
config_path = None
|
||
|
|
|
||
|
|
try:
|
||
|
|
config_path = write_config("retry_it6_backoff_delay.yaml", {
|
||
|
|
"MOCK_PRIMARY_PORT": primary.port,
|
||
|
|
})
|
||
|
|
gateway_up(config_path)
|
||
|
|
|
||
|
|
start_time = time.time()
|
||
|
|
resp = requests.post(
|
||
|
|
GATEWAY_CHAT_URL,
|
||
|
|
json=CHAT_REQUEST_BODY,
|
||
|
|
headers={"Authorization": "Bearer test-key"},
|
||
|
|
timeout=60,
|
||
|
|
)
|
||
|
|
elapsed = time.time() - start_time
|
||
|
|
|
||
|
|
assert resp.status_code == 200, (
|
||
|
|
f"Expected 200 but got {resp.status_code}: {resp.text}"
|
||
|
|
)
|
||
|
|
|
||
|
|
# With base_ms=500 and no jitter, backoff should be at least:
|
||
|
|
# 500ms (attempt 1→2) + 1000ms (attempt 2→3) = 1500ms
|
||
|
|
# Use a slightly lower threshold (1.0s) to account for timing variance
|
||
|
|
min_expected_delay = 1.0 # seconds
|
||
|
|
assert elapsed >= min_expected_delay, (
|
||
|
|
f"Expected response time >= {min_expected_delay}s due to backoff, "
|
||
|
|
f"but got {elapsed:.2f}s"
|
||
|
|
)
|
||
|
|
|
||
|
|
# Primary should have been called 3 times
|
||
|
|
assert primary.call_count == 3, (
|
||
|
|
f"Expected 3 calls to primary, got {primary.call_count}"
|
||
|
|
)
|
||
|
|
|
||
|
|
finally:
|
||
|
|
gateway_down()
|
||
|
|
primary.stop()
|
||
|
|
if config_path and os.path.exists(config_path):
|
||
|
|
os.unlink(config_path)
|
||
|
|
|
||
|
|
def test_it12_streaming_preserved_across_retry(self):
|
||
|
|
"""
|
||
|
|
IT-12: Streaming request preserved across retry.
|
||
|
|
|
||
|
|
Primary returns 429, secondary returns 200 with SSE streaming.
|
||
|
|
Assert client receives a streamed response.
|
||
|
|
"""
|
||
|
|
# Primary always returns 429
|
||
|
|
primary = MockServer([
|
||
|
|
(429, {}, make_error_response(429, "Rate limit exceeded")),
|
||
|
|
])
|
||
|
|
# Secondary returns streaming 200
|
||
|
|
secondary_handler = create_streaming_handler_class([
|
||
|
|
(200, {}, "STREAM"),
|
||
|
|
])
|
||
|
|
secondary_server = HTTPServer(("0.0.0.0", 0), secondary_handler)
|
||
|
|
secondary_port = secondary_server.server_address[1]
|
||
|
|
secondary_thread = threading.Thread(
|
||
|
|
target=secondary_server.serve_forever, daemon=True
|
||
|
|
)
|
||
|
|
|
||
|
|
primary.start()
|
||
|
|
secondary_thread.start()
|
||
|
|
logger.info(f"Streaming secondary mock started on port {secondary_port}")
|
||
|
|
config_path = None
|
||
|
|
|
||
|
|
try:
|
||
|
|
config_path = write_config("retry_it12_streaming.yaml", {
|
||
|
|
"MOCK_PRIMARY_PORT": primary.port,
|
||
|
|
"MOCK_SECONDARY_PORT": secondary_port,
|
||
|
|
})
|
||
|
|
gateway_up(config_path)
|
||
|
|
|
||
|
|
# Send a streaming request
|
||
|
|
streaming_body = dict(CHAT_REQUEST_BODY)
|
||
|
|
streaming_body["stream"] = True
|
||
|
|
|
||
|
|
resp = requests.post(
|
||
|
|
GATEWAY_CHAT_URL,
|
||
|
|
json=streaming_body,
|
||
|
|
headers={"Authorization": "Bearer test-key"},
|
||
|
|
stream=True,
|
||
|
|
timeout=30,
|
||
|
|
)
|
||
|
|
|
||
|
|
assert resp.status_code == 200, (
|
||
|
|
f"Expected 200 but got {resp.status_code}: {resp.text}"
|
||
|
|
)
|
||
|
|
|
||
|
|
# Collect streamed chunks
|
||
|
|
chunks = []
|
||
|
|
for line in resp.iter_lines(decode_unicode=True):
|
||
|
|
if line:
|
||
|
|
chunks.append(line)
|
||
|
|
|
||
|
|
# Should have received SSE data chunks
|
||
|
|
assert len(chunks) > 0, "Should have received streaming chunks"
|
||
|
|
|
||
|
|
# Verify at least one chunk contains "data:" prefix (SSE format)
|
||
|
|
data_chunks = [c for c in chunks if c.startswith("data:")]
|
||
|
|
assert len(data_chunks) > 0, (
|
||
|
|
f"Expected SSE data chunks, got: {chunks}"
|
||
|
|
)
|
||
|
|
|
||
|
|
# Verify the stream contains expected content
|
||
|
|
content_found = False
|
||
|
|
for chunk in data_chunks:
|
||
|
|
if chunk == "data: [DONE]":
|
||
|
|
continue
|
||
|
|
try:
|
||
|
|
payload = json.loads(chunk[len("data: "):])
|
||
|
|
delta = payload.get("choices", [{}])[0].get("delta", {})
|
||
|
|
if delta.get("content"):
|
||
|
|
content_found = True
|
||
|
|
except (json.JSONDecodeError, IndexError):
|
||
|
|
pass
|
||
|
|
|
||
|
|
assert content_found, "Should have received content in streaming chunks"
|
||
|
|
|
||
|
|
# Primary should have been called (got 429)
|
||
|
|
assert primary.call_count >= 1, "Primary should have been called"
|
||
|
|
|
||
|
|
finally:
|
||
|
|
gateway_down()
|
||
|
|
primary.stop()
|
||
|
|
secondary_server.shutdown()
|
||
|
|
secondary_thread.join(timeout=5)
|
||
|
|
if config_path and os.path.exists(config_path):
|
||
|
|
os.unlink(config_path)
|
||
|
|
|
||
|
|
def test_it13_request_body_preserved_across_retry(self):
|
||
|
|
"""
|
||
|
|
IT-13: Request body preserved across retry.
|
||
|
|
|
||
|
|
Primary returns 429, secondary echoes the request body.
|
||
|
|
Assert the echoed body matches the original request.
|
||
|
|
"""
|
||
|
|
primary = MockServer([
|
||
|
|
(429, {}, make_error_response(429, "Rate limit exceeded")),
|
||
|
|
])
|
||
|
|
# Secondary echoes the request body
|
||
|
|
echo_server = EchoMockServer([
|
||
|
|
(200, {}, ""), # Status 200 triggers echo behavior
|
||
|
|
])
|
||
|
|
|
||
|
|
primary.start()
|
||
|
|
echo_server.start()
|
||
|
|
config_path = None
|
||
|
|
|
||
|
|
try:
|
||
|
|
config_path = write_config("retry_it13_body_preserved.yaml", {
|
||
|
|
"MOCK_PRIMARY_PORT": primary.port,
|
||
|
|
"MOCK_SECONDARY_PORT": echo_server.port,
|
||
|
|
})
|
||
|
|
gateway_up(config_path)
|
||
|
|
|
||
|
|
# Send request with a distinctive body
|
||
|
|
request_body = {
|
||
|
|
"model": "openai/gpt-4o",
|
||
|
|
"messages": [
|
||
|
|
{"role": "system", "content": "You are a helpful assistant."},
|
||
|
|
{"role": "user", "content": "Tell me about retry mechanisms."},
|
||
|
|
],
|
||
|
|
"temperature": 0.7,
|
||
|
|
"max_tokens": 100,
|
||
|
|
}
|
||
|
|
|
||
|
|
resp = requests.post(
|
||
|
|
GATEWAY_CHAT_URL,
|
||
|
|
json=request_body,
|
||
|
|
headers={"Authorization": "Bearer test-key"},
|
||
|
|
timeout=30,
|
||
|
|
)
|
||
|
|
|
||
|
|
assert resp.status_code == 200, (
|
||
|
|
f"Expected 200 but got {resp.status_code}: {resp.text}"
|
||
|
|
)
|
||
|
|
|
||
|
|
# The echo server received the request body — verify it was preserved
|
||
|
|
assert echo_server.call_count >= 1, "Echo server should have been called"
|
||
|
|
|
||
|
|
# Parse the body that the echo server received
|
||
|
|
received_body_str = echo_server.received_requests[-1]["body"]
|
||
|
|
received_body = json.loads(received_body_str)
|
||
|
|
|
||
|
|
# The gateway may modify the model field when routing to a different
|
||
|
|
# provider, but the messages and other fields should be preserved
|
||
|
|
assert received_body.get("messages") is not None, (
|
||
|
|
"Messages should be preserved in the forwarded request"
|
||
|
|
)
|
||
|
|
|
||
|
|
# Verify the user message content is preserved
|
||
|
|
user_messages = [
|
||
|
|
m for m in received_body["messages"] if m.get("role") == "user"
|
||
|
|
]
|
||
|
|
assert len(user_messages) > 0, "User messages should be preserved"
|
||
|
|
assert user_messages[-1]["content"] == "Tell me about retry mechanisms.", (
|
||
|
|
f"User message content should be preserved, got: {user_messages[-1]}"
|
||
|
|
)
|
||
|
|
|
||
|
|
# Primary should have been called (got 429)
|
||
|
|
assert primary.call_count >= 1, "Primary should have been called"
|
||
|
|
|
||
|
|
finally:
|
||
|
|
gateway_down()
|
||
|
|
primary.stop()
|
||
|
|
echo_server.stop()
|
||
|
|
if config_path and os.path.exists(config_path):
|
||
|
|
os.unlink(config_path)
|
||
|
|
|
||
|
|
|
||
|
|
# -----------------------------------------------------------------------
|
||
|
|
# P1 Integration Tests (IT-7 through IT-10)
|
||
|
|
# -----------------------------------------------------------------------
|
||
|
|
|
||
|
|
def test_it7_fallback_models_priority(self):
|
||
|
|
"""
|
||
|
|
IT-7: Fallback models priority.
|
||
|
|
|
||
|
|
Primary mock returns 429, fallback[0] returns 429, fallback[1] returns 200.
|
||
|
|
Assert client gets 200 from fallback[1] and providers are tried in the
|
||
|
|
order defined by fallback_models.
|
||
|
|
|
||
|
|
Config: fallback_models: [anthropic/claude-3-5-sonnet, mistral/mistral-large]
|
||
|
|
"""
|
||
|
|
primary = MockServer([
|
||
|
|
(429, {}, make_error_response(429, "Rate limit exceeded")),
|
||
|
|
])
|
||
|
|
fallback1 = MockServer([
|
||
|
|
(429, {}, make_error_response(429, "Rate limit exceeded")),
|
||
|
|
])
|
||
|
|
fallback2 = MockServer([
|
||
|
|
(200, {}, SUCCESS_RESPONSE),
|
||
|
|
])
|
||
|
|
primary.start()
|
||
|
|
fallback1.start()
|
||
|
|
fallback2.start()
|
||
|
|
config_path = None
|
||
|
|
|
||
|
|
try:
|
||
|
|
config_path = write_config("retry_it7_fallback_priority.yaml", {
|
||
|
|
"MOCK_PRIMARY_PORT": primary.port,
|
||
|
|
"MOCK_FALLBACK1_PORT": fallback1.port,
|
||
|
|
"MOCK_FALLBACK2_PORT": fallback2.port,
|
||
|
|
})
|
||
|
|
gateway_up(config_path)
|
||
|
|
|
||
|
|
resp = requests.post(
|
||
|
|
GATEWAY_CHAT_URL,
|
||
|
|
json=CHAT_REQUEST_BODY,
|
||
|
|
headers={"Authorization": "Bearer test-key"},
|
||
|
|
timeout=30,
|
||
|
|
)
|
||
|
|
|
||
|
|
# Assert: client gets 200 from fallback[1]
|
||
|
|
assert resp.status_code == 200, (
|
||
|
|
f"Expected 200 but got {resp.status_code}: {resp.text}"
|
||
|
|
)
|
||
|
|
body = resp.json()
|
||
|
|
assert "choices" in body
|
||
|
|
assert body["choices"][0]["message"]["content"] == "Hello from mock provider!"
|
||
|
|
|
||
|
|
# Assert: providers tried in order — primary, fallback[0], fallback[1]
|
||
|
|
assert primary.call_count >= 1, "Primary should have been called first"
|
||
|
|
assert fallback1.call_count >= 1, (
|
||
|
|
"Fallback[0] (anthropic/claude-3-5-sonnet) should have been tried "
|
||
|
|
"before fallback[1]"
|
||
|
|
)
|
||
|
|
assert fallback2.call_count >= 1, (
|
||
|
|
"Fallback[1] (mistral/mistral-large) should have been called"
|
||
|
|
)
|
||
|
|
|
||
|
|
finally:
|
||
|
|
gateway_down()
|
||
|
|
primary.stop()
|
||
|
|
fallback1.stop()
|
||
|
|
fallback2.stop()
|
||
|
|
if config_path and os.path.exists(config_path):
|
||
|
|
os.unlink(config_path)
|
||
|
|
|
||
|
|
def test_it8_retry_after_header_honored(self):
|
||
|
|
"""
|
||
|
|
IT-8: Retry-After header honored.
|
||
|
|
|
||
|
|
Primary returns 429 + Retry-After: 2 on the first call, then 200 on the
|
||
|
|
second call (same_model strategy). Assert the total response time is
|
||
|
|
>= 2 seconds, proving the gateway waited for the Retry-After duration.
|
||
|
|
"""
|
||
|
|
primary = MockServer([
|
||
|
|
(429, {"Retry-After": "2"}, make_error_response(429, "Rate limit exceeded")),
|
||
|
|
(200, {}, SUCCESS_RESPONSE),
|
||
|
|
])
|
||
|
|
primary.start()
|
||
|
|
config_path = None
|
||
|
|
|
||
|
|
try:
|
||
|
|
config_path = write_config("retry_it8_retry_after_honored.yaml", {
|
||
|
|
"MOCK_PRIMARY_PORT": primary.port,
|
||
|
|
})
|
||
|
|
gateway_up(config_path)
|
||
|
|
|
||
|
|
start_time = time.time()
|
||
|
|
resp = requests.post(
|
||
|
|
GATEWAY_CHAT_URL,
|
||
|
|
json=CHAT_REQUEST_BODY,
|
||
|
|
headers={"Authorization": "Bearer test-key"},
|
||
|
|
timeout=30,
|
||
|
|
)
|
||
|
|
elapsed = time.time() - start_time
|
||
|
|
|
||
|
|
# Assert: client gets 200 after the retry
|
||
|
|
assert resp.status_code == 200, (
|
||
|
|
f"Expected 200 but got {resp.status_code}: {resp.text}"
|
||
|
|
)
|
||
|
|
body = resp.json()
|
||
|
|
assert "choices" in body
|
||
|
|
|
||
|
|
# Assert: total time >= 2 seconds (Retry-After: 2 was honored)
|
||
|
|
# Use a slightly lower threshold to account for timing variance
|
||
|
|
min_expected_delay = 1.8 # seconds
|
||
|
|
assert elapsed >= min_expected_delay, (
|
||
|
|
f"Expected response time >= {min_expected_delay}s due to "
|
||
|
|
f"Retry-After: 2, but got {elapsed:.2f}s"
|
||
|
|
)
|
||
|
|
|
||
|
|
# Primary should have been called twice (429 then 200)
|
||
|
|
assert primary.call_count == 2, (
|
||
|
|
f"Expected 2 calls to primary (429 + 200), got {primary.call_count}"
|
||
|
|
)
|
||
|
|
|
||
|
|
finally:
|
||
|
|
gateway_down()
|
||
|
|
primary.stop()
|
||
|
|
if config_path and os.path.exists(config_path):
|
||
|
|
os.unlink(config_path)
|
||
|
|
|
||
|
|
def test_it9_retry_after_blocks_initial_selection(self):
|
||
|
|
"""
|
||
|
|
IT-9: Retry-After blocks initial selection.
|
||
|
|
|
||
|
|
First request: primary returns 429 + Retry-After: 60 and the gateway
|
||
|
|
retries to the secondary (which returns 200).
|
||
|
|
|
||
|
|
Second request (sent within 60s): because the primary is globally
|
||
|
|
blocked by the Retry-After state, the gateway should route directly
|
||
|
|
to the alternative provider without hitting the primary again.
|
||
|
|
"""
|
||
|
|
# Primary: first call returns 429 + Retry-After: 60, subsequent calls
|
||
|
|
# return 200 (but should not be reached for the second request).
|
||
|
|
primary = MockServer([
|
||
|
|
(429, {"Retry-After": "60"}, make_error_response(429, "Rate limit exceeded")),
|
||
|
|
(200, {}, SUCCESS_RESPONSE),
|
||
|
|
(200, {}, SUCCESS_RESPONSE),
|
||
|
|
])
|
||
|
|
secondary = MockServer([
|
||
|
|
(200, {}, SUCCESS_RESPONSE),
|
||
|
|
(200, {}, SUCCESS_RESPONSE),
|
||
|
|
])
|
||
|
|
primary.start()
|
||
|
|
secondary.start()
|
||
|
|
config_path = None
|
||
|
|
|
||
|
|
try:
|
||
|
|
config_path = write_config(
|
||
|
|
"retry_it9_retry_after_blocks_selection.yaml",
|
||
|
|
{
|
||
|
|
"MOCK_PRIMARY_PORT": primary.port,
|
||
|
|
"MOCK_SECONDARY_PORT": secondary.port,
|
||
|
|
},
|
||
|
|
)
|
||
|
|
gateway_up(config_path)
|
||
|
|
|
||
|
|
# --- First request: triggers the Retry-After state ---
|
||
|
|
resp1 = requests.post(
|
||
|
|
GATEWAY_CHAT_URL,
|
||
|
|
json=CHAT_REQUEST_BODY,
|
||
|
|
headers={"Authorization": "Bearer test-key"},
|
||
|
|
timeout=30,
|
||
|
|
)
|
||
|
|
assert resp1.status_code == 200, (
|
||
|
|
f"First request: expected 200 but got {resp1.status_code}: {resp1.text}"
|
||
|
|
)
|
||
|
|
|
||
|
|
primary_calls_after_first = primary.call_count
|
||
|
|
secondary_calls_after_first = secondary.call_count
|
||
|
|
|
||
|
|
# Primary should have been called once (got 429), secondary once (got 200)
|
||
|
|
assert primary_calls_after_first >= 1, (
|
||
|
|
"Primary should have been called for the first request"
|
||
|
|
)
|
||
|
|
assert secondary_calls_after_first >= 1, (
|
||
|
|
"Secondary should have been called as fallback for the first request"
|
||
|
|
)
|
||
|
|
|
||
|
|
# --- Second request: within the 60s Retry-After window ---
|
||
|
|
# The primary model should be blocked globally, so the gateway
|
||
|
|
# should route to the alternative provider directly.
|
||
|
|
resp2 = requests.post(
|
||
|
|
GATEWAY_CHAT_URL,
|
||
|
|
json={
|
||
|
|
"model": "openai/gpt-4o",
|
||
|
|
"messages": [{"role": "user", "content": "Second request"}],
|
||
|
|
},
|
||
|
|
headers={"Authorization": "Bearer test-key"},
|
||
|
|
timeout=30,
|
||
|
|
)
|
||
|
|
assert resp2.status_code == 200, (
|
||
|
|
f"Second request: expected 200 but got {resp2.status_code}: {resp2.text}"
|
||
|
|
)
|
||
|
|
|
||
|
|
# Assert: primary was NOT called again for the second request
|
||
|
|
# (it should still be blocked by the 60s Retry-After)
|
||
|
|
assert primary.call_count == primary_calls_after_first, (
|
||
|
|
f"Primary should not have been called for the second request "
|
||
|
|
f"(blocked by Retry-After: 60). Calls before: "
|
||
|
|
f"{primary_calls_after_first}, after: {primary.call_count}"
|
||
|
|
)
|
||
|
|
|
||
|
|
# Assert: secondary handled the second request
|
||
|
|
assert secondary.call_count > secondary_calls_after_first, (
|
||
|
|
f"Secondary should have handled the second request. "
|
||
|
|
f"Calls before: {secondary_calls_after_first}, "
|
||
|
|
f"after: {secondary.call_count}"
|
||
|
|
)
|
||
|
|
|
||
|
|
finally:
|
||
|
|
gateway_down()
|
||
|
|
primary.stop()
|
||
|
|
secondary.stop()
|
||
|
|
if config_path and os.path.exists(config_path):
|
||
|
|
os.unlink(config_path)
|
||
|
|
|
||
|
|
def test_it10_timeout_triggers_retry(self):
|
||
|
|
"""
|
||
|
|
IT-10: Timeout triggers retry.
|
||
|
|
|
||
|
|
Primary mock delays its response beyond the gateway's request timeout.
|
||
|
|
Secondary returns 200 immediately.
|
||
|
|
Assert client gets 200 from the secondary provider.
|
||
|
|
"""
|
||
|
|
# Primary delays 120 seconds — well beyond any reasonable gateway timeout.
|
||
|
|
# The gateway should time out and retry to the secondary.
|
||
|
|
primary = DelayedMockServer(
|
||
|
|
response_queue=[
|
||
|
|
(200, {}, SUCCESS_RESPONSE),
|
||
|
|
],
|
||
|
|
delay_seconds=120,
|
||
|
|
)
|
||
|
|
secondary = MockServer([
|
||
|
|
(200, {}, SUCCESS_RESPONSE),
|
||
|
|
])
|
||
|
|
primary.start()
|
||
|
|
secondary.start()
|
||
|
|
config_path = None
|
||
|
|
|
||
|
|
try:
|
||
|
|
config_path = write_config("retry_it10_timeout_triggers_retry.yaml", {
|
||
|
|
"MOCK_PRIMARY_PORT": primary.port,
|
||
|
|
"MOCK_SECONDARY_PORT": secondary.port,
|
||
|
|
})
|
||
|
|
gateway_up(config_path)
|
||
|
|
|
||
|
|
resp = requests.post(
|
||
|
|
GATEWAY_CHAT_URL,
|
||
|
|
json=CHAT_REQUEST_BODY,
|
||
|
|
headers={"Authorization": "Bearer test-key"},
|
||
|
|
timeout=120,
|
||
|
|
)
|
||
|
|
|
||
|
|
# Assert: client gets 200 from the secondary
|
||
|
|
assert resp.status_code == 200, (
|
||
|
|
f"Expected 200 but got {resp.status_code}: {resp.text}"
|
||
|
|
)
|
||
|
|
body = resp.json()
|
||
|
|
assert "choices" in body
|
||
|
|
assert body["choices"][0]["message"]["content"] == "Hello from mock provider!"
|
||
|
|
|
||
|
|
# Assert: primary was called (timed out), secondary was called (returned 200)
|
||
|
|
assert primary.call_count >= 1, (
|
||
|
|
"Primary should have been called (and timed out)"
|
||
|
|
)
|
||
|
|
assert secondary.call_count >= 1, (
|
||
|
|
"Secondary should have been called after primary timed out"
|
||
|
|
)
|
||
|
|
|
||
|
|
finally:
|
||
|
|
gateway_down()
|
||
|
|
primary.stop()
|
||
|
|
secondary.stop()
|
||
|
|
if config_path and os.path.exists(config_path):
|
||
|
|
os.unlink(config_path)
|
||
|
|
|
||
|
|
def test_it11_high_latency_proactive_failover(self):
|
||
|
|
"""
|
||
|
|
IT-11: High latency proactive failover.
|
||
|
|
|
||
|
|
First request: primary mock delays response by ~1.5s (threshold_ms=1000
|
||
|
|
+ 500ms buffer) but completes with 200 OK. The client receives the slow
|
||
|
|
200 response (completed responses are always delivered). However, the
|
||
|
|
gateway records a Latency_Block_State for the primary model.
|
||
|
|
|
||
|
|
Second request: sent immediately after the first. Because the primary
|
||
|
|
is now latency-blocked (block_duration_seconds=60, min_triggers=1),
|
||
|
|
the gateway should route directly to the secondary provider.
|
||
|
|
|
||
|
|
Config: on_high_latency with min_triggers: 1, threshold_ms: 1000,
|
||
|
|
block_duration_seconds: 60, measure: "total", scope: "model",
|
||
|
|
apply_to: "global".
|
||
|
|
"""
|
||
|
|
# Primary: delays 1.5s (exceeds 1000ms threshold), returns 200.
|
||
|
|
# Queue two responses in case the primary is called twice (it shouldn't
|
||
|
|
# be for the second request, but we need a response ready just in case).
|
||
|
|
primary = DelayedMockServer(
|
||
|
|
response_queue=[
|
||
|
|
(200, {}, SUCCESS_RESPONSE),
|
||
|
|
(200, {}, SUCCESS_RESPONSE),
|
||
|
|
],
|
||
|
|
delay_seconds=1.5,
|
||
|
|
)
|
||
|
|
# Secondary: returns 200 immediately.
|
||
|
|
secondary = MockServer([
|
||
|
|
(200, {}, SUCCESS_RESPONSE),
|
||
|
|
(200, {}, SUCCESS_RESPONSE),
|
||
|
|
])
|
||
|
|
primary.start()
|
||
|
|
secondary.start()
|
||
|
|
config_path = None
|
||
|
|
|
||
|
|
try:
|
||
|
|
config_path = write_config(
|
||
|
|
"retry_it11_high_latency_failover.yaml",
|
||
|
|
{
|
||
|
|
"MOCK_PRIMARY_PORT": primary.port,
|
||
|
|
"MOCK_SECONDARY_PORT": secondary.port,
|
||
|
|
},
|
||
|
|
)
|
||
|
|
gateway_up(config_path)
|
||
|
|
|
||
|
|
# --- First request: triggers the latency block ---
|
||
|
|
# The primary will respond with 200 after ~1.5s delay.
|
||
|
|
# Since the response completes, the client gets the 200 back,
|
||
|
|
# but the gateway should record a Latency_Block_State entry.
|
||
|
|
resp1 = requests.post(
|
||
|
|
GATEWAY_CHAT_URL,
|
||
|
|
json=CHAT_REQUEST_BODY,
|
||
|
|
headers={"Authorization": "Bearer test-key"},
|
||
|
|
timeout=30,
|
||
|
|
)
|
||
|
|
assert resp1.status_code == 200, (
|
||
|
|
f"First request: expected 200 but got {resp1.status_code}: "
|
||
|
|
f"{resp1.text}"
|
||
|
|
)
|
||
|
|
|
||
|
|
primary_calls_after_first = primary.call_count
|
||
|
|
secondary_calls_after_first = secondary.call_count
|
||
|
|
|
||
|
|
# Primary should have been called once (slow 200).
|
||
|
|
assert primary_calls_after_first >= 1, (
|
||
|
|
"Primary should have been called for the first request"
|
||
|
|
)
|
||
|
|
|
||
|
|
# --- Second request: within the 60s latency block window ---
|
||
|
|
# The primary model should be latency-blocked globally, so the
|
||
|
|
# gateway should route to the secondary provider directly.
|
||
|
|
resp2 = requests.post(
|
||
|
|
GATEWAY_CHAT_URL,
|
||
|
|
json={
|
||
|
|
"model": "openai/gpt-4o",
|
||
|
|
"messages": [{"role": "user", "content": "Second request"}],
|
||
|
|
},
|
||
|
|
headers={"Authorization": "Bearer test-key"},
|
||
|
|
timeout=30,
|
||
|
|
)
|
||
|
|
assert resp2.status_code == 200, (
|
||
|
|
f"Second request: expected 200 but got {resp2.status_code}: "
|
||
|
|
f"{resp2.text}"
|
||
|
|
)
|
||
|
|
|
||
|
|
# Assert: primary was NOT called again for the second request
|
||
|
|
# (it should be latency-blocked for 60s after the slow first response).
|
||
|
|
assert primary.call_count == primary_calls_after_first, (
|
||
|
|
f"Primary should not have been called for the second request "
|
||
|
|
f"(latency-blocked for 60s). Calls before: "
|
||
|
|
f"{primary_calls_after_first}, after: {primary.call_count}"
|
||
|
|
)
|
||
|
|
|
||
|
|
# Assert: secondary handled the second request.
|
||
|
|
assert secondary.call_count > secondary_calls_after_first, (
|
||
|
|
f"Secondary should have handled the second request. "
|
||
|
|
f"Calls before: {secondary_calls_after_first}, "
|
||
|
|
f"after: {secondary.call_count}"
|
||
|
|
)
|
||
|
|
|
||
|
|
finally:
|
||
|
|
gateway_down()
|
||
|
|
primary.stop()
|
||
|
|
secondary.stop()
|
||
|
|
if config_path and os.path.exists(config_path):
|
||
|
|
os.unlink(config_path)
|