""" Integration tests for retry-on-ratelimit feature (P0). Tests IT-1 through IT-6, IT-12, IT-13 validate end-to-end retry behavior through the real Plano gateway using Python mock HTTP servers as upstream providers. Each test: 1. Starts mock upstream servers on ephemeral ports 2. Writes a YAML config pointing the gateway at those mock ports 3. Starts the gateway via `planoai up` 4. Sends requests and asserts on response status/body/timing 5. Tears down the gateway via `planoai down` """ import json import logging import os import subprocess import sys import tempfile import threading import time from http.server import HTTPServer, BaseHTTPRequestHandler from typing import Optional import pytest import requests logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", handlers=[logging.StreamHandler(sys.stdout)], ) logger = logging.getLogger(__name__) GATEWAY_BASE_URL = "http://localhost:12000" GATEWAY_CHAT_URL = f"{GATEWAY_BASE_URL}/v1/chat/completions" CONFIGS_DIR = os.path.join(os.path.dirname(__file__), "configs") # Standard OpenAI-compatible success response body SUCCESS_RESPONSE = json.dumps({ "id": "chatcmpl-test-001", "object": "chat.completion", "created": 1700000000, "model": "mock-model", "choices": [ { "index": 0, "message": { "role": "assistant", "content": "Hello from mock provider!", }, "finish_reason": "stop", } ], "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}, }) # Standard chat request body CHAT_REQUEST_BODY = { "model": "openai/gpt-4o", "messages": [{"role": "user", "content": "Hello"}], } # --------------------------------------------------------------------------- # Mock upstream server infrastructure # --------------------------------------------------------------------------- class MockUpstreamHandler(BaseHTTPRequestHandler): """ Configurable mock HTTP handler that returns responses from a per-server queue. Each server instance has a response_queue (list of tuples): (status_code, headers_dict, body_string) Responses are consumed in order. When the queue is exhausted, the last response is repeated. The handler also records all received requests for later assertion. """ # These are set per-server-instance via the factory function below. response_queue: list = [] received_requests: list = [] call_count: int = 0 lock: threading.Lock = threading.Lock() def do_POST(self): content_length = int(self.headers.get("Content-Length", 0)) body = self.rfile.read(content_length) if content_length > 0 else b"" with self.__class__.lock: self.__class__.call_count += 1 self.__class__.received_requests.append({ "path": self.path, "headers": dict(self.headers), "body": body.decode("utf-8", errors="replace"), }) idx = min( self.__class__.call_count - 1, len(self.__class__.response_queue) - 1, ) status_code, headers, response_body = self.__class__.response_queue[idx] self.send_response(status_code) for key, value in headers.items(): self.send_header(key, value) self.send_header("Content-Type", "application/json") self.end_headers() if isinstance(response_body, str): response_body = response_body.encode("utf-8") self.wfile.write(response_body) def do_GET(self): """Handle health checks or other GET requests.""" self.send_response(200) self.send_header("Content-Type", "application/json") self.end_headers() self.wfile.write(b'{"status": "ok"}') def log_message(self, format, *args): """Suppress default request logging to reduce noise.""" pass def create_mock_handler_class(response_queue: list) -> type: """ Create a new handler class with its own response queue and state. This avoids shared state between different mock servers. """ class Handler(MockUpstreamHandler): pass Handler.response_queue = list(response_queue) Handler.received_requests = [] Handler.call_count = 0 Handler.lock = threading.Lock() return Handler class MockServer: """Manages a mock HTTP server running in a background thread.""" def __init__(self, response_queue: list): self.handler_class = create_mock_handler_class(response_queue) self.server = HTTPServer(("0.0.0.0", 0), self.handler_class) self.port = self.server.server_address[1] self.thread = threading.Thread(target=self.server.serve_forever, daemon=True) def start(self): self.thread.start() logger.info(f"Mock server started on port {self.port}") def stop(self): self.server.shutdown() self.thread.join(timeout=5) logger.info(f"Mock server stopped on port {self.port}") @property def call_count(self) -> int: return self.handler_class.call_count @property def received_requests(self) -> list: return self.handler_class.received_requests # --------------------------------------------------------------------------- # Gateway lifecycle helpers # --------------------------------------------------------------------------- def write_config(template_name: str, substitutions: dict) -> str: """ Read a config template from configs/ dir, apply port substitutions, and write to a temp file. Returns the path to the temp config file. """ template_path = os.path.join(CONFIGS_DIR, template_name) with open(template_path, "r") as f: content = f.read() for key, value in substitutions.items(): content = content.replace(f"${{{key}}}", str(value)) # Write to a temp file in the e2e directory so planoai can find it fd, config_path = tempfile.mkstemp(suffix=".yaml", prefix="retry_test_") with os.fdopen(fd, "w") as f: f.write(content) logger.info(f"Wrote test config to {config_path}") return config_path def gateway_up(config_path: str, timeout: int = 30): """Start the Plano gateway with the given config. Waits for health.""" logger.info(f"Starting gateway with config: {config_path}") subprocess.run( ["planoai", "down", "--docker"], capture_output=True, timeout=30, ) result = subprocess.run( ["planoai", "up", "--docker", config_path], capture_output=True, text=True, timeout=60, ) if result.returncode != 0: logger.error(f"planoai up failed: {result.stderr}") raise RuntimeError(f"planoai up failed: {result.stderr}") # Wait for gateway to be healthy start = time.time() while time.time() - start < timeout: try: resp = requests.get(f"{GATEWAY_BASE_URL}/healthz", timeout=2) if resp.status_code == 200: logger.info("Gateway is healthy") return except requests.ConnectionError: pass time.sleep(1) raise RuntimeError(f"Gateway did not become healthy within {timeout}s") def gateway_down(): """Stop the Plano gateway.""" logger.info("Stopping gateway") subprocess.run( ["planoai", "down", "--docker"], capture_output=True, timeout=30, ) def make_error_response(status_code: int, message: str = "error") -> str: """Create a JSON error response body.""" return json.dumps({ "error": { "message": message, "type": "server_error", "code": str(status_code), } }) # --------------------------------------------------------------------------- # Streaming helpers # --------------------------------------------------------------------------- STREAMING_SUCCESS_CHUNKS = [ 'data: {"id":"chatcmpl-stream-001","object":"chat.completion.chunk","created":1700000000,"model":"mock-model","choices":[{"index":0,"delta":{"role":"assistant","content":"Hello"},"finish_reason":null}]}\n\n', 'data: {"id":"chatcmpl-stream-001","object":"chat.completion.chunk","created":1700000000,"model":"mock-model","choices":[{"index":0,"delta":{"content":" from"},"finish_reason":null}]}\n\n', 'data: {"id":"chatcmpl-stream-001","object":"chat.completion.chunk","created":1700000000,"model":"mock-model","choices":[{"index":0,"delta":{"content":" stream!"},"finish_reason":null}]}\n\n', 'data: {"id":"chatcmpl-stream-001","object":"chat.completion.chunk","created":1700000000,"model":"mock-model","choices":[{"index":0,"delta":{},"finish_reason":"stop"}]}\n\n', "data: [DONE]\n\n", ] class StreamingMockHandler(MockUpstreamHandler): """Handler that returns SSE streaming responses.""" pass def create_streaming_handler_class( response_queue: list, streaming_chunks: Optional[list] = None, ) -> type: """ Create a handler class that can return streaming SSE responses. response_queue entries can include a special "STREAM" body marker to trigger streaming mode with the provided chunks. """ chunks = streaming_chunks or STREAMING_SUCCESS_CHUNKS class Handler(StreamingMockHandler): pass Handler.response_queue = list(response_queue) Handler.received_requests = [] Handler.call_count = 0 Handler.lock = threading.Lock() original_do_post = Handler.do_POST def streaming_do_post(self): content_length = int(self.headers.get("Content-Length", 0)) body = self.rfile.read(content_length) if content_length > 0 else b"" with Handler.lock: Handler.call_count += 1 Handler.received_requests.append({ "path": self.path, "headers": dict(self.headers), "body": body.decode("utf-8", errors="replace"), }) idx = min(Handler.call_count - 1, len(Handler.response_queue) - 1) status_code, headers, response_body = Handler.response_queue[idx] if response_body == "STREAM": self.send_response(status_code) for key, value in headers.items(): self.send_header(key, value) self.send_header("Content-Type", "text/event-stream") self.send_header("Transfer-Encoding", "chunked") self.end_headers() for chunk in chunks: self.wfile.write(chunk.encode("utf-8")) self.wfile.flush() time.sleep(0.05) else: self.send_response(status_code) for key, value in headers.items(): self.send_header(key, value) self.send_header("Content-Type", "application/json") self.end_headers() if isinstance(response_body, str): response_body = response_body.encode("utf-8") self.wfile.write(response_body) Handler.do_POST = streaming_do_post return Handler class StreamingMockServer: """Mock server that supports streaming responses.""" def __init__(self, response_queue: list, streaming_chunks: Optional[list] = None): self.handler_class = create_streaming_handler_class( response_queue, streaming_chunks ) self.server = HTTPServer(("0.0.0.0", 0), self.handler_class) self.port = self.server.server_address[1] self.thread = threading.Thread(target=self.server.serve_forever, daemon=True) def start(self): self.thread.start() logger.info(f"Streaming mock server started on port {self.port}") def stop(self): self.server.shutdown() self.thread.join(timeout=5) @property def call_count(self) -> int: return self.handler_class.call_count @property def received_requests(self) -> list: return self.handler_class.received_requests # --------------------------------------------------------------------------- # Body-echo handler for IT-13 # --------------------------------------------------------------------------- def create_echo_handler_class(response_queue: list) -> type: """ Create a handler that echoes the received request body back in the response, wrapped in a valid chat completion response. The response_queue controls status codes — when the status is 200, the handler echoes the body; otherwise it returns the queued response. """ class Handler(MockUpstreamHandler): pass Handler.response_queue = list(response_queue) Handler.received_requests = [] Handler.call_count = 0 Handler.lock = threading.Lock() def echo_do_post(self): content_length = int(self.headers.get("Content-Length", 0)) body = self.rfile.read(content_length) if content_length > 0 else b"" with Handler.lock: Handler.call_count += 1 Handler.received_requests.append({ "path": self.path, "headers": dict(self.headers), "body": body.decode("utf-8", errors="replace"), }) idx = min(Handler.call_count - 1, len(Handler.response_queue) - 1) status_code, headers, response_body = Handler.response_queue[idx] if status_code == 200: # Echo the received body inside a chat completion response echo_response = json.dumps({ "id": "chatcmpl-echo-001", "object": "chat.completion", "created": 1700000000, "model": "echo-model", "choices": [ { "index": 0, "message": { "role": "assistant", "content": body.decode("utf-8", errors="replace"), }, "finish_reason": "stop", } ], "usage": { "prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15, }, }) self.send_response(200) self.send_header("Content-Type", "application/json") self.end_headers() self.wfile.write(echo_response.encode("utf-8")) else: self.send_response(status_code) for key, value in headers.items(): self.send_header(key, value) self.send_header("Content-Type", "application/json") self.end_headers() if isinstance(response_body, str): response_body = response_body.encode("utf-8") self.wfile.write(response_body) Handler.do_POST = echo_do_post return Handler class EchoMockServer: """Mock server that echoes request body on 200 responses.""" def __init__(self, response_queue: list): self.handler_class = create_echo_handler_class(response_queue) self.server = HTTPServer(("0.0.0.0", 0), self.handler_class) self.port = self.server.server_address[1] self.thread = threading.Thread(target=self.server.serve_forever, daemon=True) def start(self): self.thread.start() logger.info(f"Echo mock server started on port {self.port}") def stop(self): self.server.shutdown() self.thread.join(timeout=5) @property def call_count(self) -> int: return self.handler_class.call_count @property def received_requests(self) -> list: return self.handler_class.received_requests # --------------------------------------------------------------------------- # Delayed-response handler for IT-10 (timeout triggers retry) # --------------------------------------------------------------------------- def create_delayed_handler_class(response_queue: list, delay_seconds: float) -> type: """ Create a handler class that delays its response by *delay_seconds* before sending the queued response. Used to simulate upstream timeouts. """ class Handler(MockUpstreamHandler): pass Handler.response_queue = list(response_queue) Handler.received_requests = [] Handler.call_count = 0 Handler.lock = threading.Lock() def delayed_do_post(self): content_length = int(self.headers.get("Content-Length", 0)) body = self.rfile.read(content_length) if content_length > 0 else b"" with Handler.lock: Handler.call_count += 1 Handler.received_requests.append({ "path": self.path, "headers": dict(self.headers), "body": body.decode("utf-8", errors="replace"), }) idx = min(Handler.call_count - 1, len(Handler.response_queue) - 1) status_code, headers, response_body = Handler.response_queue[idx] # Delay before responding — gateway should time out before this completes time.sleep(delay_seconds) self.send_response(status_code) for key, value in headers.items(): self.send_header(key, value) self.send_header("Content-Type", "application/json") self.end_headers() if isinstance(response_body, str): response_body = response_body.encode("utf-8") self.wfile.write(response_body) Handler.do_POST = delayed_do_post return Handler class DelayedMockServer: """Mock server that delays responses to simulate slow upstreams / timeouts.""" def __init__(self, response_queue: list, delay_seconds: float): self.handler_class = create_delayed_handler_class( response_queue, delay_seconds ) self.server = HTTPServer(("0.0.0.0", 0), self.handler_class) self.port = self.server.server_address[1] self.thread = threading.Thread(target=self.server.serve_forever, daemon=True) def start(self): self.thread.start() logger.info(f"Delayed mock server started on port {self.port} ") def stop(self): self.server.shutdown() self.thread.join(timeout=5) @property def call_count(self) -> int: return self.handler_class.call_count @property def received_requests(self) -> list: return self.handler_class.received_requests # =========================================================================== # Integration Tests # =========================================================================== class TestRetryIntegration: """ P0 integration tests for retry-on-ratelimit feature. These tests require the full gateway infrastructure (Docker, planoai CLI). Each test starts mock servers, configures the gateway, sends requests, and validates retry behavior end-to-end. """ def test_it1_basic_retry_on_429(self): """ IT-1: Basic retry on 429. Primary mock returns 429, secondary returns 200. Assert client gets 200 from the secondary provider. """ # Setup mock servers primary = MockServer([ (429, {}, make_error_response(429, "Rate limit exceeded")), ]) secondary = MockServer([ (200, {}, SUCCESS_RESPONSE), ]) primary.start() secondary.start() config_path = None try: # Write config with actual ports config_path = write_config("retry_it1_basic_429.yaml", { "MOCK_PRIMARY_PORT": primary.port, "MOCK_SECONDARY_PORT": secondary.port, }) # Start gateway gateway_up(config_path) # Send request resp = requests.post( GATEWAY_CHAT_URL, json=CHAT_REQUEST_BODY, headers={"Authorization": "Bearer test-key"}, timeout=30, ) # Assert: client gets 200 from secondary assert resp.status_code == 200, ( f"Expected 200 but got {resp.status_code}: {resp.text}" ) body = resp.json() assert "choices" in body assert body["choices"][0]["message"]["content"] == "Hello from mock provider!" # Assert: primary was called (got 429), secondary was called (returned 200) assert primary.call_count >= 1, "Primary should have been called" assert secondary.call_count >= 1, "Secondary should have been called" finally: gateway_down() primary.stop() secondary.stop() if config_path and os.path.exists(config_path): os.unlink(config_path) def test_it2_retry_on_503_different_provider(self): """ IT-2: Retry on 503 with different_provider strategy. Primary returns 503, secondary returns 200. Assert client gets 200 from the secondary provider. """ primary = MockServer([ (503, {}, make_error_response(503, "Service Unavailable")), ]) secondary = MockServer([ (200, {}, SUCCESS_RESPONSE), ]) primary.start() secondary.start() config_path = None try: config_path = write_config("retry_it2_503_different_provider.yaml", { "MOCK_PRIMARY_PORT": primary.port, "MOCK_SECONDARY_PORT": secondary.port, }) gateway_up(config_path) resp = requests.post( GATEWAY_CHAT_URL, json=CHAT_REQUEST_BODY, headers={"Authorization": "Bearer test-key"}, timeout=30, ) assert resp.status_code == 200, ( f"Expected 200 but got {resp.status_code}: {resp.text}" ) body = resp.json() assert "choices" in body assert primary.call_count >= 1 assert secondary.call_count >= 1 finally: gateway_down() primary.stop() secondary.stop() if config_path and os.path.exists(config_path): os.unlink(config_path) def test_it3_all_retries_exhausted(self): """ IT-3: All retries exhausted. All mock providers return 429. Assert client gets an error response with attempts list and total_attempts. """ primary = MockServer([ (429, {}, make_error_response(429, "Rate limit exceeded")), ]) secondary = MockServer([ (429, {}, make_error_response(429, "Rate limit exceeded")), ]) primary.start() secondary.start() config_path = None try: config_path = write_config("retry_it3_all_exhausted.yaml", { "MOCK_PRIMARY_PORT": primary.port, "MOCK_SECONDARY_PORT": secondary.port, }) gateway_up(config_path) resp = requests.post( GATEWAY_CHAT_URL, json=CHAT_REQUEST_BODY, headers={"Authorization": "Bearer test-key"}, timeout=30, ) # Should get an error response (429 or the gateway's retry_exhausted error) assert resp.status_code >= 400, ( f"Expected error status but got {resp.status_code}" ) body = resp.json() # The error response should contain retry attempt details error = body.get("error", {}) assert error.get("type") == "retry_exhausted", ( f"Expected retry_exhausted error type, got: {error}" ) assert "attempts" in error, "Error should contain attempts list" assert "total_attempts" in error, "Error should contain total_attempts" assert error["total_attempts"] >= 2, ( f"Expected at least 2 total attempts, got {error['total_attempts']}" ) finally: gateway_down() primary.stop() secondary.stop() if config_path and os.path.exists(config_path): os.unlink(config_path) def test_it4_no_retry_policy_no_retry(self): """ IT-4: No retry_policy → no retry. Primary returns 429 with no retry_policy configured. Assert client gets 429 directly (no retry to secondary). """ primary = MockServer([ (429, {}, make_error_response(429, "Rate limit exceeded")), ]) secondary = MockServer([ (200, {}, SUCCESS_RESPONSE), ]) primary.start() secondary.start() config_path = None try: config_path = write_config("retry_it4_no_retry_policy.yaml", { "MOCK_PRIMARY_PORT": primary.port, "MOCK_SECONDARY_PORT": secondary.port, }) gateway_up(config_path) resp = requests.post( GATEWAY_CHAT_URL, json=CHAT_REQUEST_BODY, headers={"Authorization": "Bearer test-key"}, timeout=30, ) # Should get 429 directly — no retry assert resp.status_code == 429, ( f"Expected 429 but got {resp.status_code}: {resp.text}" ) # Secondary should NOT have been called assert secondary.call_count == 0, ( f"Secondary should not be called without retry_policy, " f"but was called {secondary.call_count} times" ) finally: gateway_down() primary.stop() secondary.stop() if config_path and os.path.exists(config_path): os.unlink(config_path) def test_it5_max_attempts_respected(self): """ IT-5: max_attempts respected. Primary returns 429, max_attempts: 1. Assert only 1 retry attempt is made, then error is returned. The secondary also returns 429 to ensure we see the exhaustion. """ primary = MockServer([ (429, {}, make_error_response(429, "Rate limit exceeded")), ]) secondary = MockServer([ (429, {}, make_error_response(429, "Rate limit exceeded")), ]) tertiary = MockServer([ (200, {}, SUCCESS_RESPONSE), ]) primary.start() secondary.start() tertiary.start() config_path = None try: config_path = write_config("retry_it5_max_attempts.yaml", { "MOCK_PRIMARY_PORT": primary.port, "MOCK_SECONDARY_PORT": secondary.port, "MOCK_TERTIARY_PORT": tertiary.port, }) gateway_up(config_path) resp = requests.post( GATEWAY_CHAT_URL, json=CHAT_REQUEST_BODY, headers={"Authorization": "Bearer test-key"}, timeout=30, ) # With max_attempts: 1, only 1 retry should happen after the initial failure. # Primary fails (429) → 1 retry to secondary (429) → exhausted. # Tertiary should NOT be reached. assert resp.status_code >= 400, ( f"Expected error status but got {resp.status_code}" ) assert tertiary.call_count == 0, ( f"Tertiary should not be called with max_attempts=1, " f"but was called {tertiary.call_count} times" ) # Total calls: primary (1) + secondary (1 retry) = 2 total_calls = primary.call_count + secondary.call_count assert total_calls <= 2, ( f"Expected at most 2 total calls (1 original + 1 retry), " f"got {total_calls}" ) finally: gateway_down() primary.stop() secondary.stop() tertiary.stop() if config_path and os.path.exists(config_path): os.unlink(config_path) def test_it6_backoff_delay_observed(self): """ IT-6: Backoff delay observed. Configure same_model strategy with backoff (base_ms: 500, jitter: false). Primary returns 429 twice, then 200 on third attempt. Assert total response time includes backoff delays. With base_ms=500 and no jitter: - Attempt 1: fail (429) - Backoff: 500ms (500 * 2^0) - Attempt 2: fail (429) - Backoff: 1000ms (500 * 2^1) - Attempt 3: success (200) Total backoff >= 1500ms (500 + 1000) """ primary = MockServer([ (429, {}, make_error_response(429, "Rate limit exceeded")), (429, {}, make_error_response(429, "Rate limit exceeded")), (200, {}, SUCCESS_RESPONSE), ]) primary.start() config_path = None try: config_path = write_config("retry_it6_backoff_delay.yaml", { "MOCK_PRIMARY_PORT": primary.port, }) gateway_up(config_path) start_time = time.time() resp = requests.post( GATEWAY_CHAT_URL, json=CHAT_REQUEST_BODY, headers={"Authorization": "Bearer test-key"}, timeout=60, ) elapsed = time.time() - start_time assert resp.status_code == 200, ( f"Expected 200 but got {resp.status_code}: {resp.text}" ) # With base_ms=500 and no jitter, backoff should be at least: # 500ms (attempt 1→2) + 1000ms (attempt 2→3) = 1500ms # Use a slightly lower threshold (1.0s) to account for timing variance min_expected_delay = 1.0 # seconds assert elapsed >= min_expected_delay, ( f"Expected response time >= {min_expected_delay}s due to backoff, " f"but got {elapsed:.2f}s" ) # Primary should have been called 3 times assert primary.call_count == 3, ( f"Expected 3 calls to primary, got {primary.call_count}" ) finally: gateway_down() primary.stop() if config_path and os.path.exists(config_path): os.unlink(config_path) def test_it12_streaming_preserved_across_retry(self): """ IT-12: Streaming request preserved across retry. Primary returns 429, secondary returns 200 with SSE streaming. Assert client receives a streamed response. """ # Primary always returns 429 primary = MockServer([ (429, {}, make_error_response(429, "Rate limit exceeded")), ]) # Secondary returns streaming 200 secondary_handler = create_streaming_handler_class([ (200, {}, "STREAM"), ]) secondary_server = HTTPServer(("0.0.0.0", 0), secondary_handler) secondary_port = secondary_server.server_address[1] secondary_thread = threading.Thread( target=secondary_server.serve_forever, daemon=True ) primary.start() secondary_thread.start() logger.info(f"Streaming secondary mock started on port {secondary_port}") config_path = None try: config_path = write_config("retry_it12_streaming.yaml", { "MOCK_PRIMARY_PORT": primary.port, "MOCK_SECONDARY_PORT": secondary_port, }) gateway_up(config_path) # Send a streaming request streaming_body = dict(CHAT_REQUEST_BODY) streaming_body["stream"] = True resp = requests.post( GATEWAY_CHAT_URL, json=streaming_body, headers={"Authorization": "Bearer test-key"}, stream=True, timeout=30, ) assert resp.status_code == 200, ( f"Expected 200 but got {resp.status_code}: {resp.text}" ) # Collect streamed chunks chunks = [] for line in resp.iter_lines(decode_unicode=True): if line: chunks.append(line) # Should have received SSE data chunks assert len(chunks) > 0, "Should have received streaming chunks" # Verify at least one chunk contains "data:" prefix (SSE format) data_chunks = [c for c in chunks if c.startswith("data:")] assert len(data_chunks) > 0, ( f"Expected SSE data chunks, got: {chunks}" ) # Verify the stream contains expected content content_found = False for chunk in data_chunks: if chunk == "data: [DONE]": continue try: payload = json.loads(chunk[len("data: "):]) delta = payload.get("choices", [{}])[0].get("delta", {}) if delta.get("content"): content_found = True except (json.JSONDecodeError, IndexError): pass assert content_found, "Should have received content in streaming chunks" # Primary should have been called (got 429) assert primary.call_count >= 1, "Primary should have been called" finally: gateway_down() primary.stop() secondary_server.shutdown() secondary_thread.join(timeout=5) if config_path and os.path.exists(config_path): os.unlink(config_path) def test_it13_request_body_preserved_across_retry(self): """ IT-13: Request body preserved across retry. Primary returns 429, secondary echoes the request body. Assert the echoed body matches the original request. """ primary = MockServer([ (429, {}, make_error_response(429, "Rate limit exceeded")), ]) # Secondary echoes the request body echo_server = EchoMockServer([ (200, {}, ""), # Status 200 triggers echo behavior ]) primary.start() echo_server.start() config_path = None try: config_path = write_config("retry_it13_body_preserved.yaml", { "MOCK_PRIMARY_PORT": primary.port, "MOCK_SECONDARY_PORT": echo_server.port, }) gateway_up(config_path) # Send request with a distinctive body request_body = { "model": "openai/gpt-4o", "messages": [ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Tell me about retry mechanisms."}, ], "temperature": 0.7, "max_tokens": 100, } resp = requests.post( GATEWAY_CHAT_URL, json=request_body, headers={"Authorization": "Bearer test-key"}, timeout=30, ) assert resp.status_code == 200, ( f"Expected 200 but got {resp.status_code}: {resp.text}" ) # The echo server received the request body — verify it was preserved assert echo_server.call_count >= 1, "Echo server should have been called" # Parse the body that the echo server received received_body_str = echo_server.received_requests[-1]["body"] received_body = json.loads(received_body_str) # The gateway may modify the model field when routing to a different # provider, but the messages and other fields should be preserved assert received_body.get("messages") is not None, ( "Messages should be preserved in the forwarded request" ) # Verify the user message content is preserved user_messages = [ m for m in received_body["messages"] if m.get("role") == "user" ] assert len(user_messages) > 0, "User messages should be preserved" assert user_messages[-1]["content"] == "Tell me about retry mechanisms.", ( f"User message content should be preserved, got: {user_messages[-1]}" ) # Primary should have been called (got 429) assert primary.call_count >= 1, "Primary should have been called" finally: gateway_down() primary.stop() echo_server.stop() if config_path and os.path.exists(config_path): os.unlink(config_path) # ----------------------------------------------------------------------- # P1 Integration Tests (IT-7 through IT-10) # ----------------------------------------------------------------------- def test_it7_fallback_models_priority(self): """ IT-7: Fallback models priority. Primary mock returns 429, fallback[0] returns 429, fallback[1] returns 200. Assert client gets 200 from fallback[1] and providers are tried in the order defined by fallback_models. Config: fallback_models: [anthropic/claude-3-5-sonnet, mistral/mistral-large] """ primary = MockServer([ (429, {}, make_error_response(429, "Rate limit exceeded")), ]) fallback1 = MockServer([ (429, {}, make_error_response(429, "Rate limit exceeded")), ]) fallback2 = MockServer([ (200, {}, SUCCESS_RESPONSE), ]) primary.start() fallback1.start() fallback2.start() config_path = None try: config_path = write_config("retry_it7_fallback_priority.yaml", { "MOCK_PRIMARY_PORT": primary.port, "MOCK_FALLBACK1_PORT": fallback1.port, "MOCK_FALLBACK2_PORT": fallback2.port, }) gateway_up(config_path) resp = requests.post( GATEWAY_CHAT_URL, json=CHAT_REQUEST_BODY, headers={"Authorization": "Bearer test-key"}, timeout=30, ) # Assert: client gets 200 from fallback[1] assert resp.status_code == 200, ( f"Expected 200 but got {resp.status_code}: {resp.text}" ) body = resp.json() assert "choices" in body assert body["choices"][0]["message"]["content"] == "Hello from mock provider!" # Assert: providers tried in order — primary, fallback[0], fallback[1] assert primary.call_count >= 1, "Primary should have been called first" assert fallback1.call_count >= 1, ( "Fallback[0] (anthropic/claude-3-5-sonnet) should have been tried " "before fallback[1]" ) assert fallback2.call_count >= 1, ( "Fallback[1] (mistral/mistral-large) should have been called" ) finally: gateway_down() primary.stop() fallback1.stop() fallback2.stop() if config_path and os.path.exists(config_path): os.unlink(config_path) def test_it8_retry_after_header_honored(self): """ IT-8: Retry-After header honored. Primary returns 429 + Retry-After: 2 on the first call, then 200 on the second call (same_model strategy). Assert the total response time is >= 2 seconds, proving the gateway waited for the Retry-After duration. """ primary = MockServer([ (429, {"Retry-After": "2"}, make_error_response(429, "Rate limit exceeded")), (200, {}, SUCCESS_RESPONSE), ]) primary.start() config_path = None try: config_path = write_config("retry_it8_retry_after_honored.yaml", { "MOCK_PRIMARY_PORT": primary.port, }) gateway_up(config_path) start_time = time.time() resp = requests.post( GATEWAY_CHAT_URL, json=CHAT_REQUEST_BODY, headers={"Authorization": "Bearer test-key"}, timeout=30, ) elapsed = time.time() - start_time # Assert: client gets 200 after the retry assert resp.status_code == 200, ( f"Expected 200 but got {resp.status_code}: {resp.text}" ) body = resp.json() assert "choices" in body # Assert: total time >= 2 seconds (Retry-After: 2 was honored) # Use a slightly lower threshold to account for timing variance min_expected_delay = 1.8 # seconds assert elapsed >= min_expected_delay, ( f"Expected response time >= {min_expected_delay}s due to " f"Retry-After: 2, but got {elapsed:.2f}s" ) # Primary should have been called twice (429 then 200) assert primary.call_count == 2, ( f"Expected 2 calls to primary (429 + 200), got {primary.call_count}" ) finally: gateway_down() primary.stop() if config_path and os.path.exists(config_path): os.unlink(config_path) def test_it9_retry_after_blocks_initial_selection(self): """ IT-9: Retry-After blocks initial selection. First request: primary returns 429 + Retry-After: 60 and the gateway retries to the secondary (which returns 200). Second request (sent within 60s): because the primary is globally blocked by the Retry-After state, the gateway should route directly to the alternative provider without hitting the primary again. """ # Primary: first call returns 429 + Retry-After: 60, subsequent calls # return 200 (but should not be reached for the second request). primary = MockServer([ (429, {"Retry-After": "60"}, make_error_response(429, "Rate limit exceeded")), (200, {}, SUCCESS_RESPONSE), (200, {}, SUCCESS_RESPONSE), ]) secondary = MockServer([ (200, {}, SUCCESS_RESPONSE), (200, {}, SUCCESS_RESPONSE), ]) primary.start() secondary.start() config_path = None try: config_path = write_config( "retry_it9_retry_after_blocks_selection.yaml", { "MOCK_PRIMARY_PORT": primary.port, "MOCK_SECONDARY_PORT": secondary.port, }, ) gateway_up(config_path) # --- First request: triggers the Retry-After state --- resp1 = requests.post( GATEWAY_CHAT_URL, json=CHAT_REQUEST_BODY, headers={"Authorization": "Bearer test-key"}, timeout=30, ) assert resp1.status_code == 200, ( f"First request: expected 200 but got {resp1.status_code}: {resp1.text}" ) primary_calls_after_first = primary.call_count secondary_calls_after_first = secondary.call_count # Primary should have been called once (got 429), secondary once (got 200) assert primary_calls_after_first >= 1, ( "Primary should have been called for the first request" ) assert secondary_calls_after_first >= 1, ( "Secondary should have been called as fallback for the first request" ) # --- Second request: within the 60s Retry-After window --- # The primary model should be blocked globally, so the gateway # should route to the alternative provider directly. resp2 = requests.post( GATEWAY_CHAT_URL, json={ "model": "openai/gpt-4o", "messages": [{"role": "user", "content": "Second request"}], }, headers={"Authorization": "Bearer test-key"}, timeout=30, ) assert resp2.status_code == 200, ( f"Second request: expected 200 but got {resp2.status_code}: {resp2.text}" ) # Assert: primary was NOT called again for the second request # (it should still be blocked by the 60s Retry-After) assert primary.call_count == primary_calls_after_first, ( f"Primary should not have been called for the second request " f"(blocked by Retry-After: 60). Calls before: " f"{primary_calls_after_first}, after: {primary.call_count}" ) # Assert: secondary handled the second request assert secondary.call_count > secondary_calls_after_first, ( f"Secondary should have handled the second request. " f"Calls before: {secondary_calls_after_first}, " f"after: {secondary.call_count}" ) finally: gateway_down() primary.stop() secondary.stop() if config_path and os.path.exists(config_path): os.unlink(config_path) def test_it10_timeout_triggers_retry(self): """ IT-10: Timeout triggers retry. Primary mock delays its response beyond the gateway's request timeout. Secondary returns 200 immediately. Assert client gets 200 from the secondary provider. """ # Primary delays 120 seconds — well beyond any reasonable gateway timeout. # The gateway should time out and retry to the secondary. primary = DelayedMockServer( response_queue=[ (200, {}, SUCCESS_RESPONSE), ], delay_seconds=120, ) secondary = MockServer([ (200, {}, SUCCESS_RESPONSE), ]) primary.start() secondary.start() config_path = None try: config_path = write_config("retry_it10_timeout_triggers_retry.yaml", { "MOCK_PRIMARY_PORT": primary.port, "MOCK_SECONDARY_PORT": secondary.port, }) gateway_up(config_path) resp = requests.post( GATEWAY_CHAT_URL, json=CHAT_REQUEST_BODY, headers={"Authorization": "Bearer test-key"}, timeout=120, ) # Assert: client gets 200 from the secondary assert resp.status_code == 200, ( f"Expected 200 but got {resp.status_code}: {resp.text}" ) body = resp.json() assert "choices" in body assert body["choices"][0]["message"]["content"] == "Hello from mock provider!" # Assert: primary was called (timed out), secondary was called (returned 200) assert primary.call_count >= 1, ( "Primary should have been called (and timed out)" ) assert secondary.call_count >= 1, ( "Secondary should have been called after primary timed out" ) finally: gateway_down() primary.stop() secondary.stop() if config_path and os.path.exists(config_path): os.unlink(config_path) def test_it11_high_latency_proactive_failover(self): """ IT-11: High latency proactive failover. First request: primary mock delays response by ~1.5s (threshold_ms=1000 + 500ms buffer) but completes with 200 OK. The client receives the slow 200 response (completed responses are always delivered). However, the gateway records a Latency_Block_State for the primary model. Second request: sent immediately after the first. Because the primary is now latency-blocked (block_duration_seconds=60, min_triggers=1), the gateway should route directly to the secondary provider. Config: on_high_latency with min_triggers: 1, threshold_ms: 1000, block_duration_seconds: 60, measure: "total", scope: "model", apply_to: "global". """ # Primary: delays 1.5s (exceeds 1000ms threshold), returns 200. # Queue two responses in case the primary is called twice (it shouldn't # be for the second request, but we need a response ready just in case). primary = DelayedMockServer( response_queue=[ (200, {}, SUCCESS_RESPONSE), (200, {}, SUCCESS_RESPONSE), ], delay_seconds=1.5, ) # Secondary: returns 200 immediately. secondary = MockServer([ (200, {}, SUCCESS_RESPONSE), (200, {}, SUCCESS_RESPONSE), ]) primary.start() secondary.start() config_path = None try: config_path = write_config( "retry_it11_high_latency_failover.yaml", { "MOCK_PRIMARY_PORT": primary.port, "MOCK_SECONDARY_PORT": secondary.port, }, ) gateway_up(config_path) # --- First request: triggers the latency block --- # The primary will respond with 200 after ~1.5s delay. # Since the response completes, the client gets the 200 back, # but the gateway should record a Latency_Block_State entry. resp1 = requests.post( GATEWAY_CHAT_URL, json=CHAT_REQUEST_BODY, headers={"Authorization": "Bearer test-key"}, timeout=30, ) assert resp1.status_code == 200, ( f"First request: expected 200 but got {resp1.status_code}: " f"{resp1.text}" ) primary_calls_after_first = primary.call_count secondary_calls_after_first = secondary.call_count # Primary should have been called once (slow 200). assert primary_calls_after_first >= 1, ( "Primary should have been called for the first request" ) # --- Second request: within the 60s latency block window --- # The primary model should be latency-blocked globally, so the # gateway should route to the secondary provider directly. resp2 = requests.post( GATEWAY_CHAT_URL, json={ "model": "openai/gpt-4o", "messages": [{"role": "user", "content": "Second request"}], }, headers={"Authorization": "Bearer test-key"}, timeout=30, ) assert resp2.status_code == 200, ( f"Second request: expected 200 but got {resp2.status_code}: " f"{resp2.text}" ) # Assert: primary was NOT called again for the second request # (it should be latency-blocked for 60s after the slow first response). assert primary.call_count == primary_calls_after_first, ( f"Primary should not have been called for the second request " f"(latency-blocked for 60s). Calls before: " f"{primary_calls_after_first}, after: {primary.call_count}" ) # Assert: secondary handled the second request. assert secondary.call_count > secondary_calls_after_first, ( f"Secondary should have handled the second request. " f"Calls before: {secondary_calls_after_first}, " f"after: {secondary.call_count}" ) finally: gateway_down() primary.stop() secondary.stop() if config_path and os.path.exists(config_path): os.unlink(config_path)