diff --git a/crates/brightstaff/src/handlers/chat_completions.rs b/crates/brightstaff/src/handlers/chat_completions.rs
index b049e4f3..3b95b15f 100644
--- a/crates/brightstaff/src/handlers/chat_completions.rs
+++ b/crates/brightstaff/src/handlers/chat_completions.rs
@@ -203,9 +203,10 @@ pub async fn chat(
         }
     };
 
-    // copy over the headers from the original response
+    // copy over the headers and status code from the original response
    let response_headers = llm_response.headers().clone();
-    let mut response = Response::builder();
+    let upstream_status = llm_response.status();
+    let mut response = Response::builder().status(upstream_status);
    let headers = response.headers_mut().unwrap();
    for (header_name, header_value) in response_headers.iter() {
        headers.insert(header_name, header_value.clone());
diff --git a/crates/hermesllm/src/clients/transformer.rs b/crates/hermesllm/src/clients/transformer.rs
index b907c243..0ec06847 100644
--- a/crates/hermesllm/src/clients/transformer.rs
+++ b/crates/hermesllm/src/clients/transformer.rs
@@ -102,7 +102,7 @@ impl TryFrom<AnthropicMessagesRequest> for ChatCompletionsRequest {
            messages: openai_messages,
            temperature: req.temperature,
            top_p: req.top_p,
-            max_tokens: Some(req.max_tokens),
+            max_completion_tokens: Some(req.max_tokens),
            stream: req.stream,
            stop: req.stop_sequences,
            tools: openai_tools,
@@ -142,7 +142,9 @@ impl TryFrom<ChatCompletionsRequest> for AnthropicMessagesRequest {
            model: req.model,
            system: system_prompt,
            messages,
-            max_tokens: req.max_tokens.unwrap_or(DEFAULT_MAX_TOKENS),
+            max_tokens: req.max_completion_tokens
+                .or(req.max_tokens)
+                .unwrap_or(DEFAULT_MAX_TOKENS),
            container: None,
            mcp_servers: None,
            service_tier: None,
@@ -1079,7 +1081,7 @@ mod tests {
        assert_eq!(openai_req.model, "claude-3-sonnet-20240229");
        assert_eq!(openai_req.messages.len(), 2); // system + user message
-        assert_eq!(openai_req.max_tokens, Some(1024));
+        assert_eq!(openai_req.max_completion_tokens, Some(1024));
        assert_eq!(openai_req.temperature, Some(0.7));
        assert_eq!(openai_req.top_p, Some(0.9));
        assert_eq!(openai_req.stream, Some(false));
diff --git a/crates/hermesllm/src/providers/request.rs b/crates/hermesllm/src/providers/request.rs
index adde81f4..3603edf2 100644
--- a/crates/hermesllm/src/providers/request.rs
+++ b/crates/hermesllm/src/providers/request.rs
@@ -360,6 +360,9 @@ mod tests {
        assert_eq!(openai_req.model, openai_req2.model);
        assert_eq!(openai_req.messages[0].role, openai_req2.messages[0].role);
        assert_eq!(openai_req.messages[0].content.extract_text(), openai_req2.messages[0].content.extract_text());
-        assert_eq!(openai_req.max_tokens, openai_req2.max_tokens);
+        // After the roundtrip, the deprecated max_tokens should have been converted to max_completion_tokens
+        let original_max_tokens = openai_req.max_completion_tokens.or(openai_req.max_tokens);
+        let roundtrip_max_tokens = openai_req2.max_completion_tokens.or(openai_req2.max_tokens);
+        assert_eq!(original_max_tokens, roundtrip_max_tokens);
    }
}
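A note on the precedence the `TryFrom` conversions above implement: the newer `max_completion_tokens` field wins, the deprecated `max_tokens` is the fallback, and `DEFAULT_MAX_TOKENS` is the last resort. A minimal standalone sketch of that logic; the function name and `u32` types here are illustrative, not the crate's actual signatures:

```rust
/// Sketch of the token-limit precedence used in the conversions above:
/// prefer the new field, fall back to the deprecated one, else a default.
fn effective_max_tokens(
    max_completion_tokens: Option<u32>,
    max_tokens: Option<u32>,
    default: u32,
) -> u32 {
    max_completion_tokens.or(max_tokens).unwrap_or(default)
}

fn main() {
    // The new field wins when both are set.
    assert_eq!(effective_max_tokens(Some(500), Some(50), 1024), 500);
    // The deprecated field is still honored when it is the only one set.
    assert_eq!(effective_max_tokens(None, Some(50), 1024), 50);
    // Neither set: fall back to the default.
    assert_eq!(effective_max_tokens(None, None, 1024), 1024);
}
```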
diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs
index b1651e0c..12054ccb 100644
--- a/crates/llm_gateway/src/stream_context.rs
+++ b/crates/llm_gateway/src/stream_context.rs
@@ -45,6 +45,8 @@ pub struct StreamContext {
    traces_queue: Arc>>,
    overrides: Rc>,
    user_message: Option,
+    /// Store upstream response status code to handle error responses gracefully
+    upstream_status_code: Option<StatusCode>,
}

impl StreamContext {
@@ -72,6 +74,7 @@ impl StreamContext {
            traces_queue,
            request_body_sent_time: None,
            user_message: None,
+            upstream_status_code: None,
        }
    }

@@ -871,6 +874,19 @@ impl HttpContext for StreamContext {
    }

    fn on_http_response_headers(&mut self, _num_headers: usize, _end_of_stream: bool) -> Action {
+        // Capture the upstream response status code to handle errors appropriately
+        if let Some(status_str) = self.get_http_response_header(":status") {
+            if let Ok(status_code) = status_str.parse::<u16>() {
+                self.upstream_status_code = StatusCode::from_u16(status_code).ok();
+
+                info!(
+                    "[ARCHGW_REQ_ID:{}] UPSTREAM_RESPONSE_STATUS: {}",
+                    self.request_identifier(),
+                    status_code
+                );
+            }
+        }
+
        self.remove_http_response_header("content-length");
        self.remove_http_response_header("content-encoding");

@@ -888,6 +904,32 @@ impl HttpContext for StreamContext {
            return Action::Continue;
        }

+        // Check if this is an error response from upstream
+        if let Some(status_code) = &self.upstream_status_code {
+            if status_code.is_client_error() || status_code.is_server_error() {
+                info!(
+                    "[ARCHGW_REQ_ID:{}] UPSTREAM_ERROR_RESPONSE: status={} body_size={}",
+                    self.request_identifier(),
+                    status_code.as_u16(),
+                    body_size
+                );
+
+                // For error responses, forward the upstream error directly without parsing
+                if body_size > 0 {
+                    if let Ok(body) = self.read_raw_response_body(body_size) {
+                        debug!(
+                            "[ARCHGW_REQ_ID:{}] UPSTREAM_ERROR_BODY: {}",
+                            self.request_identifier(),
+                            String::from_utf8_lossy(&body)
+                        );
+                        // Forward the error response as-is
+                        self.set_http_response_body(0, body_size, &body);
+                    }
+                }
+                return Action::Continue;
+            }
+        }
+
        match self.client_api {
            Some(SupportedAPIs::OpenAIChatCompletions(_)) => {}
            Some(SupportedAPIs::AnthropicMessagesAPI(_)) => {}
diff --git a/demos/use_cases/model_alias_routing/arch_config_with_aliases.yaml b/demos/use_cases/model_alias_routing/arch_config_with_aliases.yaml
index 49245d55..c829b395 100644
--- a/demos/use_cases/model_alias_routing/arch_config_with_aliases.yaml
+++ b/demos/use_cases/model_alias_routing/arch_config_with_aliases.yaml
@@ -10,10 +10,13 @@ listeners:

llm_providers:
  # OpenAI Models
-  - model: openai/gpt-4o-mini
+  - model: openai/gpt-5-mini-2025-08-07
    access_key: $OPENAI_API_KEY
    default: true

+  - model: openai/gpt-4o-mini
+    access_key: $OPENAI_API_KEY
+
  - model: openai/o3
    access_key: $OPENAI_API_KEY

@@ -41,7 +44,7 @@ llm_providers:
model_aliases:
  # Alias for summarization tasks -> fast/cheap model
  arch.summarize.v1:
-    target: gpt-4o-mini
+    target: gpt-5-mini-2025-08-07

  # Alias for general purpose tasks -> latest model
  arch.v1:
@@ -61,7 +64,7 @@ model_aliases:
  # Semantic aliases
  summary-model:
-    target: gpt-4o-mini
+    target: gpt-5-mini-2025-08-07

  chat-model:
    target: llama3.1
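The filter change above boils down to two small steps: parse the `:status` pseudo-header into a status code, and short-circuit body transformation whenever that code is 4xx/5xx. A standalone sketch, assuming the `http` crate's `StatusCode` (which provides the `from_u16`, `is_client_error`, and `is_server_error` calls seen in the diff); the helper names are illustrative, since the real logic runs inline in the proxy-wasm callbacks:

```rust
use http::StatusCode;

/// Parse the ":status" pseudo-header value as captured in
/// on_http_response_headers above. Non-numeric or out-of-range values
/// yield None, mirroring `StatusCode::from_u16(..).ok()`.
fn parse_upstream_status(status_str: &str) -> Option<StatusCode> {
    status_str
        .parse::<u16>()
        .ok()
        .and_then(|code| StatusCode::from_u16(code).ok())
}

/// Decide whether to forward the upstream body untouched: any 4xx/5xx
/// is passed through as-is instead of being parsed as an LLM response.
fn should_forward_raw(status: StatusCode) -> bool {
    status.is_client_error() || status.is_server_error()
}

fn main() {
    let ok = parse_upstream_status("200").unwrap();
    let bad_request = parse_upstream_status("400").unwrap();
    assert!(!should_forward_raw(ok));
    assert!(should_forward_raw(bad_request));
    assert_eq!(parse_upstream_status("not-a-status"), None);
}
```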
diff --git a/tests/e2e/test_model_alias_routing.py b/tests/e2e/test_model_alias_routing.py
index c0f6f7c5..696507f2 100644
--- a/tests/e2e/test_model_alias_routing.py
+++ b/tests/e2e/test_model_alias_routing.py
@@ -33,8 +33,8 @@ def test_openai_client_with_alias_arch_summarize_v1():
    )

    completion = client.chat.completions.create(
-        model="arch.summarize.v1",  # This should resolve to 4o-mini
-        max_tokens=50,
+        model="arch.summarize.v1",  # This should resolve to gpt-5-mini
+        max_completion_tokens=500,  # Increased limit: gpt-5-mini spends part of the budget on reasoning tokens, so 50 would truncate the reply
        messages=[
            {
                "role": "user",
@@ -60,7 +60,7 @@ def test_openai_client_with_alias_arch_v1():

    completion = client.chat.completions.create(
        model="arch.v1",  # This should resolve to o3
-        max_tokens=50,
+        max_completion_tokens=500,
        messages=[
            {
                "role": "user",
@@ -82,8 +82,8 @@ def test_anthropic_client_with_alias_arch_summarize_v1():
    client = anthropic.Anthropic(api_key="test-key", base_url=base_url)

    message = client.messages.create(
-        model="arch.summarize.v1",  # This should resolve to 4o-mini
-        max_tokens=50,
+        model="arch.summarize.v1",  # This should resolve to gpt-5-mini
+        max_tokens=500,
        messages=[
            {
                "role": "user",
@@ -108,7 +108,7 @@ def test_anthropic_client_with_alias_arch_v1():

    message = client.messages.create(
        model="arch.v1",  # This should resolve to o3
-        max_tokens=50,
+        max_tokens=500,
        messages=[
            {
                "role": "user",
@@ -135,8 +135,8 @@ def test_openai_client_with_alias_streaming():
    )

    stream = client.chat.completions.create(
-        model="arch.summarize.v1",  # This should resolve to 4o-mini
-        max_tokens=50,
+        model="arch.summarize.v1",  # This should resolve to gpt-5-mini
+        max_completion_tokens=500,
        messages=[
            {
                "role": "user",
@@ -166,8 +166,8 @@ def test_anthropic_client_with_alias_streaming():
    client = anthropic.Anthropic(api_key="test-key", base_url=base_url)

    with client.messages.stream(
-        model="arch.summarize.v1",  # This should resolve to 4o-mini
-        max_tokens=50,
+        model="arch.summarize.v1",  # This should resolve to gpt-5-mini
+        max_tokens=500,
        messages=[
            {
                "role": "user",
@@ -184,6 +184,89 @@ def test_anthropic_client_with_alias_streaming():
    assert full_text == "Hello from streaming alias via Anthropic!"


+def test_400_error_handling_with_alias():
+    """Test that 400 errors from upstream are properly returned by archgw"""
+    logger.info(
+        "Testing 400 error handling with arch.summarize.v1 and an unsupported temperature value"
+    )
+
+    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
+    client = openai.OpenAI(
+        api_key="test-key",
+        base_url=f"{base_url}/v1",
+    )
+
+    try:
+        completion = client.chat.completions.create(
+            model="arch.summarize.v1",  # This should resolve to gpt-5-mini-2025-08-07
+            max_completion_tokens=50,
+            temperature=0.7,  # gpt-5 models only accept the default temperature, so this should trigger a 400 error
+            messages=[
+                {
+                    "role": "user",
+                    "content": "Hello, this should trigger a 400 error due to an unsupported parameter value",
+                }
+            ],
+        )
+        # If we reach here, the request unexpectedly succeeded
+        logger.error(
+            f"Expected 400 error but got successful response: {completion.choices[0].message.content}"
+        )
+        assert False, "Expected 400 error but request succeeded"
+    except openai.BadRequestError as e:
+        # This is what we expect - a 400 Bad Request error
+        logger.info(f"Correctly received 400 Bad Request error: {e}")
+        assert e.status_code == 400, f"Expected status code 400, got {e.status_code}"
+        logger.info("✓ 400 error handling working correctly")
+    except Exception as e:
+        # Any other exception is unexpected
+        logger.error(
+            f"Unexpected error type (should be BadRequestError): {type(e).__name__}: {e}"
+        )
+        assert False, f"Expected BadRequestError but got {type(e).__name__}: {e}"
+
+
+def test_400_error_handling_unsupported_parameter():
+    """Test that 400 errors for unsupported parameters are properly returned by archgw"""
+    logger.info("Testing 400 error handling with unsupported max_tokens parameter")
+
+    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
+    client = openai.OpenAI(
+        api_key="test-key",
+        base_url=f"{base_url}/v1",
+    )
+
+    try:
+        # Use the deprecated max_tokens parameter, which should trigger a 400 error
+        completion = client.chat.completions.create(
+            model="arch.summarize.v1",  # This should resolve to gpt-5-mini-2025-08-07
+            max_tokens=150,  # Unsupported by newer models, which expect max_completion_tokens
+            messages=[
+                {
+                    "role": "user",
+                    "content": "Hello, this should trigger a 400 error due to the unsupported max_tokens parameter",
+                }
+            ],
+        )
+        # If we reach here, the request unexpectedly succeeded
+        logger.error(
+            f"Expected 400 error but got successful response: {completion.choices[0].message.content}"
+        )
+        assert False, "Expected 400 error but request succeeded"
+    except openai.BadRequestError as e:
+        # This is what we expect - a 400 Bad Request error
+        logger.info(f"Correctly received 400 Bad Request error: {e}")
+        assert e.status_code == 400, f"Expected status code 400, got {e.status_code}"
+        assert "max_tokens" in str(e), "Expected error message to mention max_tokens"
+        logger.info("✓ 400 error handling for unsupported parameters working correctly")
+    except Exception as e:
+        # Any other exception is unexpected
+        logger.error(
+            f"Unexpected error type (should be BadRequestError): {type(e).__name__}: {e}"
+        )
+        assert False, f"Expected BadRequestError but got {type(e).__name__}: {e}"
+
+
 def test_nonexistent_alias():
    """Test that using a non-existent alias falls back to treating it as a direct model name"""
    logger.info(
@@ -199,7 +282,7 @@ def test_nonexistent_alias():
    try:
        completion = client.chat.completions.create(
            model="nonexistent.alias",  # This alias doesn't exist
-            max_tokens=50,
+            max_completion_tokens=50,
            messages=[
                {
                    "role": "user",
@@ -231,8 +314,8 @@ def test_direct_model_4o_mini_openai():
    )

    completion = client.chat.completions.create(
-        model="4o-mini",  # Direct model name
-        max_tokens=50,
+        model="gpt-4o-mini",  # Direct model name
+        max_completion_tokens=50,
        messages=[
            {
                "role": "user",
@@ -254,7 +337,7 @@ def test_direct_model_4o_mini_anthropic():
    client = anthropic.Anthropic(api_key="test-key", base_url=base_url)

    message = client.messages.create(
-        model="4o-mini",  # Direct model name
+        model="gpt-4o-mini",  # Direct model name
        max_tokens=50,
        messages=[
            {
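The tests above observe the error passthrough via the OpenAI SDK; any HTTP client sees the same behavior once the gateway forwards the upstream status and body verbatim. A hypothetical Rust equivalent of the deprecated-parameter test; the gateway URL/port and the `reqwest` (with `blocking` and `json` features) and `serde_json` dependencies are assumptions, not part of this change:

```rust
use serde_json::json;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Hypothetical local gateway address; match it to your archgw listener.
    let url = "http://localhost:12000/v1/chat/completions";

    // Deliberately use the deprecated max_tokens field so that a model
    // expecting max_completion_tokens returns a 400, as in the e2e test.
    let body = json!({
        "model": "arch.summarize.v1",
        "max_tokens": 150,
        "messages": [{ "role": "user", "content": "Hello" }]
    });

    let resp = reqwest::blocking::Client::new()
        .post(url)
        .bearer_auth("test-key")
        .json(&body)
        .send()?;

    // With the stream_context.rs change, the upstream 400 status and error
    // body arrive here intact instead of failing to parse inside the filter.
    println!("status: {}", resp.status());
    println!("body: {}", resp.text()?);
    Ok(())
}
```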