diff --git a/crates/brightstaff/src/handlers/chat_completions.rs b/crates/brightstaff/src/handlers/chat_completions.rs
index b049e4f3..3b95b15f 100644
--- a/crates/brightstaff/src/handlers/chat_completions.rs
+++ b/crates/brightstaff/src/handlers/chat_completions.rs
@@ -203,9 +203,10 @@ pub async fn chat(
         }
     };
 
-    // copy over the headers from the original response
+    // copy over the headers and status code from the original response
    let response_headers = llm_response.headers().clone();
-    let mut response = Response::builder();
+    let upstream_status = llm_response.status();
+    let mut response = Response::builder().status(upstream_status);
    let headers = response.headers_mut().unwrap();
    for (header_name, header_value) in response_headers.iter() {
        headers.insert(header_name, header_value.clone());
diff --git a/crates/hermesllm/src/clients/transformer.rs b/crates/hermesllm/src/clients/transformer.rs
index b907c243..0ec06847 100644
--- a/crates/hermesllm/src/clients/transformer.rs
+++ b/crates/hermesllm/src/clients/transformer.rs
@@ -102,7 +102,7 @@ impl TryFrom<AnthropicMessagesRequest> for ChatCompletionsRequest {
            messages: openai_messages,
            temperature: req.temperature,
            top_p: req.top_p,
-            max_tokens: Some(req.max_tokens),
+            max_completion_tokens: Some(req.max_tokens),
            stream: req.stream,
            stop: req.stop_sequences,
            tools: openai_tools,
@@ -142,7 +142,9 @@ impl TryFrom<ChatCompletionsRequest> for AnthropicMessagesRequest {
            model: req.model,
            system: system_prompt,
            messages,
-            max_tokens: req.max_tokens.unwrap_or(DEFAULT_MAX_TOKENS),
+            max_tokens: req.max_completion_tokens
+                .or(req.max_tokens)
+                .unwrap_or(DEFAULT_MAX_TOKENS),
            container: None,
            mcp_servers: None,
            service_tier: None,
@@ -1079,7 +1081,7 @@ mod tests {
        assert_eq!(openai_req.model, "claude-3-sonnet-20240229");
        assert_eq!(openai_req.messages.len(), 2); // system + user message
-        assert_eq!(openai_req.max_tokens, Some(1024));
+        assert_eq!(openai_req.max_completion_tokens, Some(1024));
        assert_eq!(openai_req.temperature, Some(0.7));
        assert_eq!(openai_req.top_p, Some(0.9));
        assert_eq!(openai_req.stream, Some(false));
diff --git a/crates/hermesllm/src/providers/request.rs b/crates/hermesllm/src/providers/request.rs
index adde81f4..3603edf2 100644
--- a/crates/hermesllm/src/providers/request.rs
+++ b/crates/hermesllm/src/providers/request.rs
@@ -360,6 +360,9 @@ mod tests {
        assert_eq!(openai_req.model, openai_req2.model);
        assert_eq!(openai_req.messages[0].role, openai_req2.messages[0].role);
        assert_eq!(openai_req.messages[0].content.extract_text(), openai_req2.messages[0].content.extract_text());
-        assert_eq!(openai_req.max_tokens, openai_req2.max_tokens);
+        // After the roundtrip, the deprecated max_tokens should have been converted to max_completion_tokens
+        let original_max_tokens = openai_req.max_completion_tokens.or(openai_req.max_tokens);
+        let roundtrip_max_tokens = openai_req2.max_completion_tokens.or(openai_req2.max_tokens);
+        assert_eq!(original_max_tokens, roundtrip_max_tokens);
    }
}
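A note on the precedence the `TryFrom` conversions above implement: the newer `max_completion_tokens` field wins, the deprecated `max_tokens` is the fallback, and `DEFAULT_MAX_TOKENS` is the last resort. A minimal standalone sketch of that logic; the function name and `u32` types here are illustrative, not the crate's actual signatures:

```rust
/// Sketch of the token-limit precedence used in the conversions above:
/// prefer the new field, fall back to the deprecated one, else a default.
fn effective_max_tokens(
    max_completion_tokens: Option<u32>,
    max_tokens: Option<u32>,
    default: u32,
) -> u32 {
    max_completion_tokens.or(max_tokens).unwrap_or(default)
}

fn main() {
    // The new field wins when both are set.
    assert_eq!(effective_max_tokens(Some(500), Some(50), 1024), 500);
    // The deprecated field is still honored when it is the only one set.
    assert_eq!(effective_max_tokens(None, Some(50), 1024), 50);
    // Neither set: fall back to the default.
    assert_eq!(effective_max_tokens(None, None, 1024), 1024);
}
```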
diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs
index b1651e0c..12054ccb 100644
--- a/crates/llm_gateway/src/stream_context.rs
+++ b/crates/llm_gateway/src/stream_context.rs
@@ -45,6 +45,8 @@ pub struct StreamContext {
    traces_queue: Arc>>,
    overrides: Rc>,
    user_message: Option,
+    /// Store upstream response status code to handle error responses gracefully
+    upstream_status_code: Option<StatusCode>,
}

impl StreamContext {
@@ -72,6 +74,7 @@ impl StreamContext {
            traces_queue,
            request_body_sent_time: None,
            user_message: None,
+            upstream_status_code: None,
        }
    }

@@ -871,6 +874,19 @@ impl HttpContext for StreamContext {
    }

    fn on_http_response_headers(&mut self, _num_headers: usize, _end_of_stream: bool) -> Action {
+        // Capture the upstream response status code to handle errors appropriately
+        if let Some(status_str) = self.get_http_response_header(":status") {
+            if let Ok(status_code) = status_str.parse::<u16>() {
+                self.upstream_status_code = StatusCode::from_u16(status_code).ok();
+
+                info!(
+                    "[ARCHGW_REQ_ID:{}] UPSTREAM_RESPONSE_STATUS: {}",
+                    self.request_identifier(),
+                    status_code
+                );
+            }
+        }
+
        self.remove_http_response_header("content-length");
        self.remove_http_response_header("content-encoding");

@@ -888,6 +904,32 @@ impl HttpContext for StreamContext {
            return Action::Continue;
        }

+        // Check if this is an error response from upstream
+        if let Some(status_code) = &self.upstream_status_code {
+            if status_code.is_client_error() || status_code.is_server_error() {
+                info!(
+                    "[ARCHGW_REQ_ID:{}] UPSTREAM_ERROR_RESPONSE: status={} body_size={}",
+                    self.request_identifier(),
+                    status_code.as_u16(),
+                    body_size
+                );
+
+                // For error responses, forward the upstream error directly without parsing
+                if body_size > 0 {
+                    if let Ok(body) = self.read_raw_response_body(body_size) {
+                        debug!(
+                            "[ARCHGW_REQ_ID:{}] UPSTREAM_ERROR_BODY: {}",
+                            self.request_identifier(),
+                            String::from_utf8_lossy(&body)
+                        );
+                        // Forward the error response as-is
+                        self.set_http_response_body(0, body_size, &body);
+                    }
+                }
+                return Action::Continue;
+            }
+        }
+
        match self.client_api {
            Some(SupportedAPIs::OpenAIChatCompletions(_)) => {}
            Some(SupportedAPIs::AnthropicMessagesAPI(_)) => {}
diff --git a/demos/use_cases/model_alias_routing/arch_config_with_aliases.yaml b/demos/use_cases/model_alias_routing/arch_config_with_aliases.yaml
index 49245d55..c829b395 100644
--- a/demos/use_cases/model_alias_routing/arch_config_with_aliases.yaml
+++ b/demos/use_cases/model_alias_routing/arch_config_with_aliases.yaml
@@ -10,10 +10,13 @@ listeners:

llm_providers:
  # OpenAI Models
-  - model: openai/gpt-4o-mini
+  - model: openai/gpt-5-mini-2025-08-07
    access_key: $OPENAI_API_KEY
    default: true

+  - model: openai/gpt-4o-mini
+    access_key: $OPENAI_API_KEY
+
  - model: openai/o3
    access_key: $OPENAI_API_KEY

@@ -41,7 +44,7 @@ llm_providers:
model_aliases:
  # Alias for summarization tasks -> fast/cheap model
  arch.summarize.v1:
-    target: gpt-4o-mini
+    target: gpt-5-mini-2025-08-07

  # Alias for general purpose tasks -> latest model
  arch.v1:
@@ -61,7 +64,7 @@ model_aliases:
  # Semantic aliases
  summary-model:
-    target: gpt-4o-mini
+    target: gpt-5-mini-2025-08-07

  chat-model:
    target: llama3.1
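The filter change above boils down to two small steps: parse the `:status` pseudo-header into a status code, and short-circuit body transformation whenever that code is 4xx/5xx. A standalone sketch, assuming the `http` crate's `StatusCode` (which provides the `from_u16`, `is_client_error`, and `is_server_error` calls seen in the diff); the helper names are illustrative, since the real logic runs inline in the proxy-wasm callbacks:

```rust
use http::StatusCode;

/// Parse the ":status" pseudo-header value as captured in
/// on_http_response_headers above. Non-numeric or out-of-range values
/// yield None, mirroring `StatusCode::from_u16(..).ok()`.
fn parse_upstream_status(status_str: &str) -> Option<StatusCode> {
    status_str
        .parse::<u16>()
        .ok()
        .and_then(|code| StatusCode::from_u16(code).ok())
}

/// Decide whether to forward the upstream body untouched: any 4xx/5xx
/// is passed through as-is instead of being parsed as an LLM response.
fn should_forward_raw(status: StatusCode) -> bool {
    status.is_client_error() || status.is_server_error()
}

fn main() {
    let ok = parse_upstream_status("200").unwrap();
    let bad_request = parse_upstream_status("400").unwrap();
    assert!(!should_forward_raw(ok));
    assert!(should_forward_raw(bad_request));
    assert_eq!(parse_upstream_status("not-a-status"), None);
}
```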
diff --git a/tests/e2e/test_model_alias_routing.py b/tests/e2e/test_model_alias_routing.py
index c0f6f7c5..696507f2 100644
--- a/tests/e2e/test_model_alias_routing.py
+++ b/tests/e2e/test_model_alias_routing.py
@@ -33,8 +33,8 @@ def test_openai_client_with_alias_arch_summarize_v1():
    )

    completion = client.chat.completions.create(
-        model="arch.summarize.v1",  # This should resolve to 4o-mini
-        max_tokens=50,
+        model="arch.summarize.v1",  # This should resolve to gpt-5-mini
+        max_completion_tokens=500,  # Increased limit: gpt-5-mini spends part of the budget on reasoning tokens, so 50 would truncate the reply
        messages=[
            {
                "role": "user",
@@ -60,7 +60,7 @@ def test_openai_client_with_alias_arch_v1():

    completion = client.chat.completions.create(
        model="arch.v1",  # This should resolve to o3
-        max_tokens=50,
+        max_completion_tokens=500,
        messages=[
            {
                "role": "user",
@@ -82,8 +82,8 @@ def test_anthropic_client_with_alias_arch_summarize_v1():
    client = anthropic.Anthropic(api_key="test-key", base_url=base_url)

    message = client.messages.create(
-        model="arch.summarize.v1",  # This should resolve to 4o-mini
-        max_tokens=50,
+        model="arch.summarize.v1",  # This should resolve to gpt-5-mini
+        max_tokens=500,
        messages=[
            {
                "role": "user",
@@ -108,7 +108,7 @@ def test_anthropic_client_with_alias_arch_v1():

    message = client.messages.create(
        model="arch.v1",  # This should resolve to o3
-        max_tokens=50,
+        max_tokens=500,
        messages=[
            {
                "role": "user",
@@ -135,8 +135,8 @@ def test_openai_client_with_alias_streaming():
    )

    stream = client.chat.completions.create(
-        model="arch.summarize.v1",  # This should resolve to 4o-mini
-        max_tokens=50,
+        model="arch.summarize.v1",  # This should resolve to gpt-5-mini
+        max_completion_tokens=500,
        messages=[
            {
                "role": "user",
@@ -166,8 +166,8 @@ def test_anthropic_client_with_alias_streaming():
    client = anthropic.Anthropic(api_key="test-key", base_url=base_url)

    with client.messages.stream(
-        model="arch.summarize.v1",  # This should resolve to 4o-mini
-        max_tokens=50,
+        model="arch.summarize.v1",  # This should resolve to gpt-5-mini
+        max_tokens=500,
        messages=[
            {
                "role": "user",
@@ -184,6 +184,89 @@ def test_anthropic_client_with_alias_streaming():
    assert full_text == "Hello from streaming alias via Anthropic!"


+def test_400_error_handling_with_alias():
+    """Test that 400 errors from upstream are properly returned by archgw"""
+    logger.info(
+        "Testing 400 error handling with arch.summarize.v1 and an unsupported temperature value"
+    )
+
+    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
+    client = openai.OpenAI(
+        api_key="test-key",
+        base_url=f"{base_url}/v1",
+    )
+
+    try:
+        completion = client.chat.completions.create(
+            model="arch.summarize.v1",  # This should resolve to gpt-5-mini-2025-08-07
+            max_completion_tokens=50,
+            temperature=0.7,  # gpt-5 models only accept the default temperature, so this should trigger a 400 error
+            messages=[
+                {
+                    "role": "user",
+                    "content": "Hello, this should trigger a 400 error due to an unsupported parameter value",
+                }
+            ],
+        )
+        # If we reach here, the request unexpectedly succeeded
+        logger.error(
+            f"Expected 400 error but got successful response: {completion.choices[0].message.content}"
+        )
+        assert False, "Expected 400 error but request succeeded"
+    except openai.BadRequestError as e:
+        # This is what we expect - a 400 Bad Request error
+        logger.info(f"Correctly received 400 Bad Request error: {e}")
+        assert e.status_code == 400, f"Expected status code 400, got {e.status_code}"
+        logger.info("✓ 400 error handling working correctly")
+    except Exception as e:
+        # Any other exception is unexpected
+        logger.error(
+            f"Unexpected error type (should be BadRequestError): {type(e).__name__}: {e}"
+        )
+        assert False, f"Expected BadRequestError but got {type(e).__name__}: {e}"
+
+
+def test_400_error_handling_unsupported_parameter():
+    """Test that 400 errors for unsupported parameters are properly returned by archgw"""
+    logger.info("Testing 400 error handling with unsupported max_tokens parameter")
+
+    base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
+    client = openai.OpenAI(
+        api_key="test-key",
+        base_url=f"{base_url}/v1",
+    )
+
+    try:
+        # Use the deprecated max_tokens parameter, which should trigger a 400 error
+        completion = client.chat.completions.create(
+            model="arch.summarize.v1",  # This should resolve to gpt-5-mini-2025-08-07
+            max_tokens=150,  # Unsupported by newer models, which expect max_completion_tokens
+            messages=[
+                {
+                    "role": "user",
+                    "content": "Hello, this should trigger a 400 error due to the unsupported max_tokens parameter",
+                }
+            ],
+        )
+        # If we reach here, the request unexpectedly succeeded
+        logger.error(
+            f"Expected 400 error but got successful response: {completion.choices[0].message.content}"
+        )
+        assert False, "Expected 400 error but request succeeded"
+    except openai.BadRequestError as e:
+        # This is what we expect - a 400 Bad Request error
+        logger.info(f"Correctly received 400 Bad Request error: {e}")
+        assert e.status_code == 400, f"Expected status code 400, got {e.status_code}"
+        assert "max_tokens" in str(e), "Expected error message to mention max_tokens"
+        logger.info("✓ 400 error handling for unsupported parameters working correctly")
+    except Exception as e:
+        # Any other exception is unexpected
+        logger.error(
+            f"Unexpected error type (should be BadRequestError): {type(e).__name__}: {e}"
+        )
+        assert False, f"Expected BadRequestError but got {type(e).__name__}: {e}"
+
+
 def test_nonexistent_alias():
    """Test that using a non-existent alias falls back to treating it as a direct model name"""
    logger.info(
@@ -199,7 +282,7 @@ def test_nonexistent_alias():
    try:
        completion = client.chat.completions.create(
            model="nonexistent.alias",  # This alias doesn't exist
-            max_tokens=50,
+            max_completion_tokens=50,
            messages=[
                {
                    "role": "user",
@@ -231,8 +314,8 @@ def test_direct_model_4o_mini_openai():
    )

    completion = client.chat.completions.create(
-        model="4o-mini",  # Direct model name
-        max_tokens=50,
+        model="gpt-4o-mini",  # Direct model name
+        max_completion_tokens=50,
        messages=[
            {
                "role": "user",
@@ -254,7 +337,7 @@ def test_direct_model_4o_mini_anthropic():
    client = anthropic.Anthropic(api_key="test-key", base_url=base_url)

    message = client.messages.create(
-        model="4o-mini",  # Direct model name
+        model="gpt-4o-mini",  # Direct model name
        max_tokens=50,
        messages=[
            {
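The tests above observe the error passthrough via the OpenAI SDK; any HTTP client sees the same behavior once the gateway forwards the upstream status and body verbatim. A hypothetical Rust equivalent of the deprecated-parameter test; the gateway URL/port and the `reqwest` (with `blocking` and `json` features) and `serde_json` dependencies are assumptions, not part of this change:

```rust
use serde_json::json;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Hypothetical local gateway address; match it to your archgw listener.
    let url = "http://localhost:12000/v1/chat/completions";

    // Deliberately use the deprecated max_tokens field so that a model
    // expecting max_completion_tokens returns a 400, as in the e2e test.
    let body = json!({
        "model": "arch.summarize.v1",
        "max_tokens": 150,
        "messages": [{ "role": "user", "content": "Hello" }]
    });

    let resp = reqwest::blocking::Client::new()
        .post(url)
        .bearer_auth("test-key")
        .json(&body)
        .send()?;

    // With the stream_context.rs change, the upstream 400 status and error
    // body arrive here intact instead of failing to parse inside the filter.
    println!("status: {}", resp.status());
    println!("body: {}", resp.text()?);
    Ok(())
}
```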