Mirror of https://github.com/katanemo/plano.git (synced 2026-05-04 21:32:43 +02:00)
Fix max_tokens handling and properly process HTTP error codes such as 400 from upstream (#574)
Co-authored-by: Salman Paracha <salmanparacha@MacBook-Pro-257.local>
parent 7ce8d44d8e · commit 03c2cf6f0d
6 changed files with 157 additions and 23 deletions
@@ -203,9 +203,10 @@ pub async fn chat(
        }
    };

-   // copy over the headers from the original response
+   // copy over the headers and status code from the original response
    let response_headers = llm_response.headers().clone();
-   let mut response = Response::builder();
+   let upstream_status = llm_response.status();
+   let mut response = Response::builder().status(upstream_status);
    let headers = response.headers_mut().unwrap();
    for (header_name, header_value) in response_headers.iter() {
        headers.insert(header_name, header_value.clone());

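Taken on its own, the change above propagates the upstream status instead of letting the builder default to 200 OK. A minimal standalone sketch, assuming the `http` crate; `forward_upstream` is an illustrative helper, not part of the commit:

    use http::{Response, StatusCode};

    // Hypothetical helper (not in the commit): rebuild a response while keeping
    // the upstream status and headers intact.
    fn forward_upstream(upstream: Response<String>) -> Response<String> {
        let upstream_status = upstream.status();
        let response_headers = upstream.headers().clone();

        // Propagate the upstream status instead of defaulting to 200 OK.
        let mut response = Response::builder().status(upstream_status);
        let headers = response.headers_mut().unwrap();
        for (header_name, header_value) in response_headers.iter() {
            headers.insert(header_name, header_value.clone());
        }
        response.body(upstream.into_body()).expect("valid response")
    }

    fn main() {
        let upstream = Response::builder()
            .status(StatusCode::BAD_REQUEST)
            .header("content-type", "application/json")
            .body(r#"{"error":"Unsupported parameter: 'max_tokens'"}"#.to_string())
            .unwrap();
        // The 400 is preserved rather than being rewritten to 200.
        assert_eq!(forward_upstream(upstream).status(), StatusCode::BAD_REQUEST);
    }
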
@@ -102,7 +102,7 @@ impl TryFrom<AnthropicMessagesRequest> for ChatCompletionsRequest {
            messages: openai_messages,
            temperature: req.temperature,
            top_p: req.top_p,
-           max_tokens: Some(req.max_tokens),
+           max_completion_tokens: Some(req.max_tokens),
            stream: req.stream,
            stop: req.stop_sequences,
            tools: openai_tools,

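In this Anthropic-to-OpenAI direction, Anthropic's required `max_tokens` value now lands in the newer `max_completion_tokens` field instead of the deprecated `max_tokens`. A minimal sketch with simplified stand-in structs (the real `AnthropicMessagesRequest` and `ChatCompletionsRequest` types have many more fields):

    // Simplified stand-ins for illustration only.
    struct AnthropicReq { max_tokens: u32 }
    struct OpenAiReq { max_tokens: Option<u32>, max_completion_tokens: Option<u32> }

    fn convert(req: AnthropicReq) -> OpenAiReq {
        OpenAiReq {
            max_tokens: None,                            // deprecated field left unset
            max_completion_tokens: Some(req.max_tokens), // required Anthropic value carried over
        }
    }

    fn main() {
        let out = convert(AnthropicReq { max_tokens: 500 });
        assert_eq!(out.max_completion_tokens, Some(500));
        assert_eq!(out.max_tokens, None);
    }
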
@@ -142,7 +142,9 @@ impl TryFrom<ChatCompletionsRequest> for AnthropicMessagesRequest {
            model: req.model,
            system: system_prompt,
            messages,
-           max_tokens: req.max_tokens.unwrap_or(DEFAULT_MAX_TOKENS),
+           max_tokens: req.max_completion_tokens
+               .or(req.max_tokens)
+               .unwrap_or(DEFAULT_MAX_TOKENS),
            container: None,
            mcp_servers: None,
            service_tier: None,

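The reverse direction applies a precedence: `max_completion_tokens` wins, the deprecated `max_tokens` is still honored as a fallback, and only then does the default kick in. A minimal sketch of that precedence; the value of `DEFAULT_MAX_TOKENS` below is illustrative, not the constant defined in the codebase:

    // Illustrative default; the real DEFAULT_MAX_TOKENS may differ.
    const DEFAULT_MAX_TOKENS: u32 = 4096;

    fn effective_max_tokens(max_completion_tokens: Option<u32>, max_tokens: Option<u32>) -> u32 {
        // Prefer the newer field, fall back to the deprecated one, then the default.
        max_completion_tokens
            .or(max_tokens)
            .unwrap_or(DEFAULT_MAX_TOKENS)
    }

    fn main() {
        assert_eq!(effective_max_tokens(Some(500), Some(50)), 500); // newer field wins
        assert_eq!(effective_max_tokens(None, Some(50)), 50);       // deprecated field still honored
        assert_eq!(effective_max_tokens(None, None), 4096);         // neither set -> default
    }
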
@@ -1079,7 +1081,7 @@ mod tests {

        assert_eq!(openai_req.model, "claude-3-sonnet-20240229");
        assert_eq!(openai_req.messages.len(), 2); // system + user message
-       assert_eq!(openai_req.max_tokens, Some(1024));
+       assert_eq!(openai_req.max_completion_tokens, Some(1024));
        assert_eq!(openai_req.temperature, Some(0.7));
        assert_eq!(openai_req.top_p, Some(0.9));
        assert_eq!(openai_req.stream, Some(false));

@@ -360,6 +360,9 @@ mod tests {
        assert_eq!(openai_req.model, openai_req2.model);
        assert_eq!(openai_req.messages[0].role, openai_req2.messages[0].role);
        assert_eq!(openai_req.messages[0].content.extract_text(), openai_req2.messages[0].content.extract_text());
-       assert_eq!(openai_req.max_tokens, openai_req2.max_tokens);
+       // After roundtrip, deprecated max_tokens should be converted to max_completion_tokens
+       let original_max_tokens = openai_req.max_completion_tokens.or(openai_req.max_tokens);
+       let roundtrip_max_tokens = openai_req2.max_completion_tokens.or(openai_req2.max_tokens);
+       assert_eq!(original_max_tokens, roundtrip_max_tokens);
    }
}

@@ -45,6 +45,8 @@ pub struct StreamContext {
    traces_queue: Arc<Mutex<VecDeque<TraceData>>>,
    overrides: Rc<Option<Overrides>>,
    user_message: Option<String>,
+   /// Store upstream response status code to handle error responses gracefully
+   upstream_status_code: Option<StatusCode>,
}

impl StreamContext {

@@ -72,6 +74,7 @@ impl StreamContext {
            traces_queue,
            request_body_sent_time: None,
            user_message: None,
+           upstream_status_code: None,
        }
    }

@@ -871,6 +874,19 @@ impl HttpContext for StreamContext {
    }

    fn on_http_response_headers(&mut self, _num_headers: usize, _end_of_stream: bool) -> Action {
+       // Capture the upstream response status code to handle errors appropriately
+       if let Some(status_str) = self.get_http_response_header(":status") {
+           if let Ok(status_code) = status_str.parse::<u16>() {
+               self.upstream_status_code = StatusCode::from_u16(status_code).ok();
+
+               info!(
+                   "[ARCHGW_REQ_ID:{}] UPSTREAM_RESPONSE_STATUS: {}",
+                   self.request_identifier(),
+                   status_code
+               );
+           }
+       }
+
        self.remove_http_response_header("content-length");
        self.remove_http_response_header("content-encoding");

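The new header callback stores the upstream status by parsing the `:status` pseudo-header, which arrives as a string. The parse-and-store step in isolation, assuming `StatusCode` comes from the `http` crate; `parse_status` is a hypothetical helper, not part of the commit:

    use http::StatusCode;

    // Hypothetical helper: turn the ":status" pseudo-header into an Option<StatusCode>.
    fn parse_status(raw: Option<&str>) -> Option<StatusCode> {
        raw.and_then(|s| s.parse::<u16>().ok())
            .and_then(|code| StatusCode::from_u16(code).ok())
    }

    fn main() {
        assert_eq!(parse_status(Some("400")), Some(StatusCode::BAD_REQUEST));
        assert_eq!(parse_status(Some("not-a-code")), None); // malformed header: nothing stored
        assert_eq!(parse_status(None), None);               // header missing: nothing stored
    }
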
@@ -888,6 +904,32 @@ impl HttpContext for StreamContext {
            return Action::Continue;
        }

+       // Check if this is an error response from upstream
+       if let Some(status_code) = &self.upstream_status_code {
+           if status_code.is_client_error() || status_code.is_server_error() {
+               info!(
+                   "[ARCHGW_REQ_ID:{}] UPSTREAM_ERROR_RESPONSE: status={} body_size={}",
+                   self.request_identifier(),
+                   status_code.as_u16(),
+                   body_size
+               );
+
+               // For error responses, forward the upstream error directly without parsing
+               if body_size > 0 {
+                   if let Ok(body) = self.read_raw_response_body(body_size) {
+                       debug!(
+                           "[ARCHGW_REQ_ID:{}] UPSTREAM_ERROR_BODY: {}",
+                           self.request_identifier(),
+                           String::from_utf8_lossy(&body)
+                       );
+                       // Forward the error response as-is
+                       self.set_http_response_body(0, body_size, &body);
+                   }
+               }
+               return Action::Continue;
+           }
+       }
+
        match self.client_api {
            Some(SupportedAPIs::OpenAIChatCompletions(_)) => {}
            Some(SupportedAPIs::AnthropicMessagesAPI(_)) => {}

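The body callback then branches on that stored status: 4xx and 5xx responses are logged and forwarded verbatim, everything else goes through the normal parsing path. The decision itself can be sketched as follows, assuming the `http` crate's `StatusCode`; `should_forward_unparsed` is a hypothetical helper, not part of the commit:

    use http::StatusCode;

    // Hypothetical helper: should the response body be forwarded verbatim
    // (error path) instead of being parsed as a chat completion?
    fn should_forward_unparsed(status: Option<StatusCode>) -> bool {
        status.map_or(false, |s| s.is_client_error() || s.is_server_error())
    }

    fn main() {
        assert!(should_forward_unparsed(StatusCode::from_u16(400).ok()));   // 400 -> pass through
        assert!(should_forward_unparsed(StatusCode::from_u16(502).ok()));   // 502 -> pass through
        assert!(!should_forward_unparsed(StatusCode::from_u16(200).ok()));  // 200 -> parse normally
        assert!(!should_forward_unparsed(None));                            // unknown -> parse normally
    }
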
@ -10,10 +10,13 @@ listeners:
|
|||
llm_providers:
|
||||
|
||||
# OpenAI Models
|
||||
- model: openai/gpt-4o-mini
|
||||
- model: openai/gpt-5-mini-2025-08-07
|
||||
access_key: $OPENAI_API_KEY
|
||||
default: true
|
||||
|
||||
- model: openai/gpt-4o-mini
|
||||
access_key: $OPENAI_API_KEY
|
||||
|
||||
- model: openai/o3
|
||||
access_key: $OPENAI_API_KEY
|
||||
|
||||
|
|
@@ -41,7 +44,7 @@ llm_providers:
model_aliases:
  # Alias for summarization tasks -> fast/cheap model
  arch.summarize.v1:
-    target: gpt-4o-mini
+    target: gpt-5-mini-2025-08-07

  # Alias for general purpose tasks -> latest model
  arch.v1:

@@ -61,7 +64,7 @@ model_aliases:

  # Semantic aliases
  summary-model:
-    target: gpt-4o-mini
+    target: gpt-5-mini-2025-08-07

  chat-model:
    target: llama3.1

@@ -33,8 +33,8 @@ def test_openai_client_with_alias_arch_summarize_v1():
    )

    completion = client.chat.completions.create(
-        model="arch.summarize.v1",  # This should resolve to 4o-mini
-        max_tokens=50,
+        model="arch.summarize.v1",  # This should resolve to 5o-mini
+        max_completion_tokens=500,  # Increased token limit to avoid truncation and because the 5o-mini uses reasoning tokens
        messages=[
            {
                "role": "user",

@@ -60,7 +60,7 @@ def test_openai_client_with_alias_arch_v1():

    completion = client.chat.completions.create(
        model="arch.v1",  # This should resolve to gpt-o3
-        max_tokens=50,
+        max_completion_tokens=500,
        messages=[
            {
                "role": "user",

|
@ -82,8 +82,8 @@ def test_anthropic_client_with_alias_arch_summarize_v1():
|
|||
client = anthropic.Anthropic(api_key="test-key", base_url=base_url)
|
||||
|
||||
message = client.messages.create(
|
||||
model="arch.summarize.v1", # This should resolve to 4o-mini
|
||||
max_tokens=50,
|
||||
model="arch.summarize.v1", # This should resolve to 5o-mini
|
||||
max_tokens=500,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
|
|
@ -108,7 +108,7 @@ def test_anthropic_client_with_alias_arch_v1():
|
|||
|
||||
message = client.messages.create(
|
||||
model="arch.v1", # This should resolve to o3
|
||||
max_tokens=50,
|
||||
max_tokens=500,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
|
|
@ -135,8 +135,8 @@ def test_openai_client_with_alias_streaming():
|
|||
)
|
||||
|
||||
stream = client.chat.completions.create(
|
||||
model="arch.summarize.v1", # This should resolve to 4o-mini
|
||||
max_tokens=50,
|
||||
model="arch.summarize.v1", # This should resolve to 5o-mini
|
||||
max_completion_tokens=500,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
|
|
@ -166,8 +166,8 @@ def test_anthropic_client_with_alias_streaming():
|
|||
client = anthropic.Anthropic(api_key="test-key", base_url=base_url)
|
||||
|
||||
with client.messages.stream(
|
||||
model="arch.summarize.v1", # This should resolve to 4o-mini
|
||||
max_tokens=50,
|
||||
model="arch.summarize.v1", # This should resolve to 5o-mini
|
||||
max_tokens=500,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
|
|
@ -184,6 +184,89 @@ def test_anthropic_client_with_alias_streaming():
|
|||
assert full_text == "Hello from streaming alias via Anthropic!"
|
||||
|
||||
|
||||
def test_400_error_handling_with_alias():
|
||||
"""Test that 400 errors from upstream are properly returned by archgw"""
|
||||
logger.info(
|
||||
"Testing 400 error handling with arch.summarize.v1 and invalid parameter"
|
||||
)
|
||||
|
||||
base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
|
||||
client = openai.OpenAI(
|
||||
api_key="test-key",
|
||||
base_url=f"{base_url}/v1",
|
||||
)
|
||||
|
||||
try:
|
||||
completion = client.chat.completions.create(
|
||||
model="arch.summarize.v1", # This should resolve to gpt-5-mini-2025-08-07
|
||||
max_completion_tokens=50,
|
||||
temperature=0.7, # This is a typo - should be "temperature", which should trigger a 400 error
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Hello, this should trigger a 400 error due to invalid parameter name",
|
||||
}
|
||||
],
|
||||
)
|
||||
# If we reach here, the request unexpectedly succeeded
|
||||
logger.error(
|
||||
f"Expected 400 error but got successful response: {completion.choices[0].message.content}"
|
||||
)
|
||||
assert False, "Expected 400 error but request succeeded"
|
||||
except openai.BadRequestError as e:
|
||||
# This is what we expect - a 400 Bad Request error
|
||||
logger.info(f"Correctly received 400 Bad Request error: {e}")
|
||||
assert e.status_code == 400, f"Expected status code 400, got {e.status_code}"
|
||||
logger.info("✓ 400 error handling working correctly")
|
||||
except Exception as e:
|
||||
# Any other exception is unexpected
|
||||
logger.error(
|
||||
f"Unexpected error type (should be BadRequestError): {type(e).__name__}: {e}"
|
||||
)
|
||||
assert False, f"Expected BadRequestError but got {type(e).__name__}: {e}"
|
||||
|
||||
|
||||
def test_400_error_handling_unsupported_parameter():
|
||||
"""Test that 400 errors for unsupported parameters are properly returned by archgw"""
|
||||
logger.info("Testing 400 error handling with unsupported max_tokens parameter")
|
||||
|
||||
base_url = LLM_GATEWAY_ENDPOINT.replace("/v1/chat/completions", "")
|
||||
client = openai.OpenAI(
|
||||
api_key="test-key",
|
||||
base_url=f"{base_url}/v1",
|
||||
)
|
||||
|
||||
try:
|
||||
# Use the deprecated max_tokens parameter which should trigger a 400 error
|
||||
completion = client.chat.completions.create(
|
||||
model="arch.summarize.v1", # This should resolve to gpt-5-mini-2025-08-07
|
||||
max_tokens=150, # This parameter is unsupported for newer models, should use max_completion_tokens
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Hello, this should trigger a 400 error due to unsupported max_tokens parameter",
|
||||
}
|
||||
],
|
||||
)
|
||||
# If we reach here, the request unexpectedly succeeded
|
||||
logger.error(
|
||||
f"Expected 400 error but got successful response: {completion.choices[0].message.content}"
|
||||
)
|
||||
assert False, "Expected 400 error but request succeeded"
|
||||
except openai.BadRequestError as e:
|
||||
# This is what we expect - a 400 Bad Request error
|
||||
logger.info(f"Correctly received 400 Bad Request error: {e}")
|
||||
assert e.status_code == 400, f"Expected status code 400, got {e.status_code}"
|
||||
assert "max_tokens" in str(e), "Expected error message to mention max_tokens"
|
||||
logger.info("✓ 400 error handling for unsupported parameters working correctly")
|
||||
except Exception as e:
|
||||
# Any other exception is unexpected
|
||||
logger.error(
|
||||
f"Unexpected error type (should be BadRequestError): {type(e).__name__}: {e}"
|
||||
)
|
||||
assert False, f"Expected BadRequestError but got {type(e).__name__}: {e}"
|
||||
|
||||
|
||||
def test_nonexistent_alias():
|
||||
"""Test that using a non-existent alias falls back to treating it as a direct model name"""
|
||||
logger.info(
|
||||
|
|
@@ -199,7 +282,7 @@ def test_nonexistent_alias():
    try:
        completion = client.chat.completions.create(
            model="nonexistent.alias",  # This alias doesn't exist
-            max_tokens=50,
+            max_completion_tokens=50,
            messages=[
                {
                    "role": "user",

@@ -231,8 +314,8 @@ def test_direct_model_4o_mini_openai():
    )

    completion = client.chat.completions.create(
-        model="4o-mini",  # Direct model name
-        max_tokens=50,
+        model="gpt-4o-mini",  # Direct model name
+        max_completion_tokens=50,
        messages=[
            {
                "role": "user",

@@ -254,7 +337,7 @@ def test_direct_model_4o_mini_anthropic():
    client = anthropic.Anthropic(api_key="test-key", base_url=base_url)

    message = client.messages.create(
-        model="4o-mini",  # Direct model name
+        model="gpt-4o-mini",  # Direct model name
        max_tokens=50,
        messages=[
            {