diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs index 6fc4e2d4..66470253 100644 --- a/crates/llm_gateway/src/stream_context.rs +++ b/crates/llm_gateway/src/stream_context.rs @@ -196,20 +196,22 @@ impl HttpContext for StreamContext { // Envoy's HTTP model is event driven. The WASM ABI has given implementors events to hook onto // the lifecycle of the http request and response. fn on_http_request_headers(&mut self, _num_headers: usize, _end_of_stream: bool) -> Action { + // debug!("headers: {:?}", self.get_http_request_headers()); let request_path = self.get_http_request_header(":path").unwrap_or_default(); if request_path == HEALTHZ_PATH { self.send_http_response(200, vec![], None); return Action::Continue; } - let routing_header_value = self.get_http_request_header(ARCH_ROUTING_HEADER); - let use_agent_orchestrator = match self.overrides.as_ref() { Some(overrides) => overrides.use_agent_orchestrator.unwrap_or_default(), None => false, }; - if let Some(routing_header_value) = routing_header_value.as_ref() { + let routing_header_value = self.get_http_request_header(ARCH_ROUTING_HEADER); + + if routing_header_value.is_some() && !routing_header_value.as_ref().unwrap().is_empty() { + let routing_header_value = routing_header_value.as_ref().unwrap(); info!("routing header already set: {}", routing_header_value); self.llm_provider = Some(Rc::new(LlmProvider { name: routing_header_value.to_string(), @@ -386,6 +388,8 @@ impl HttpContext for StreamContext { } }; + // trace!("on_http_request_body: update request body to: {}, len: {}", String::from_utf8_lossy(&deserialized_body_bytes), deserialized_body_bytes.len()); + self.set_http_request_body(0, body_size, &deserialized_body_bytes); Action::Continue diff --git a/crates/llm_gateway/tests/integration.rs b/crates/llm_gateway/tests/integration.rs index ccd4bb4c..d6cf001f 100644 --- a/crates/llm_gateway/tests/integration.rs +++ b/crates/llm_gateway/tests/integration.rs @@ -202,20 +202,7 @@ fn llm_gateway_successful_request_to_open_ai_chat_completions() { request_headers_expectations(&mut module, http_context); // Request Body - let chat_completions_request_body = "\ - {\ - \"messages\": [\ - {\ - \"role\": \"system\",\ - \"content\": \"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.\"\ - },\ - {\ - \"role\": \"user\",\ - \"content\": \"Compose a poem.\"\ - }\ - ],\ - \"model\": \"gpt-4\"\ - }"; + let chat_completions_request_body = r#"{"model":"gpt-4","messages":[{"role":"system","content":"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."},{"role":"user","content":"Compose a poem."}]}"#; module .call_proxy_on_request_body( @@ -229,7 +216,6 @@ fn llm_gateway_successful_request_to_open_ai_chat_completions() { .expect_log(Some(LogLevel::Info), None) .expect_log(Some(LogLevel::Debug), None) .expect_log(Some(LogLevel::Debug), None) - .expect_log(Some(LogLevel::Debug), None) .expect_metric_record("input_sequence_length", 21) .expect_log(Some(LogLevel::Debug), None) .expect_log(Some(LogLevel::Debug), None) @@ -268,18 +254,7 @@ fn llm_gateway_bad_request_to_open_ai_chat_completions() { request_headers_expectations(&mut module, http_context); // Request Body - let incomplete_chat_completions_request_body = "\ - {\ - \"messages\": [\ - {\ - \"role\": \"system\"\ - },\ - {\ - \"role\": \"user\",\ - \"content\": \"Compose a poem that explains the concept of recursion in programming.\"\ - }\ - ]\ - }"; + let incomplete_chat_completions_request_body = r#"{"model":"gpt-1","messages":[{"role":"system","content":"Compose a poem that explains the concept of recursion in programming."}]}"#; module .call_proxy_on_request_body( @@ -290,7 +265,7 @@ fn llm_gateway_bad_request_to_open_ai_chat_completions() { .expect_get_buffer_bytes(Some(BufferType::HttpRequestBody)) .returning(Some(incomplete_chat_completions_request_body)) .expect_log(Some(LogLevel::Debug), None) - .expect_log(Some(LogLevel::Info), Some("on_http_request_body: provider: open-ai-gpt-4, model requested: , model selected: gpt-4")) + .expect_log(Some(LogLevel::Info), Some("on_http_request_body: provider: open-ai-gpt-4, model requested: gpt-1, model selected: gpt-4")) .expect_send_local_response( Some(StatusCode::BAD_REQUEST.as_u16().into()), None, @@ -300,8 +275,7 @@ fn llm_gateway_bad_request_to_open_ai_chat_completions() { .expect_log(Some(LogLevel::Debug), None) .expect_log(Some(LogLevel::Debug), None) .expect_log(Some(LogLevel::Debug), None) - .expect_metric_record("input_sequence_length", 14) - .expect_log(Some(LogLevel::Debug), None) + .expect_metric_record("input_sequence_length", 13) .expect_log(Some(LogLevel::Debug), None) .execute_and_expect(ReturnType::Action(Action::Continue)) .unwrap(); @@ -359,11 +333,10 @@ fn llm_gateway_request_ratelimited() { .expect_log(Some(LogLevel::Info), None) .expect_log(Some(LogLevel::Debug), None) .expect_log(Some(LogLevel::Debug), None) - .expect_log(Some(LogLevel::Debug), None) .expect_metric_record("input_sequence_length", 107) + .expect_log(Some(LogLevel::Debug), Some("Applying ratelimit for model: gpt-4")) .expect_log(Some(LogLevel::Debug), None) - .expect_log(Some(LogLevel::Debug), None) - .expect_log(Some(LogLevel::Warn), Some("server error occurred: exceeded limit provider=gpt-4, selector=Header { key: \"selector-key\", value: \"selector-value\" }, tokens_used=107")) + .expect_log(Some(LogLevel::Warn), Some(r#"server error occurred: exceeded limit provider=gpt-4, selector=Header { key: "selector-key", value: "selector-value" }, tokens_used=107"#)) .expect_send_local_response( Some(StatusCode::TOO_MANY_REQUESTS.as_u16().into()), None, @@ -399,20 +372,7 @@ fn llm_gateway_request_not_ratelimited() { normal_flow(&mut module, filter_context, http_context); // give shorter body to avoid rate limiting - let chat_completions_request_body = "\ -{\ - \"messages\": [\ - {\ - \"role\": \"system\",\ - \"content\": \"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.\"\ - },\ - {\ - \"role\": \"user\",\ - \"content\": \"Compose a poem that explains the concept of recursion in programming.\"\ - }\ - ],\ - \"model\": \"gpt-4\"\ -}"; + let chat_completions_request_body = r#"{"model":"gpt-1","messages":[{"role":"system","content":"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."},{"role":"user","content":"Compose a poem that explains the concept of recursion in programming."}]}"#; module .call_proxy_on_request_body( @@ -427,7 +387,6 @@ fn llm_gateway_request_not_ratelimited() { .expect_log(Some(LogLevel::Info), None) .expect_log(Some(LogLevel::Debug), None) .expect_log(Some(LogLevel::Debug), None) - .expect_log(Some(LogLevel::Debug), None) .expect_metric_record("input_sequence_length", 29) .expect_log(Some(LogLevel::Debug), None) .expect_log(Some(LogLevel::Debug), None) @@ -460,20 +419,7 @@ fn llm_gateway_override_model_name() { normal_flow(&mut module, filter_context, http_context); // give shorter body to avoid rate limiting - let chat_completions_request_body = "\ -{\ - \"model\": \"o1-mini\",\ - \"messages\": [\ - {\ - \"role\": \"system\",\ - \"content\": \"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.\"\ - },\ - {\ - \"role\": \"user\",\ - \"content\": \"Compose a poem that explains the concept of recursion in programming.\"\ - }\ - ] -}"; + let chat_completions_request_body = r#"{"model":"gpt-1","messages":[{"role":"system","content":"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."},{"role":"user","content":"Compose a poem that explains the concept of recursion in programming."}]}"#; module .call_proxy_on_request_body( @@ -485,8 +431,7 @@ fn llm_gateway_override_model_name() { .returning(Some(chat_completions_request_body)) // The actual call is not important in this test, we just need to grab the token_id .expect_log(Some(LogLevel::Debug), None) - .expect_log(Some(LogLevel::Info), Some("on_http_request_body: provider: open-ai-gpt-4, model requested: o1-mini, model selected: gpt-4")) - .expect_log(Some(LogLevel::Debug), None) + .expect_log(Some(LogLevel::Info), Some("on_http_request_body: provider: open-ai-gpt-4, model requested: gpt-1, model selected: gpt-4")) .expect_log(Some(LogLevel::Debug), None) .expect_log(Some(LogLevel::Debug), None) .expect_metric_record("input_sequence_length", 29) @@ -521,19 +466,7 @@ fn llm_gateway_override_use_default_model() { normal_flow(&mut module, filter_context, http_context); // give shorter body to avoid rate limiting - let chat_completions_request_body = "\ -{\ - \"messages\": [\ - {\ - \"role\": \"system\",\ - \"content\": \"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.\"\ - },\ - {\ - \"role\": \"user\",\ - \"content\": \"Compose a poem that explains the concept of recursion in programming.\"\ - }\ - ] -}"; + let chat_completions_request_body = r#"{"model":"gpt-1","messages":[{"role":"system","content":"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."},{"role":"user","content":"Compose a poem that explains the concept of recursion in programming."}]}"#; module .call_proxy_on_request_body( @@ -547,14 +480,13 @@ fn llm_gateway_override_use_default_model() { // The actual call is not important in this test, we just need to grab the token_id .expect_log( Some(LogLevel::Info), - Some("on_http_request_body: provider: open-ai-gpt-4, model requested: , model selected: gpt-4"), + Some("on_http_request_body: provider: open-ai-gpt-4, model requested: gpt-1, model selected: gpt-4"), ) .expect_log(Some(LogLevel::Debug), None) .expect_log(Some(LogLevel::Debug), None) - .expect_log(Some(LogLevel::Debug), None) .expect_metric_record("input_sequence_length", 29) - .expect_log(Some(LogLevel::Debug), None) - .expect_log(Some(LogLevel::Debug), None) + .expect_log(Some(LogLevel::Debug), Some("Applying ratelimit for model: gpt-4")) + .expect_log(Some(LogLevel::Debug), Some(r#"Checking limit for provider=gpt-4, with selector=Header { key: "selector-key", value: "selector-value" }, consuming tokens=29"#)) .expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None) .execute_and_expect(ReturnType::Action(Action::Continue)) .unwrap(); @@ -584,20 +516,7 @@ fn llm_gateway_override_use_model_name_none() { normal_flow(&mut module, filter_context, http_context); // give shorter body to avoid rate limiting - let chat_completions_request_body = "\ -{\ - \"model\": \"none\",\ - \"messages\": [\ - {\ - \"role\": \"system\",\ - \"content\": \"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.\"\ - },\ - {\ - \"role\": \"user\",\ - \"content\": \"Compose a poem that explains the concept of recursion in programming.\"\ - }\ - ] -}"; + let chat_completions_request_body = r#"{"model":"none","messages":[{"role":"system","content":"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."},{"role":"user","content":"Compose a poem that explains the concept of recursion in programming."}]}"#; module .call_proxy_on_request_body( @@ -615,7 +534,6 @@ fn llm_gateway_override_use_model_name_none() { .expect_metric_record("input_sequence_length", 29) .expect_log(Some(LogLevel::Debug), None) .expect_log(Some(LogLevel::Debug), None) - .expect_log(Some(LogLevel::Debug), None) .expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None) .execute_and_expect(ReturnType::Action(Action::Continue)) .unwrap();