diff --git a/crates/llm_gateway/src/filter_context.rs b/crates/llm_gateway/src/filter_context.rs index 4e44a9ff..56af01b5 100644 --- a/crates/llm_gateway/src/filter_context.rs +++ b/crates/llm_gateway/src/filter_context.rs @@ -9,7 +9,7 @@ use common::llm_providers::LlmProviders; use common::ratelimit; use common::stats::Gauge; use common::tracing::TraceData; -use log::debug; +use log::trace; use log::warn; use proxy_wasm::traits::*; use proxy_wasm::types::*; @@ -103,10 +103,8 @@ impl RootContext for FilterContext { fn on_tick(&mut self) { let _ = self.traces_queue.try_lock().map(|mut traces_queue| { while let Some(trace) = traces_queue.pop_front() { - debug!("trace received: {:?}", trace); - let trace_str = serde_json::to_string(&trace).unwrap(); - debug!("trace: {}", trace_str); + trace!("trace details: {}", trace_str); let call_args = CallArgs::new( OTEL_COLLECTOR_HTTP, OTEL_POST_PATH, @@ -139,7 +137,7 @@ impl Context for FilterContext { _body_size: usize, _num_trailers: usize, ) { - debug!( + trace!( "||| on_http_call_response called with token_id: {:?} |||", token_id ); @@ -151,7 +149,7 @@ impl Context for FilterContext { .expect("invalid token_id"); if let Some(status) = self.get_http_call_response_header(":status") { - debug!("trace response status: {:?}", status); + trace!("trace response status: {:?}", status); }; } } diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs index 817bcefb..a0714e80 100644 --- a/crates/llm_gateway/src/stream_context.rs +++ b/crates/llm_gateway/src/stream_context.rs @@ -153,7 +153,7 @@ impl StreamContext { self.metrics .input_sequence_length .record(token_count as u64); - log::debug!("Recorded input token count: {}", token_count); + trace!("Recorded input token count: {}", token_count); // Check if rate limiting needs to be applied. if let Some(selector) = self.ratelimit_selector.take() { @@ -164,7 +164,7 @@ impl StreamContext { NonZero::new(token_count as u32).unwrap(), )?; } else { - log::debug!("No rate limit applied for model: {}", model); + trace!("No rate limit applied for model: {}", model); } Ok(()) @@ -331,7 +331,7 @@ impl HttpContext for StreamContext { Ok(duration) => { // Convert the duration to milliseconds let duration_ms = duration.as_millis(); - debug!("Total latency: {} milliseconds", duration_ms); + debug!("request latency: {}ms", duration_ms); // Record the latency to the latency histogram self.metrics.request_latency.record(duration_ms as u64); @@ -339,11 +339,14 @@ impl HttpContext for StreamContext { // Compute the time per output token let tpot = duration_ms as u64 / self.response_tokens as u64; - debug!("Time per output token: {} milliseconds", tpot); // Record the time per output token self.metrics.time_per_output_token.record(tpot); - debug!("Tokens per second: {}", 1000 / tpot); + trace!( + "time per token: {}ms, tokens per second: {}", + tpot, + 1000 / tpot + ); // Record the tokens per second self.metrics.tokens_per_second.record(1000 / tpot); } @@ -500,7 +503,7 @@ impl HttpContext for StreamContext { match current_time.duration_since(self.start_time) { Ok(duration) => { let duration_ms = duration.as_millis(); - debug!("Time to First Token (TTFT): {} milliseconds", duration_ms); + debug!("time to first token: {}ms", duration_ms); self.ttft_duration = Some(duration); self.metrics.time_to_first_token.record(duration_ms as u64); } diff --git a/crates/prompt_gateway/src/stream_context.rs b/crates/prompt_gateway/src/stream_context.rs index 39d144f1..fc35877f 100644 --- a/crates/prompt_gateway/src/stream_context.rs +++ b/crates/prompt_gateway/src/stream_context.rs @@ -14,7 +14,7 @@ use common::http::{CallArgs, Client}; use common::stats::Gauge; use derivative::Derivative; use http::StatusCode; -use log::{debug, info, warn}; +use log::{debug, warn}; use proxy_wasm::traits::*; use serde_yaml::Value; use std::cell::RefCell; @@ -263,7 +263,7 @@ impl StreamContext { ); } - // update prompt target name from the tool call + // update prompt target name from the tool call response callout_context.prompt_target_name = Some(self.tool_calls.as_ref().unwrap()[0].function.name.clone()); @@ -364,7 +364,6 @@ impl StreamContext { let http_status = self .get_http_call_response_header(":status") .unwrap_or(StatusCode::OK.as_str().to_string()); - debug!("api_call_response_handler: http_status: {}", http_status); if http_status != StatusCode::OK.as_str() { warn!( "api server responded with non 2xx status code: {}", @@ -446,22 +445,20 @@ impl StreamContext { fn get_system_prompt(&self, prompt_target: Option) -> Option { match prompt_target { None => self.system_prompt.as_ref().clone(), - Some(prompt_target) => prompt_target.system_prompt, + Some(prompt_target) => match prompt_target.system_prompt { + None => self.system_prompt.as_ref().clone(), + Some(system_prompt) => Some(system_prompt), + }, } } - fn filter_out_arch_messages(&self, messages: &Vec) -> Vec { + fn filter_out_arch_messages(&self, messages: &[Message]) -> Vec { messages - .into_iter() + .iter() .filter(|m| { - if m.role == TOOL_ROLE + !(m.role == TOOL_ROLE || m.content.is_none() - || (m.tool_calls.is_some() && !m.tool_calls.as_ref().unwrap().is_empty()) - { - true - } else { - false - } + || (m.tool_calls.is_some() && !m.tool_calls.as_ref().unwrap().is_empty())) }) .cloned() .collect() @@ -470,7 +467,6 @@ impl StreamContext { fn construct_llm_messages(&mut self, callout_context: &StreamCallContext) -> Vec { let mut messages: Vec = Vec::new(); - info!("prompt target: {:?}", callout_context.prompt_target_name); // add system prompt let system_prompt = match callout_context.prompt_target_name.as_ref() { None => self.system_prompt.as_ref().clone(), @@ -479,8 +475,6 @@ impl StreamContext { } }; - info!("system_prompt: {:?}", system_prompt); - if system_prompt.is_some() { let system_prompt_message = Message { role: SYSTEM_ROLE.to_string(),