diff --git a/crates/llm_gateway/src/filter_context.rs b/crates/llm_gateway/src/filter_context.rs
index c518e244..23f30899 100644
--- a/crates/llm_gateway/src/filter_context.rs
+++ b/crates/llm_gateway/src/filter_context.rs
@@ -21,7 +21,7 @@ pub struct WasmMetrics {
     pub time_per_output_token: Histogram,
     pub latency: Histogram,
     pub output_sequence_length: Histogram,
-    // TODO: Add Input Sequence Length
+    pub input_sequence_length: Histogram,
 }
 
 impl WasmMetrics {
@@ -33,6 +33,7 @@ impl WasmMetrics {
             time_per_output_token: Histogram::new(String::from("time_per_output_token")),
             latency: Histogram::new(String::from("latency")),
             output_sequence_length: Histogram::new(String::from("output_sequence_length")),
+            input_sequence_length: Histogram::new(String::from("input_sequence_length")),
         }
     }
 }
diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs
index 298a9f68..8c3ce4c1 100644
--- a/crates/llm_gateway/src/stream_context.rs
+++ b/crates/llm_gateway/src/stream_context.rs
@@ -133,16 +133,46 @@ impl StreamContext {
         model: &str,
         json_string: &str,
     ) -> Result<(), ratelimit::Error> {
+        // Tokenize once; the count feeds both the metric and the rate limiter.
+        // A tokenizer failure is treated as 0 tokens.
+        let token_count = tokenizer::token_count(model, json_string).unwrap_or(0);
+
+        // Record the token count to metrics.
+        self.metrics
+            .input_sequence_length
+            .record(token_count as u64);
+        log::debug!("Recorded input token count: {}", token_count);
+
+        // Check if rate limiting needs to be applied.
         if let Some(selector) = self.ratelimit_selector.take() {
-            // Tokenize and Ratelimit.
-            if let Ok(token_count) = tokenizer::token_count(model, json_string) {
-                ratelimit::ratelimits(None).read().unwrap().check_limit(
-                    model.to_owned(),
-                    selector,
-                    NonZero::new(token_count as u32).unwrap(),
-                )?;
+            // NonZero::new yields None for 0 (tokenizer failure or empty input);
+            // skip the limiter in that case instead of panicking on unwrap().
+            if let Some(tokens) = NonZero::new(token_count as u32) {
+                log::debug!("Rate limiting applied for model: {}", model);
+                let result = ratelimit::ratelimits(None).read().unwrap().check_limit(
+                    model.to_owned(),
+                    selector,
+                    tokens,
+                );
+
+                match result {
+                    Ok(_) => log::debug!("Rate limit check passed for model: {}", model),
+                    Err(e) => {
+                        log::debug!(
+                            "Rate limit check failed for model: {} with error: {:?}",
+                            model,
+                            e
+                        );
+                        return Err(e);
+                    }
+                }
+            } else {
+                log::debug!("Skipping rate limit for model: {} (zero input tokens)", model);
             }
+        } else {
+            log::debug!("No rate limit applied for model: {}", model);
         }
+
         Ok(())
     }
 }