From cb8e2a772b6ef14891974e24e958a919d03a3f74 Mon Sep 17 00:00:00 2001 From: aayushwhiz Date: Thu, 7 Nov 2024 15:32:28 -0800 Subject: [PATCH] update stats to output input_sequence_length Histogram Changes the enforce_ratelimit function to compute the token count regardless of whether a ratelimit is configured, allowing the metric to be recorded. This is essentially the token count of what is sent to OpenAI, which is not the same as the tokens sent by the user, so rather than info about usage statistics, it's more relevant to price or cost. Not yet sure if this is the best way to go, but I'll use it for now. --- crates/llm_gateway/src/filter_context.rs | 3 +- crates/llm_gateway/src/stream_context.rs | 37 +++++++++++++++++++----- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/crates/llm_gateway/src/filter_context.rs b/crates/llm_gateway/src/filter_context.rs index c518e244..23f30899 100644 --- a/crates/llm_gateway/src/filter_context.rs +++ b/crates/llm_gateway/src/filter_context.rs @@ -21,7 +21,7 @@ pub struct WasmMetrics { pub time_per_output_token: Histogram, pub latency: Histogram, pub output_sequence_length: Histogram, - // TODO: Add Input Sequence Length + pub input_sequence_length: Histogram, } impl WasmMetrics { @@ -33,6 +33,7 @@ impl WasmMetrics { time_per_output_token: Histogram::new(String::from("time_per_output_token")), latency: Histogram::new(String::from("latency")), output_sequence_length: Histogram::new(String::from("output_sequence_length")), + input_sequence_length: Histogram::new(String::from("input_sequence_length")), } } } diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs index 298a9f68..8c3ce4c1 100644 --- a/crates/llm_gateway/src/stream_context.rs +++ b/crates/llm_gateway/src/stream_context.rs @@ -133,16 +133,39 @@ impl StreamContext { model: &str, json_string: &str, ) -> Result<(), ratelimit::Error> { + // Tokenize and record token count. 
+ let token_count = tokenizer::token_count(model, json_string).unwrap_or(0); + + // Record the token count to metrics. + self.metrics + .input_sequence_length + .record(token_count as u64); + log::debug!("Recorded input token count: {}", token_count); + + // Check if rate limiting needs to be applied. if let Some(selector) = self.ratelimit_selector.take() { - // Tokenize and Ratelimit. - if let Ok(token_count) = tokenizer::token_count(model, json_string) { - ratelimit::ratelimits(None).read().unwrap().check_limit( - model.to_owned(), - selector, - NonZero::new(token_count as u32).unwrap(), - )?; + log::debug!("Rate limiting applied for model: {}", model); + let result = ratelimit::ratelimits(None).read().unwrap().check_limit( + model.to_owned(), + selector, + NonZero::new(token_count as u32).unwrap(), + ); + + match result { + Ok(_) => log::debug!("Rate limit check passed for model: {}", model), + Err(e) => { + log::debug!( + "Rate limit check failed for model: {} with error: {:?}", + model, + e + ); + return Err(e); + } } + } else { + log::debug!("No rate limit applied for model: {}", model); } + Ok(()) } }