From cb8e2a772b6ef14891974e24e958a919d03a3f74 Mon Sep 17 00:00:00 2001
From: aayushwhiz <aayushwhiz@gmail.com>
Date: Thu, 7 Nov 2024 15:32:28 -0800
Subject: [PATCH] update stats to output input_sequence_length Histogram

Changes the enforce_ratelimit function to compute the token count
regardless of whether a ratelimit is configured, so that the metric can
always be recorded. This is essentially the token count of what is sent
to OpenAI, which is not the same as the tokens sent by the user, so
rather than describing usage statistics it is more relevant to price or
cost. Not yet sure if this is the best way to go, but I'll use it for now.
---
 crates/llm_gateway/src/filter_context.rs |  3 +-
 crates/llm_gateway/src/stream_context.rs | 37 +++++++++++++++++++-----
 2 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/crates/llm_gateway/src/filter_context.rs b/crates/llm_gateway/src/filter_context.rs
index c518e244..23f30899 100644
--- a/crates/llm_gateway/src/filter_context.rs
+++ b/crates/llm_gateway/src/filter_context.rs
@@ -21,7 +21,7 @@ pub struct WasmMetrics {
     pub time_per_output_token: Histogram,
     pub latency: Histogram,
     pub output_sequence_length: Histogram,
-    // TODO: Add Input Sequence Length
+    pub input_sequence_length: Histogram,
 }
 
 impl WasmMetrics {
@@ -33,6 +33,7 @@ impl WasmMetrics {
             time_per_output_token: Histogram::new(String::from("time_per_output_token")),
             latency: Histogram::new(String::from("latency")),
             output_sequence_length: Histogram::new(String::from("output_sequence_length")),
+            input_sequence_length: Histogram::new(String::from("input_sequence_length")),
         }
     }
 }
diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs
index 298a9f68..8c3ce4c1 100644
--- a/crates/llm_gateway/src/stream_context.rs
+++ b/crates/llm_gateway/src/stream_context.rs
@@ -133,16 +133,39 @@ impl StreamContext {
         model: &str,
         json_string: &str,
     ) -> Result<(), ratelimit::Error> {
+        // Tokenize and record token count.
+        let token_count = tokenizer::token_count(model, json_string).unwrap_or(0);
+
+        // Record the token count to metrics.
+        self.metrics
+            .input_sequence_length
+            .record(token_count as u64);
+        log::debug!("Recorded input token count: {}", token_count);
+
+        // Check if rate limiting needs to be applied.
         if let Some(selector) = self.ratelimit_selector.take() {
-            // Tokenize and Ratelimit.
-            if let Ok(token_count) = tokenizer::token_count(model, json_string) {
-                ratelimit::ratelimits(None).read().unwrap().check_limit(
-                    model.to_owned(),
-                    selector,
-                    NonZero::new(token_count as u32).unwrap(),
-                )?;
+            log::debug!("Rate limiting applied for model: {}", model);
+            let result = ratelimit::ratelimits(None).read().unwrap().check_limit(
+                model.to_owned(),
+                selector,
+                NonZero::new(token_count as u32).unwrap(),
+            );
+
+            match result {
+                Ok(_) => log::debug!("Rate limit check passed for model: {}", model),
+                Err(e) => {
+                    log::debug!(
+                        "Rate limit check failed for model: {} with error: {:?}",
+                        model,
+                        e
+                    );
+                    return Err(e);
+                }
             }
+        } else {
+            log::debug!("No rate limit applied for model: {}", model);
         }
+
         Ok(())
     }
 }