update stats to output input_sequence_length Histogram

Changes the enforce_ratelimit function to compute the token count regardless of whether a ratelimit is configured, so that the metric can always be recorded. The recorded value is essentially the token count of what is sent to OpenAI, which is not the same as the tokens sent by the user; so rather than describing usage statistics, it is more relevant to price or cost. Not yet sure if this is the best way to go, but I'll use it for now.
2026-06-17 15:25:17 +02:00 · 2024-11-07 15:32:28 -08:00 · 2024-11-07 15:32:28 -08:00 · cb8e2a772b
commit cb8e2a772b
parent 8fb5c4eceb
2 changed files with 32 additions and 8 deletions
--- a/crates/llm_gateway/src/filter_context.rs
+++ b/crates/llm_gateway/src/filter_context.rs
@ -21,7 +21,7 @@ pub struct WasmMetrics {
    pub time_per_output_token: Histogram,
    pub latency: Histogram,
    pub output_sequence_length: Histogram,
-    // TODO: Add Input Sequence Length
+    pub input_sequence_length: Histogram,
 }

 impl WasmMetrics {
@ -33,6 +33,7 @@ impl WasmMetrics {
            time_per_output_token: Histogram::new(String::from("time_per_output_token")),
            latency: Histogram::new(String::from("latency")),
            output_sequence_length: Histogram::new(String::from("output_sequence_length")),
+            input_sequence_length: Histogram::new(String::from("input_sequence_length")),
        }
    }
 }
--- a/crates/llm_gateway/src/stream_context.rs
+++ b/crates/llm_gateway/src/stream_context.rs
@ -133,16 +133,39 @@ impl StreamContext {
        model: &str,
        json_string: &str,
    ) -> Result<(), ratelimit::Error> {
+        // Tokenize and record token count.
+        let token_count = tokenizer::token_count(model, json_string).unwrap_or(0);
+
+        // Record the token count to metrics.
+        self.metrics
+            .input_sequence_length
+            .record(token_count as u64);
+        log::debug!("Recorded input token count: {}", token_count);
+
+        // Check if rate limiting needs to be applied.
        if let Some(selector) = self.ratelimit_selector.take() {
-            // Tokenize and Ratelimit.
-            if let Ok(token_count) = tokenizer::token_count(model, json_string) {
-                ratelimit::ratelimits(None).read().unwrap().check_limit(
-                    model.to_owned(),
-                    selector,
-                    NonZero::new(token_count as u32).unwrap(),
-                )?;
+            log::debug!("Rate limiting applied for model: {}", model);
+            let result = ratelimit::ratelimits(None).read().unwrap().check_limit(
+                model.to_owned(),
+                selector,
+                NonZero::new(token_count as u32).unwrap(),
+            );
+
+            match result {
+                Ok(_) => log::debug!("Rate limit check passed for model: {}", model),
+                Err(e) => {
+                    log::debug!(
+                        "Rate limit check failed for model: {} with error: {:?}",
+                        model,
+                        e
+                    );
+                    return Err(e);
+                }
            }
+        } else {
+            log::debug!("No rate limit applied for model: {}", model);
        }
+
        Ok(())
    }
 }