update stats to output input_sequence_length Histogram

Changes the enforce_ratelimit function to compute the token count
regardless of whether a ratelimit is configured, allowing the metric to
be recorded. This is essentially the token count of what is sent to
OpenAI, which is not the same as the tokens sent by the user, so rather
than describing usage statistics, it's more relevant to price or cost.
Not yet sure if this is the best way to go, but I'll use it for now.
This commit is contained in:
aayushwhiz 2024-11-07 15:32:28 -08:00
parent 8fb5c4eceb
commit cb8e2a772b
2 changed files with 32 additions and 8 deletions

View file

@ -21,7 +21,7 @@ pub struct WasmMetrics {
pub time_per_output_token: Histogram,
pub latency: Histogram,
pub output_sequence_length: Histogram,
// TODO: Add Input Sequence Length
pub input_sequence_length: Histogram,
}
impl WasmMetrics {
@ -33,6 +33,7 @@ impl WasmMetrics {
time_per_output_token: Histogram::new(String::from("time_per_output_token")),
latency: Histogram::new(String::from("latency")),
output_sequence_length: Histogram::new(String::from("output_sequence_length")),
input_sequence_length: Histogram::new(String::from("input_sequence_length")),
}
}
}

View file

@ -133,16 +133,39 @@ impl StreamContext {
model: &str,
json_string: &str,
) -> Result<(), ratelimit::Error> {
// Tokenize and record token count.
let token_count = tokenizer::token_count(model, json_string).unwrap_or(0);
// Record the token count to metrics.
self.metrics
.input_sequence_length
.record(token_count as u64);
log::debug!("Recorded input token count: {}", token_count);
// Check if rate limiting needs to be applied.
if let Some(selector) = self.ratelimit_selector.take() {
// Tokenize and Ratelimit.
if let Ok(token_count) = tokenizer::token_count(model, json_string) {
ratelimit::ratelimits(None).read().unwrap().check_limit(
model.to_owned(),
selector,
NonZero::new(token_count as u32).unwrap(),
)?;
log::debug!("Rate limiting applied for model: {}", model);
let result = ratelimit::ratelimits(None).read().unwrap().check_limit(
model.to_owned(),
selector,
NonZero::new(token_count as u32).unwrap(),
);
match result {
Ok(_) => log::debug!("Rate limit check passed for model: {}", model),
Err(e) => {
log::debug!(
"Rate limit check failed for model: {} with error: {:?}",
model,
e
);
return Err(e);
}
}
} else {
log::debug!("No rate limit applied for model: {}", model);
}
Ok(())
}
}