mirror of
https://github.com/katanemo/plano.git
synced 2026-06-29 15:49:40 +02:00
more changes
This commit is contained in:
parent
4ab7665c30
commit
e2d49fb3f2
3 changed files with 47 additions and 41 deletions
|
|
@ -9,7 +9,7 @@ use common::llm_providers::LlmProviders;
|
|||
use common::ratelimit;
|
||||
use common::stats::Gauge;
|
||||
use common::tracing::TraceData;
|
||||
use log::debug;
|
||||
use log::trace;
|
||||
use log::warn;
|
||||
use proxy_wasm::traits::*;
|
||||
use proxy_wasm::types::*;
|
||||
|
|
@ -103,10 +103,8 @@ impl RootContext for FilterContext {
|
|||
fn on_tick(&mut self) {
|
||||
let _ = self.traces_queue.try_lock().map(|mut traces_queue| {
|
||||
while let Some(trace) = traces_queue.pop_front() {
|
||||
debug!("trace received: {:?}", trace);
|
||||
|
||||
let trace_str = serde_json::to_string(&trace).unwrap();
|
||||
debug!("trace: {}", trace_str);
|
||||
trace!("trace details: {}", trace_str);
|
||||
let call_args = CallArgs::new(
|
||||
OTEL_COLLECTOR_HTTP,
|
||||
OTEL_POST_PATH,
|
||||
|
|
@ -139,7 +137,7 @@ impl Context for FilterContext {
|
|||
_body_size: usize,
|
||||
_num_trailers: usize,
|
||||
) {
|
||||
debug!(
|
||||
trace!(
|
||||
"||| on_http_call_response called with token_id: {:?} |||",
|
||||
token_id
|
||||
);
|
||||
|
|
@ -151,7 +149,7 @@ impl Context for FilterContext {
|
|||
.expect("invalid token_id");
|
||||
|
||||
if let Some(status) = self.get_http_call_response_header(":status") {
|
||||
debug!("trace response status: {:?}", status);
|
||||
trace!("trace response status: {:?}", status);
|
||||
};
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -153,7 +153,7 @@ impl StreamContext {
|
|||
self.metrics
|
||||
.input_sequence_length
|
||||
.record(token_count as u64);
|
||||
log::debug!("Recorded input token count: {}", token_count);
|
||||
trace!("Recorded input token count: {}", token_count);
|
||||
|
||||
// Check if rate limiting needs to be applied.
|
||||
if let Some(selector) = self.ratelimit_selector.take() {
|
||||
|
|
@ -164,7 +164,7 @@ impl StreamContext {
|
|||
NonZero::new(token_count as u32).unwrap(),
|
||||
)?;
|
||||
} else {
|
||||
log::debug!("No rate limit applied for model: {}", model);
|
||||
trace!("No rate limit applied for model: {}", model);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
|
@ -331,7 +331,7 @@ impl HttpContext for StreamContext {
|
|||
Ok(duration) => {
|
||||
// Convert the duration to milliseconds
|
||||
let duration_ms = duration.as_millis();
|
||||
debug!("Total latency: {} milliseconds", duration_ms);
|
||||
debug!("request latency: {}ms", duration_ms);
|
||||
// Record the latency to the latency histogram
|
||||
self.metrics.request_latency.record(duration_ms as u64);
|
||||
|
||||
|
|
@ -339,11 +339,14 @@ impl HttpContext for StreamContext {
|
|||
// Compute the time per output token
|
||||
let tpot = duration_ms as u64 / self.response_tokens as u64;
|
||||
|
||||
debug!("Time per output token: {} milliseconds", tpot);
|
||||
// Record the time per output token
|
||||
self.metrics.time_per_output_token.record(tpot);
|
||||
|
||||
debug!("Tokens per second: {}", 1000 / tpot);
|
||||
trace!(
|
||||
"time per token: {}ms, tokens per second: {}",
|
||||
tpot,
|
||||
1000 / tpot
|
||||
);
|
||||
// Record the tokens per second
|
||||
self.metrics.tokens_per_second.record(1000 / tpot);
|
||||
}
|
||||
|
|
@ -499,7 +502,7 @@ impl HttpContext for StreamContext {
|
|||
match current_time.duration_since(self.start_time) {
|
||||
Ok(duration) => {
|
||||
let duration_ms = duration.as_millis();
|
||||
debug!("Time to First Token (TTFT): {} milliseconds", duration_ms);
|
||||
debug!("time to first token: {}ms", duration_ms);
|
||||
self.ttft_duration = Some(duration);
|
||||
self.metrics.time_to_first_token.record(duration_ms as u64);
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue