diff --git a/crates/common/src/stats.rs b/crates/common/src/stats.rs index 9479fadf..59604f0b 100644 --- a/crates/common/src/stats.rs +++ b/crates/common/src/stats.rs @@ -2,7 +2,6 @@ use log::error; use proxy_wasm::hostcalls; use proxy_wasm::types::*; -#[allow(unused)] pub trait Metric { fn id(&self) -> u32; fn value(&self) -> Result { @@ -14,7 +13,6 @@ pub trait Metric { } } -#[allow(unused)] pub trait IncrementingMetric: Metric { fn increment(&self, offset: i64) { match hostcalls::increment_metric(self.id(), offset) { @@ -24,7 +22,6 @@ pub trait IncrementingMetric: Metric { } } -#[allow(unused)] pub trait RecordingMetric: Metric { fn record(&self, value: u64) { match hostcalls::record_metric(self.id(), value) { @@ -39,7 +36,6 @@ pub struct Counter { id: u32, } -#[allow(unused)] impl Counter { pub fn new(name: String) -> Counter { let returned_id = hostcalls::define_metric(MetricType::Counter, &name) @@ -85,7 +81,6 @@ pub struct Histogram { id: u32, } -#[allow(unused)] impl Histogram { pub fn new(name: String) -> Histogram { let returned_id = hostcalls::define_metric(MetricType::Histogram, &name) diff --git a/crates/llm_gateway/src/filter_context.rs b/crates/llm_gateway/src/filter_context.rs index b5b279de..16580063 100644 --- a/crates/llm_gateway/src/filter_context.rs +++ b/crates/llm_gateway/src/filter_context.rs @@ -1,3 +1,4 @@ +use crate::metrics::Metrics; use crate::stream_context::StreamContext; use common::configuration::Configuration; use common::consts::OTEL_COLLECTOR_HTTP; @@ -6,9 +7,7 @@ use common::http::CallArgs; use common::http::Client; use common::llm_providers::LlmProviders; use common::ratelimit; -use common::stats::Counter; use common::stats::Gauge; -use common::stats::Histogram; use common::tracing::TraceData; use log::debug; use log::warn; @@ -22,39 +21,12 @@ use std::time::Duration; use std::sync::{Arc, Mutex}; -#[derive(Copy, Clone, Debug)] -pub struct WasmMetrics { - pub active_http_calls: Gauge, - pub ratelimited_rq: Counter, - pub time_to_first_token: Histogram, - pub time_per_output_token: Histogram, - pub tokens_per_second: Histogram, - pub request_latency: Histogram, - pub output_sequence_length: Histogram, - pub input_sequence_length: Histogram, -} - -impl WasmMetrics { - fn new() -> WasmMetrics { - WasmMetrics { - active_http_calls: Gauge::new(String::from("active_http_calls")), - ratelimited_rq: Counter::new(String::from("ratelimited_rq")), - time_to_first_token: Histogram::new(String::from("time_to_first_token")), - time_per_output_token: Histogram::new(String::from("time_per_output_token")), - tokens_per_second: Histogram::new(String::from("tokens_per_second")), - request_latency: Histogram::new(String::from("request_latency")), - output_sequence_length: Histogram::new(String::from("output_sequence_length")), - input_sequence_length: Histogram::new(String::from("input_sequence_length")), - } - } -} - #[derive(Debug)] pub struct CallContext {} #[derive(Debug)] pub struct FilterContext { - metrics: Rc, + metrics: Rc, // callouts stores token_id to request mapping that we use during #on_http_call_response to match the response to the request. callouts: RefCell>, llm_providers: Option>, @@ -65,7 +37,7 @@ impl FilterContext { pub fn new() -> FilterContext { FilterContext { callouts: RefCell::new(HashMap::new()), - metrics: Rc::new(WasmMetrics::new()), + metrics: Rc::new(Metrics::new()), llm_providers: None, traces_queue: Arc::new(Mutex::new(VecDeque::new())), } diff --git a/crates/llm_gateway/src/lib.rs b/crates/llm_gateway/src/lib.rs index e2ad9025..f585ba0e 100644 --- a/crates/llm_gateway/src/lib.rs +++ b/crates/llm_gateway/src/lib.rs @@ -3,6 +3,7 @@ use proxy_wasm::traits::*; use proxy_wasm::types::*; mod filter_context; +mod metrics; mod stream_context; proxy_wasm::main! {{ diff --git a/crates/llm_gateway/src/metrics.rs b/crates/llm_gateway/src/metrics.rs new file mode 100644 index 00000000..58eb6a86 --- /dev/null +++ b/crates/llm_gateway/src/metrics.rs @@ -0,0 +1,28 @@ +use common::stats::{Counter, Gauge, Histogram}; + +#[derive(Copy, Clone, Debug)] +pub struct Metrics { + pub active_http_calls: Gauge, + pub ratelimited_rq: Counter, + pub time_to_first_token: Histogram, + pub time_per_output_token: Histogram, + pub tokens_per_second: Histogram, + pub request_latency: Histogram, + pub output_sequence_length: Histogram, + pub input_sequence_length: Histogram, +} + +impl Metrics { + pub fn new() -> Metrics { + Metrics { + active_http_calls: Gauge::new(String::from("active_http_calls")), + ratelimited_rq: Counter::new(String::from("ratelimited_rq")), + time_to_first_token: Histogram::new(String::from("time_to_first_token")), + time_per_output_token: Histogram::new(String::from("time_per_output_token")), + tokens_per_second: Histogram::new(String::from("tokens_per_second")), + request_latency: Histogram::new(String::from("request_latency")), + output_sequence_length: Histogram::new(String::from("output_sequence_length")), + input_sequence_length: Histogram::new(String::from("input_sequence_length")), + } + } +} diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs index 5d8669a2..f0b26ffe 100644 --- a/crates/llm_gateway/src/stream_context.rs +++ b/crates/llm_gateway/src/stream_context.rs @@ -1,4 +1,4 @@ -use crate::filter_context::WasmMetrics; +use crate::metrics::Metrics; use common::api::open_ai::{ ChatCompletionStreamResponseServerEvents, ChatCompletionsRequest, ChatCompletionsResponse, Message, StreamOptions, @@ -12,25 +12,23 @@ use common::errors::ServerError; use common::llm_providers::LlmProviders; use common::pii::obfuscate_auth_header; use common::ratelimit::Header; +use common::stats::{IncrementingMetric, RecordingMetric}; use common::tracing::{Event, Span, TraceData, Traceparent}; use common::{ratelimit, routing, tokenizer}; use http::StatusCode; use log::{debug, trace, warn}; +use proxy_wasm::hostcalls::get_current_time; use proxy_wasm::traits::*; use proxy_wasm::types::*; use std::collections::VecDeque; use std::num::NonZero; use std::rc::Rc; use std::sync::{Arc, Mutex}; - -use common::stats::{IncrementingMetric, RecordingMetric}; - -use proxy_wasm::hostcalls::get_current_time; use std::time::{Duration, SystemTime, UNIX_EPOCH}; pub struct StreamContext { context_id: u32, - metrics: Rc, + metrics: Rc, ratelimit_selector: Option
, streaming_response: bool, response_tokens: usize, @@ -50,7 +48,7 @@ pub struct StreamContext { impl StreamContext { pub fn new( context_id: u32, - metrics: Rc, + metrics: Rc, llm_providers: Rc, traces_queue: Arc>>, ) -> Self { diff --git a/crates/prompt_gateway/src/filter_context.rs b/crates/prompt_gateway/src/filter_context.rs index 966765fe..449be126 100644 --- a/crates/prompt_gateway/src/filter_context.rs +++ b/crates/prompt_gateway/src/filter_context.rs @@ -1,4 +1,5 @@ use crate::embeddings::EmbeddingType; +use crate::metrics::Metrics; use crate::stream_context::StreamContext; use common::configuration::{Configuration, Overrides, PromptGuards, PromptTarget, Tracing}; use common::consts::ARCH_UPSTREAM_HOST_HEADER; @@ -21,19 +22,6 @@ use std::collections::HashMap; use std::rc::Rc; use std::time::Duration; -#[derive(Copy, Clone, Debug)] -pub struct WasmMetrics { - pub active_http_calls: Gauge, -} - -impl WasmMetrics { - fn new() -> WasmMetrics { - WasmMetrics { - active_http_calls: Gauge::new(String::from("active_http_calls")), - } - } -} - pub type EmbeddingTypeMap = HashMap>; pub type EmbeddingsStore = HashMap; @@ -45,7 +33,7 @@ pub struct FilterCallContext { #[derive(Debug)] pub struct FilterContext { - metrics: Rc, + metrics: Rc, // callouts stores token_id to request mapping that we use during #on_http_call_response to match the response to the request. callouts: RefCell>, overrides: Rc>, @@ -62,7 +50,7 @@ impl FilterContext { pub fn new() -> FilterContext { FilterContext { callouts: RefCell::new(HashMap::new()), - metrics: Rc::new(WasmMetrics::new()), + metrics: Rc::new(Metrics::new()), system_prompt: Rc::new(None), prompt_targets: Rc::new(HashMap::new()), overrides: Rc::new(None), diff --git a/crates/prompt_gateway/src/lib.rs b/crates/prompt_gateway/src/lib.rs index 8fe85f75..9d828dac 100644 --- a/crates/prompt_gateway/src/lib.rs +++ b/crates/prompt_gateway/src/lib.rs @@ -6,6 +6,7 @@ mod context; mod embeddings; mod filter_context; mod http_context; +mod metrics; mod stream_context; proxy_wasm::main! {{ diff --git a/crates/prompt_gateway/src/metrics.rs b/crates/prompt_gateway/src/metrics.rs new file mode 100644 index 00000000..ff891636 --- /dev/null +++ b/crates/prompt_gateway/src/metrics.rs @@ -0,0 +1,14 @@ +use common::stats::Gauge; + +#[derive(Copy, Clone, Debug)] +pub struct Metrics { + pub active_http_calls: Gauge, +} + +impl Metrics { + pub fn new() -> Metrics { + Metrics { + active_http_calls: Gauge::new(String::from("active_http_calls")), + } + } +} diff --git a/crates/prompt_gateway/src/stream_context.rs b/crates/prompt_gateway/src/stream_context.rs index 3a4d2733..8671af63 100644 --- a/crates/prompt_gateway/src/stream_context.rs +++ b/crates/prompt_gateway/src/stream_context.rs @@ -1,5 +1,6 @@ use crate::embeddings::EmbeddingType; -use crate::filter_context::{EmbeddingsStore, WasmMetrics}; +use crate::filter_context::EmbeddingsStore; +use crate::metrics::Metrics; use acap::cos; use common::api::hallucination::{ extract_messages_for_hallucination, HallucinationClassificationRequest, @@ -67,7 +68,7 @@ pub struct StreamContext { pub prompt_targets: Rc>, pub embeddings_store: Option>, overrides: Rc>, - pub metrics: Rc, + pub metrics: Rc, pub callouts: RefCell>, pub context_id: u32, pub tool_calls: Option>, @@ -90,7 +91,7 @@ impl StreamContext { #[allow(clippy::too_many_arguments)] pub fn new( context_id: u32, - metrics: Rc, + metrics: Rc, system_prompt: Rc>, prompt_targets: Rc>, prompt_guards: Rc,