diff --git a/config/plano_config_schema.yaml b/config/plano_config_schema.yaml
index 5190fecf..f7817a09 100644
--- a/config/plano_config_schema.yaml
+++ b/config/plano_config_schema.yaml
@@ -285,6 +285,9 @@ properties:
   agent_orchestration_model:
     type: string
     description: "Model name for the agent orchestrator (e.g., 'Plano-Orchestrator'). Must match a model in model_providers."
+  enable_token_counting:
+    type: boolean
+    description: "Enable tiktoken-based input token counting for metrics and rate limiting. Default is false."
   system_prompt:
     type: string
   prompt_targets:
diff --git a/crates/common/src/configuration.rs b/crates/common/src/configuration.rs
index df179059..2d7a7f22 100644
--- a/crates/common/src/configuration.rs
+++ b/crates/common/src/configuration.rs
@@ -131,6 +131,7 @@ pub struct Overrides {
     pub use_agent_orchestrator: Option<bool>,
     pub llm_routing_model: Option<String>,
     pub agent_orchestration_model: Option<String>,
+    pub enable_token_counting: Option<bool>,
 }

 #[derive(Debug, Clone, Serialize, Deserialize, Default)]
diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs
index f62631fa..20b8d3e6 100644
--- a/crates/llm_gateway/src/stream_context.rs
+++ b/crates/llm_gateway/src/stream_context.rs
@@ -48,7 +48,7 @@ pub struct StreamContext {
     ttft_time: Option<Duration>,
     traceparent: Option<String>,
     request_body_sent_time: Option<SystemTime>,
-    _overrides: Rc<Option<Overrides>>,
+    overrides: Rc<Option<Overrides>>,
     user_message: Option<String>,
     upstream_status_code: Option<String>,
     binary_frame_decoder: Option<Vec<u8>>,
@@ -66,7 +66,7 @@ impl StreamContext {
     ) -> Self {
         StreamContext {
             metrics,
-            _overrides: overrides,
+            overrides,
             ratelimit_selector: None,
             streaming_response: false,
             response_tokens: 0,
@@ -269,22 +269,29 @@
         model: &str,
         json_string: &str,
     ) -> Result<(), ratelimit::Error> {
-        // Tokenize and record token count.
-        let token_count = tokenizer::token_count(model, json_string).unwrap_or(0);
+        let use_tiktoken = (*self.overrides)
+            .as_ref()
+            .and_then(|o| o.enable_token_counting)
+            .unwrap_or(false);
+
+        let token_count = if use_tiktoken {
+            tokenizer::token_count(model, json_string).unwrap_or(0)
+        } else {
+            json_string.len() / 4
+        };

         debug!(
-            "request_id={}: token count, model='{}' input_tokens={}",
+            "request_id={}: token count, model='{}' input_tokens={} method={}",
             self.request_identifier(),
             model,
-            token_count
+            token_count,
+            if use_tiktoken { "tiktoken" } else { "estimate" }
         );

-        // Record the token count to metrics.
         self.metrics
             .input_sequence_length
             .record(token_count as u64);

-        // Check if rate limiting needs to be applied.
         if let Some(selector) = self.ratelimit_selector.take() {
             info!(
                 "request_id={}: ratelimit check, model='{}' selector='{}:{}'",
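
Usage note: tiktoken-based counting stays opt-in, so existing deployments keep the cheap estimate by default. A minimal config sketch for enabling it, assuming the key sits at the top level of the gateway config next to system_prompt as the schema hunk above suggests (the neighboring key and its value are illustrative, not taken from a real config):

    # Enable exact tiktoken counting of input tokens (hypothetical surrounding config).
    enable_token_counting: true
    system_prompt: "You are a helpful assistant."

With the flag off, the gateway falls back to json_string.len() / 4; one token per four bytes is a common rough rule of thumb for BPE tokenizers on English text, trading accuracy for skipping tokenizer CPU work on the request hot path while still populating the input_sequence_length metric and the rate-limit check.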