make tiktoken token counting optional via enable_token_counting override

By default, use a cheap len/4 estimate for input token counting (metrics
and rate limiting). When enable_token_counting is set to true in the
overrides, use the tiktoken BPE for exact counts. This eliminates ~80ms
of per-request latency from tiktoken in the WASM filter while keeping
metrics and rate limiting functional.

Made-with: Cursor
This commit is contained in:
Adil Hafeez 2026-03-22 21:45:02 -07:00
parent 406fa92802
commit e5f3039924
3 changed files with 19 additions and 8 deletions

View file

@@ -285,6 +285,9 @@ properties:
agent_orchestration_model:
type: string
description: "Model name for the agent orchestrator (e.g., 'Plano-Orchestrator'). Must match a model in model_providers."
enable_token_counting:
type: boolean
description: "Enable tiktoken-based input token counting for metrics and rate limiting. Default is false."
system_prompt:
type: string
prompt_targets:

View file

@@ -131,6 +131,7 @@ pub struct Overrides {
pub use_agent_orchestrator: Option<bool>,
pub llm_routing_model: Option<String>,
pub agent_orchestration_model: Option<String>,
pub enable_token_counting: Option<bool>,
}
#[derive(Debug, Clone, Serialize, Deserialize, Default)]

View file

@@ -48,7 +48,7 @@ pub struct StreamContext {
ttft_time: Option<u128>,
traceparent: Option<String>,
request_body_sent_time: Option<u128>,
_overrides: Rc<Option<Overrides>>,
overrides: Rc<Option<Overrides>>,
user_message: Option<String>,
upstream_status_code: Option<StatusCode>,
binary_frame_decoder: Option<BedrockBinaryFrameDecoder<bytes::BytesMut>>,
@@ -66,7 +66,7 @@ impl StreamContext {
) -> Self {
StreamContext {
metrics,
_overrides: overrides,
overrides,
ratelimit_selector: None,
streaming_response: false,
response_tokens: 0,
@@ -269,22 +269,29 @@ impl StreamContext {
model: &str,
json_string: &str,
) -> Result<(), ratelimit::Error> {
// Tokenize and record token count.
let token_count = tokenizer::token_count(model, json_string).unwrap_or(0);
let use_tiktoken = (*self.overrides)
.as_ref()
.and_then(|o| o.enable_token_counting)
.unwrap_or(false);
let token_count = if use_tiktoken {
tokenizer::token_count(model, json_string).unwrap_or(0)
} else {
json_string.len() / 4
};
debug!(
"request_id={}: token count, model='{}' input_tokens={}",
"request_id={}: token count, model='{}' input_tokens={} method={}",
self.request_identifier(),
model,
token_count
token_count,
if use_tiktoken { "tiktoken" } else { "estimate" }
);
// Record the token count to metrics.
self.metrics
.input_sequence_length
.record(token_count as u64);
// Check if rate limiting needs to be applied.
if let Some(selector) = self.ratelimit_selector.take() {
info!(
"request_id={}: ratelimit check, model='{}' selector='{}:{}'",