make tiktoken token counting optional via enable_token_counting override

By default, use a cheap len/4 estimate for input token counting (metrics
and rate limiting). When enable_token_counting is set to true in the
overrides, use the tiktoken BPE for exact counts. This eliminates ~80ms
of per-request latency from tiktoken in the WASM filter while keeping
metrics and rate limiting functional.

Made-with: Cursor
This commit is contained in:
Adil Hafeez 2026-03-22 21:45:02 -07:00
parent 406fa92802
commit e5f3039924
3 changed files with 19 additions and 8 deletions

View file

@@ -285,6 +285,9 @@ properties:
agent_orchestration_model:
type: string
description: "Model name for the agent orchestrator (e.g., 'Plano-Orchestrator'). Must match a model in model_providers."
enable_token_counting:
type: boolean
description: "Enable tiktoken-based input token counting for metrics and rate limiting. Default is false."
system_prompt:
type: string
prompt_targets:

View file

@@ -131,6 +131,7 @@ pub struct Overrides {
pub use_agent_orchestrator: Option<bool>,
pub llm_routing_model: Option<String>,
pub agent_orchestration_model: Option<String>,
pub enable_token_counting: Option<bool>,
}
#[derive(Debug, Clone, Serialize, Deserialize, Default)]

View file

@@ -48,7 +48,7 @@ pub struct StreamContext {
ttft_time: Option<u128>,
traceparent: Option<String>,
request_body_sent_time: Option<u128>,
_overrides: Rc<Option<Overrides>>,
overrides: Rc<Option<Overrides>>,
user_message: Option<String>,
upstream_status_code: Option<StatusCode>,
binary_frame_decoder: Option<BedrockBinaryFrameDecoder<bytes::BytesMut>>,
@@ -66,7 +66,7 @@ impl StreamContext {
) -> Self {
StreamContext {
metrics,
_overrides: overrides,
overrides,
ratelimit_selector: None,
streaming_response: false,
response_tokens: 0,
@@ -269,22 +269,29 @@ impl StreamContext {
model: &str,
json_string: &str,
) -> Result<(), ratelimit::Error> {
// Tokenize and record token count.
let token_count = tokenizer::token_count(model, json_string).unwrap_or(0);
let use_tiktoken = (*self.overrides)
.as_ref()
.and_then(|o| o.enable_token_counting)
.unwrap_or(false);
let token_count = if use_tiktoken {
tokenizer::token_count(model, json_string).unwrap_or(0)
} else {
json_string.len() / 4
};
debug!(
"request_id={}: token count, model='{}' input_tokens={}",
"request_id={}: token count, model='{}' input_tokens={} method={}",
self.request_identifier(),
model,
token_count
token_count,
if use_tiktoken { "tiktoken" } else { "estimate" }
);
// Record the token count to metrics.
self.metrics
.input_sequence_length
.record(token_count as u64);
// Check if rate limiting needs to be applied.
if let Some(selector) = self.ratelimit_selector.take() {
info!(
"request_id={}: ratelimit check, model='{}' selector='{}:{}'",