split wasm filter

2026-05-30 14:25:15 +02:00 · 2024-10-15 17:29:00 -07:00 · 2024-10-15 17:29:00 -07:00 · 0e04b09f56
commit 0e04b09f56
parent b1746b38b4
44 changed files with 6009 additions and 272 deletions
--- a/crates/prompt_gateway/src/filter_context.rs
+++ b/crates/prompt_gateway/src/filter_context.rs
@ -0,0 +1,324 @@
+use crate::llm_providers::LlmProviders;
+use crate::ratelimit;
+use crate::stream_context::StreamContext;
+use log::debug;
+use proxy_wasm::traits::*;
+use proxy_wasm::types::*;
+use public_types::common_types::EmbeddingType;
+use public_types::configuration::{
+    Configuration, GatewayMode, Overrides, PromptGuards, PromptTarget,
+};
+use public_types::consts::ARCH_INTERNAL_CLUSTER_NAME;
+use public_types::consts::ARCH_UPSTREAM_HOST_HEADER;
+use public_types::consts::DEFAULT_EMBEDDING_MODEL;
+use public_types::consts::MODEL_SERVER_NAME;
+use public_types::embeddings::{
+    CreateEmbeddingRequest, CreateEmbeddingRequestInput, CreateEmbeddingResponse,
+};
+use public_types::http::CallArgs;
+use public_types::http::Client;
+use public_types::stats::Counter;
+use public_types::stats::Gauge;
+use public_types::stats::IncrementingMetric;
+use std::cell::RefCell;
+use std::collections::hash_map::Entry;
+use std::collections::HashMap;
+use std::rc::Rc;
+use std::time::Duration;
+
+/// Metric handles used by the filter; stream contexts receive a shared reference to them.
+#[derive(Copy, Clone, Debug)]
+pub struct WasmMetrics {
+    /// Gauge created with the name `active_http_calls`.
+    pub active_http_calls: Gauge,
+    /// Counter created with the name `ratelimited_rq`.
+    pub ratelimited_rq: Counter,
+}
+
+impl WasmMetrics {
+    /// Creates both metric handles under their fixed names.
+    fn new() -> Self {
+        let active_http_calls = Gauge::new("active_http_calls".to_owned());
+        let ratelimited_rq = Counter::new("ratelimited_rq".to_owned());
+        Self {
+            active_http_calls,
+            ratelimited_rq,
+        }
+    }
+}
+
+/// Embedding vectors of a single prompt target, keyed by the kind of embedding.
+pub type EmbeddingTypeMap = HashMap<EmbeddingType, Vec<f64>>;
+/// Embeddings of all prompt targets, keyed by prompt target name.
+pub type EmbeddingsStore = HashMap<String, EmbeddingTypeMap>;
+
+/// Data remembered for an outstanding embeddings callout so that its response can be
+/// attributed to the right prompt target.
+#[derive(Debug)]
+pub struct FilterCallContext {
+    /// Name of the prompt target the callout was issued for.
+    pub prompt_target_name: String,
+    /// Which embedding of that prompt target was requested.
+    pub embedding_type: EmbeddingType,
+}
+
+/// Root context of the filter: holds the parsed configuration and creates a `StreamContext`
+/// for every HTTP stream.
+#[derive(Debug)]
+pub struct FilterContext {
+    // Metric handles; every stream context receives a clone of this `Rc`.
+    metrics: Rc<WasmMetrics>,
+    // callouts stores token_id to request mapping that we use during #on_http_call_response to match the response to the request.
+    callouts: RefCell<HashMap<u32, FilterCallContext>>,
+    // `overrides` section of the configuration, if any.
+    overrides: Rc<Option<Overrides>>,
+    // `system_prompt` from the configuration, if any.
+    system_prompt: Rc<Option<String>>,
+    // Configured prompt targets, keyed by prompt target name.
+    prompt_targets: Rc<HashMap<String, PromptTarget>>,
+    // Gateway mode; defaults to `GatewayMode::Prompt` until the configuration is read.
+    mode: GatewayMode,
+    // Prompt guards from the configuration, or the default value when none are configured.
+    prompt_guards: Rc<PromptGuards>,
+    // `None` until `on_configure` has validated the configured LLM providers.
+    llm_providers: Option<Rc<LlmProviders>>,
+    // Store handed to stream contexts in prompt mode; replaced once all embeddings arrived.
+    embeddings_store: Option<Rc<EmbeddingsStore>>,
+    // Collects embedding responses until there is an entry for every prompt target.
+    temp_embeddings_store: EmbeddingsStore,
+}
+
+impl FilterContext {
+    /// Creates a root context with empty defaults; the real values are loaded in
+    /// `on_configure`.
+    pub fn new() -> Self {
+        Self {
+            metrics: Rc::new(WasmMetrics::new()),
+            callouts: RefCell::new(HashMap::new()),
+            overrides: Rc::default(),
+            system_prompt: Rc::default(),
+            prompt_targets: Rc::default(),
+            mode: GatewayMode::Prompt,
+            prompt_guards: Rc::default(),
+            llm_providers: None,
+            // The store starts out present but empty.
+            embeddings_store: Some(Rc::new(EmbeddingsStore::new())),
+            temp_embeddings_store: EmbeddingsStore::new(),
+        }
+    }
+
+    /// Requests a description embedding for every configured prompt target.
+    fn process_prompt_targets(&self) {
+        for target in self.prompt_targets.values() {
+            self.schedule_embeddings_call(
+                &target.name,
+                &target.description,
+                EmbeddingType::Description,
+            );
+        }
+    }
+
+    /// Posts `input` to the `/embeddings` endpoint and records which prompt target and
+    /// embedding type the call belongs to, so the response can be matched later.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the request cannot be serialized or if dispatching the HTTP call fails.
+    fn schedule_embeddings_call(
+        &self,
+        prompt_target_name: &str,
+        input: &str,
+        embedding_type: EmbeddingType,
+    ) {
+        let request = CreateEmbeddingRequest {
+            input: Box::new(CreateEmbeddingRequestInput::String(input.to_owned())),
+            model: String::from(DEFAULT_EMBEDDING_MODEL),
+            encoding_format: None,
+            dimensions: None,
+            user: None,
+        };
+        let request_body = serde_json::to_string(&request).unwrap();
+
+        // The upstream timeout header and the call timeout below are both 60 seconds.
+        let headers = vec![
+            (ARCH_UPSTREAM_HOST_HEADER, MODEL_SERVER_NAME),
+            (":method", "POST"),
+            (":path", "/embeddings"),
+            (":authority", MODEL_SERVER_NAME),
+            ("content-type", "application/json"),
+            ("x-envoy-upstream-rq-timeout-ms", "60000"),
+        ];
+        let call_args = CallArgs::new(
+            ARCH_INTERNAL_CLUSTER_NAME,
+            "/embeddings",
+            headers,
+            Some(request_body.as_bytes()),
+            vec![],
+            Duration::from_secs(60),
+        );
+
+        let call_context = FilterCallContext {
+            prompt_target_name: prompt_target_name.to_owned(),
+            embedding_type,
+        };
+
+        self.http_call(call_args, call_context)
+            .unwrap_or_else(|error| panic!("{error}"));
+    }
+
+    /// Stores the embedding returned for `prompt_target_name` and, once there is an entry for
+    /// every prompt target, publishes the collected embeddings as the active embeddings store.
+    ///
+    /// An empty response body is ignored.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the prompt target is unknown, the response has no body, the body cannot be
+    /// deserialized or contains no embedding, or the same embedding type arrives twice for one
+    /// prompt target.
+    fn embedding_response_handler(
+        &mut self,
+        body_size: usize,
+        embedding_type: EmbeddingType,
+        prompt_target_name: String,
+    ) {
+        let prompt_target = self
+            .prompt_targets
+            .get(&prompt_target_name)
+            .unwrap_or_else(|| {
+                panic!(
+                    "Received embeddings response for unknown prompt target name={}",
+                    prompt_target_name
+                )
+            });
+
+        let body = self
+            .get_http_call_response_body(0, body_size)
+            .expect("No body in response");
+        if body.is_empty() {
+            return;
+        }
+
+        let mut embedding_response: CreateEmbeddingResponse = serde_json::from_slice(&body)
+            .unwrap_or_else(|e| {
+                // Lossy conversion: a non-UTF-8 body must not replace the deserialization
+                // error with a second panic while this message is being built.
+                panic!(
+                    "Error deserializing embedding response. body: {:?}: {:?}",
+                    String::from_utf8_lossy(&body),
+                    e
+                )
+            });
+
+        // Fail with a descriptive message instead of an index-out-of-bounds panic.
+        if embedding_response.data.is_empty() {
+            panic!(
+                "Embedding response for prompt target name={} contains no embeddings",
+                prompt_target.name
+            );
+        }
+        let embeddings = embedding_response.data.remove(0).embedding;
+        debug!(
+            "Adding embeddings for prompt target name: {:?}, description: {:?}, embedding type: {:?}",
+            prompt_target.name,
+            prompt_target.description,
+            embedding_type
+        );
+
+        match self.temp_embeddings_store.entry(prompt_target_name) {
+            Entry::Occupied(mut target_embeddings) => {
+                match target_embeddings.get_mut().entry(embedding_type) {
+                    Entry::Vacant(slot) => {
+                        slot.insert(embeddings);
+                    }
+                    Entry::Occupied(_) => panic!(
+                        "Duplicate {:?} for prompt target with name=\"{}\"",
+                        &embedding_type, prompt_target.name
+                    ),
+                }
+            }
+            Entry::Vacant(slot) => {
+                slot.insert(HashMap::from([(embedding_type, embeddings)]));
+            }
+        }
+
+        // Publish only when an entry exists for every configured prompt target.
+        if self.prompt_targets.len() == self.temp_embeddings_store.len() {
+            self.embeddings_store =
+                Some(Rc::new(std::mem::take(&mut self.temp_embeddings_store)))
+        }
+    }
+}
+
+/// Exposes this context's callout map and active-call gauge through the `Client` trait.
+impl Client for FilterContext {
+    // Data stored for each outstanding callout.
+    type CallContext = FilterCallContext;
+
+    /// Returns the map of outstanding callouts, keyed by token id.
+    fn callouts(&self) -> &RefCell<HashMap<u32, Self::CallContext>> {
+        &self.callouts
+    }
+
+    /// Returns the gauge that tracks the number of in-flight HTTP calls.
+    fn active_http_calls(&self) -> &Gauge {
+        &self.metrics.active_http_calls
+    }
+}
+
+impl Context for FilterContext {
+    /// Handles the response to an embeddings callout issued by this root context.
+    ///
+    /// The callout recorded under `token_id` is removed, the active-call gauge is decremented
+    /// and the response body is passed on to `embedding_response_handler`.
+    ///
+    /// # Panics
+    ///
+    /// Panics when no callout was recorded for `token_id`.
+    fn on_http_call_response(
+        &mut self,
+        token_id: u32,
+        _num_headers: usize,
+        body_size: usize,
+        _num_trailers: usize,
+    ) {
+        debug!(
+            "filter_context: on_http_call_response called with token_id: {:?}",
+            token_id
+        );
+
+        let FilterCallContext {
+            prompt_target_name,
+            embedding_type,
+        } = self
+            .callouts
+            .borrow_mut()
+            .remove(&token_id)
+            .expect("invalid token_id");
+
+        self.metrics.active_http_calls.increment(-1);
+
+        self.embedding_response_handler(body_size, embedding_type, prompt_target_name)
+    }
+}
+
+// RootContext allows the Rust code to reach into the Envoy Config
+impl RootContext for FilterContext {
+    /// Loads the plugin configuration and populates this context from it.
+    ///
+    /// Always returns `true`; every configuration problem is reported by panicking.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the plugin configuration is missing, cannot be parsed as a `Configuration`,
+    /// or describes an invalid set of LLM providers.
+    fn on_configure(&mut self, _: usize) -> bool {
+        let config_bytes = self
+            .get_plugin_configuration()
+            .expect("Arch config cannot be empty");
+
+        let config: Configuration = serde_yaml::from_slice(&config_bytes)
+            .unwrap_or_else(|err| panic!("Invalid arch config \"{:?}\"", err));
+
+        self.overrides = Rc::new(config.overrides);
+
+        // Each prompt target is moved into the map; only its name is cloned for the key.
+        let prompt_targets: HashMap<String, PromptTarget> = config
+            .prompt_targets
+            .into_iter()
+            .map(|pt| (pt.name.clone(), pt))
+            .collect();
+        self.system_prompt = Rc::new(config.system_prompt);
+        self.prompt_targets = Rc::new(prompt_targets);
+        self.mode = config.mode.unwrap_or_default();
+
+        // Initializes the rate limit singleton; a missing section means "no limits".
+        ratelimit::ratelimits(Some(config.ratelimits.unwrap_or_default()));
+
+        if let Some(prompt_guards) = config.prompt_guards {
+            self.prompt_guards = Rc::new(prompt_guards)
+        }
+
+        let llm_providers =
+            LlmProviders::try_from(config.llm_providers).unwrap_or_else(|err| panic!("{err}"));
+        self.llm_providers = Some(Rc::new(llm_providers));
+
+        true
+    }
+
+    /// Builds the `StreamContext` for a new HTTP stream, sharing this context's configuration.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the LLM providers have not been configured yet or, in prompt mode, if no
+    /// embeddings store is present.
+    fn create_http_context(&self, context_id: u32) -> Option<Box<dyn HttpContext>> {
+        debug!(
+            "||| create_http_context called with context_id: {:?} |||",
+            context_id
+        );
+
+        // No StreamContext can be created until the Embedding Store is fully initialized.
+        let embedding_store = match self.mode {
+            GatewayMode::Prompt => Some(Rc::clone(self.embeddings_store.as_ref().unwrap())),
+            GatewayMode::Llm => None,
+        };
+        let llm_providers = self
+            .llm_providers
+            .as_ref()
+            .expect("LLM Providers must exist when Streams are being created");
+
+        let stream_context = StreamContext::new(
+            context_id,
+            Rc::clone(&self.metrics),
+            Rc::clone(&self.system_prompt),
+            Rc::clone(&self.prompt_targets),
+            Rc::clone(&self.prompt_guards),
+            Rc::clone(&self.overrides),
+            Rc::clone(llm_providers),
+            embedding_store,
+            self.mode.clone(),
+        );
+        Some(Box::new(stream_context))
+    }
+
+    /// Reports `ContextType::HttpContext` as the kind of context this root context creates.
+    fn get_type(&self) -> Option<ContextType> {
+        Some(ContextType::HttpContext)
+    }
+
+    /// Sets a one-second tick period so that `on_tick` runs the startup work, then reports
+    /// success.
+    fn on_vm_start(&mut self, _: usize) -> bool {
+        self.set_tick_period(Duration::from_secs(1));
+        true
+    }
+
+    /// Startup tick: in prompt mode, schedules the embeddings calls for all prompt targets.
+    fn on_tick(&mut self) {
+        debug!("starting up arch filter in mode: {:?}", self.mode);
+        if matches!(self.mode, GatewayMode::Prompt) {
+            self.process_prompt_targets();
+        }
+
+        // NOTE(review): a zero period presumably disables further ticks - confirm against
+        // the proxy-wasm documentation.
+        self.set_tick_period(Duration::ZERO);
+    }
+}
--- a/crates/prompt_gateway/src/lib.rs
+++ b/crates/prompt_gateway/src/lib.rs
@ -0,0 +1,17 @@
+use filter_context::FilterContext;
+use proxy_wasm::traits::*;
+use proxy_wasm::types::*;
+
+mod filter_context;
+mod llm_providers;
+mod ratelimit;
+mod routing;
+mod stream_context;
+mod tokenizer;
+
+// Module entry point: sets the log level to trace and installs `FilterContext` as the root
+// context.
+proxy_wasm::main! {{
+    proxy_wasm::set_log_level(LogLevel::Trace);
+    // The closure builds a new `FilterContext` on every invocation.
+    proxy_wasm::set_root_context(|_| -> Box<dyn RootContext> {
+        Box::new(FilterContext::new())
+    });
+}}
--- a/crates/prompt_gateway/src/llm_providers.rs
+++ b/crates/prompt_gateway/src/llm_providers.rs
@ -0,0 +1,69 @@
+use public_types::configuration::LlmProvider;
+use std::collections::HashMap;
+use std::rc::Rc;
+
+/// The validated set of configured LLM providers.
+#[derive(Debug)]
+pub struct LlmProviders {
+    // All providers, keyed by their unique name.
+    providers: HashMap<String, Rc<LlmProvider>>,
+    // The single provider flagged as default, if any.
+    default: Option<Rc<LlmProvider>>,
+}
+
+impl LlmProviders {
+    /// Iterates over all providers as `(name, provider)` pairs.
+    pub fn iter(&self) -> std::collections::hash_map::Iter<'_, String, Rc<LlmProvider>> {
+        self.providers.iter()
+    }
+
+    /// Returns the provider flagged as default, if there is one.
+    pub fn default(&self) -> Option<Rc<LlmProvider>> {
+        // Cloning the `Option<Rc<_>>` only bumps the reference count.
+        self.default.clone()
+    }
+
+    /// Looks up a provider by name.
+    pub fn get(&self, name: &str) -> Option<Rc<LlmProvider>> {
+        self.providers.get(name).cloned()
+    }
+}
+
+/// Reasons why a list of configured LLM providers is rejected.
+#[derive(thiserror::Error, Debug)]
+pub enum LlmProvidersNewError {
+    /// The configured list of providers is empty.
+    #[error("There must be at least one LLM Provider")]
+    EmptySource,
+    /// More than one provider is flagged as default.
+    #[error("There must be at most one default LLM Provider")]
+    MoreThanOneDefault,
+    /// Two providers share the contained name.
+    #[error("\'{0}\' is not a unique name")]
+    DuplicateName(String),
+}
+
+impl TryFrom<Vec<LlmProvider>> for LlmProviders {
+    type Error = LlmProvidersNewError;
+
+    /// Validates the configured providers and indexes them by name.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the list is empty, when more than one provider is flagged as default, or
+    /// when two providers share a name.
+    fn try_from(llm_providers_config: Vec<LlmProvider>) -> Result<Self, Self::Error> {
+        if llm_providers_config.is_empty() {
+            return Err(LlmProvidersNewError::EmptySource);
+        }
+
+        let mut providers = HashMap::new();
+        let mut default = None;
+
+        for provider_config in llm_providers_config {
+            let provider = Rc::new(provider_config);
+
+            if provider.default.unwrap_or_default() {
+                if default.is_some() {
+                    return Err(LlmProvidersNewError::MoreThanOneDefault);
+                }
+                default = Some(Rc::clone(&provider));
+            }
+
+            // Insert and check that there is no other provider with the same name.
+            let name = provider.name.clone();
+            let previous = providers.insert(name.clone(), provider);
+            if previous.is_some() {
+                return Err(LlmProvidersNewError::DuplicateName(name));
+            }
+        }
+
+        Ok(LlmProviders { providers, default })
+    }
+}
--- a/crates/prompt_gateway/src/ratelimit.rs
+++ b/crates/prompt_gateway/src/ratelimit.rs
@ -0,0 +1,450 @@
+use governor::{DefaultKeyedRateLimiter, InsufficientCapacity, Quota};
+use log::debug;
+use public_types::configuration;
+use public_types::configuration::{Limit, Ratelimit, TimeUnit};
+use std::fmt::Display;
+use std::num::{NonZero, NonZeroU32};
+use std::sync::RwLock;
+use std::{collections::HashMap, sync::OnceLock};
+
+/// The rate limit map behind a read/write lock, as stored in the process-wide singleton.
+pub type RatelimitData = RwLock<RatelimitMap>;
+
+/// Returns the process-wide rate limit data, initializing it on the first call.
+///
+/// Only the first call uses `ratelimits_config`; later calls return the existing data and
+/// ignore the argument.
+///
+/// # Panics
+///
+/// Panics if the initializing call passes `None`.
+pub fn ratelimits(ratelimits_config: Option<Vec<Ratelimit>>) -> &'static RatelimitData {
+    static RATELIMIT_DATA: OnceLock<RatelimitData> = OnceLock::new();
+    RATELIMIT_DATA.get_or_init(|| {
+        let initial_config =
+            ratelimits_config.expect("The initialization call has to have passed a config");
+        RwLock::new(RatelimitMap::new(initial_config))
+    })
+}
+
+/// Rate limits per provider, laid out as `Provider -> { Header -> Limit }`.
+///
+/// Depending on the `Header` a limit was configured with:
+///   a) if its value is `None`, there are N limits, one per observed header value;
+///   b) if its value is `Some(..)`, there is a single limit keyed by the empty string.
+///
+/// It would have been nicer to use a non-keyed limit for b). However, the type system made
+/// that option a nightmare.
+pub struct RatelimitMap {
+    datastore: HashMap<String, HashMap<configuration::Header, DefaultKeyedRateLimiter<String>>>,
+}
+
+/// A header selector used when checking a limit.
+///
+/// Unlike `configuration::Header`, this version demands that the caller passes a header value
+/// to match on.
+#[derive(Debug, Clone)]
+pub struct Header {
+    /// Header name.
+    pub key: String,
+    /// Header value.
+    pub value: String,
+}
+
+impl Display for Header {
+    /// Renders the header using its `Debug` representation.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_fmt(format_args!("{self:?}"))
+    }
+}
+
+impl From<Header> for configuration::Header {
+    /// Converts into the configuration form, wrapping the value in `Some`.
+    fn from(header: Header) -> Self {
+        let Header { key, value } = header;
+        Self {
+            key,
+            value: Some(value),
+        }
+    }
+}
+
+/// Errors returned by `RatelimitMap::check_limit`.
+#[derive(Debug, thiserror::Error)]
+pub enum Error {
+    /// The request does not fit into the limit that applies to `provider` and `selector`.
+    #[error("exceeded limit provider={provider}, selector={selector}, tokens_used={tokens_used}")]
+    ExceededLimit {
+        provider: String,
+        selector: Header,
+        tokens_used: NonZeroU32,
+    },
+}
+
+impl RatelimitMap {
+    // n.b. `new` is private so that the rate limits can only be reached through the static
+    // reference inside a RwLock returned by ratelimit::ratelimits().
+    //
+    // Builds one keyed rate limiter per configured (provider, selector) pair.
+    // Panics when a selector is repeated for the same provider.
+    fn new(ratelimits_config: Vec<Ratelimit>) -> Self {
+        let mut ratelimit_map = Self {
+            datastore: HashMap::new(),
+        };
+
+        for ratelimit_config in ratelimits_config {
+            let limiter = DefaultKeyedRateLimiter::keyed(get_quota(ratelimit_config.limit));
+
+            // Fetch the provider's limits, creating an empty table for an unseen provider.
+            let provider_limits = ratelimit_map
+                .datastore
+                .entry(ratelimit_config.model)
+                .or_default();
+
+            let previous = provider_limits.insert(ratelimit_config.selector, limiter);
+            if previous.is_some() {
+                panic!("repeated selector. Selectors per provider must be unique")
+            }
+        }
+
+        ratelimit_map
+    }
+
+    /// Checks whether `tokens_used` tokens fit into the limit that applies to `provider` and
+    /// `selector`.
+    ///
+    /// A limit configured for the exact header key/value pair is preferred; otherwise a limit
+    /// configured for the header key alone is used, tracked separately per header value. When
+    /// no limit applies the call succeeds.
+    ///
+    /// # Errors
+    ///
+    /// Returns `Error::ExceededLimit` when the limiter rejects the request.
+    #[allow(unused)]
+    pub fn check_limit(
+        &self,
+        provider: String,
+        selector: Header,
+        tokens_used: NonZeroU32,
+    ) -> Result<(), Error> {
+        debug!(
+            "Checking limit for provider={}, with selector={:?}, consuming tokens={:?}",
+            provider, selector, tokens_used
+        );
+
+        // No limit configured for this provider, hence ok.
+        let Some(provider_limits) = self.datastore.get(&provider) else {
+            return Ok(());
+        };
+
+        let mut config_selector = configuration::Header::from(selector.clone());
+
+        let (limit, limit_key) = if let Some(specific) = provider_limits.get(&config_selector) {
+            // A specific limit, i.e. one configured with both key and value. The key for the
+            // internal limit does not matter, so the empty string is always used.
+            (specific, String::new())
+        } else {
+            // Unwrap is ok here because the conversion above always sets the value.
+            let header_value = config_selector.value.take().unwrap();
+            // Search for a less specific limit, i.e. one configured without a value, where
+            // every header value has its own key in the internal limit.
+            let Some(per_value) = provider_limits.get(&config_selector) else {
+                // No limit for that header key, value pair exists within the provider limits.
+                return Ok(());
+            };
+            (per_value, header_value)
+        };
+
+        if matches!(limit.check_key_n(&limit_key, tokens_used), Ok(Ok(()))) {
+            Ok(())
+        } else {
+            Err(Error::ExceededLimit {
+                provider,
+                selector,
+                tokens_used,
+            })
+        }
+    }
+}
+
+/// Translates a configured `Limit` into a `Quota` of `tokens` per configured time unit.
+///
+/// # Panics
+///
+/// Panics when the configured number of tokens is zero.
+fn get_quota(limit: Limit) -> Quota {
+    let tokens = NonZero::new(limit.tokens).expect("Limit's tokens must be positive");
+    let quota_per_unit: fn(NonZeroU32) -> Quota = match limit.unit {
+        TimeUnit::Second => Quota::per_second,
+        TimeUnit::Minute => Quota::per_minute,
+        TimeUnit::Hour => Quota::per_hour,
+    };
+    quota_per_unit(tokens)
+}
+
+// The following tests live inside the ratelimit module so that they can call the private
+// RatelimitMap::new() and use a different configuration in each test.
+/// A provider without any configured limit is never rate limited.
+#[test]
+fn non_existent_provider_is_ok() {
+    let ratelimits = RatelimitMap::new(vec![Ratelimit {
+        model: "provider".to_string(),
+        selector: configuration::Header {
+            key: "only-key".to_string(),
+            value: None,
+        },
+        limit: Limit {
+            tokens: 100,
+            unit: TimeUnit::Minute,
+        },
+    }]);
+
+    let outcome = ratelimits.check_limit(
+        "non-existent-provider".to_string(),
+        Header {
+            key: "key".to_string(),
+            value: "value".to_string(),
+        },
+        NonZero::new(5000).unwrap(),
+    );
+
+    assert!(outcome.is_ok());
+}
+
+/// A header key without a configured limit is never rate limited.
+#[test]
+fn non_existent_key_is_ok() {
+    let ratelimits = RatelimitMap::new(vec![Ratelimit {
+        model: "provider".to_string(),
+        selector: configuration::Header {
+            key: "only-key".to_string(),
+            value: None,
+        },
+        limit: Limit {
+            tokens: 100,
+            unit: TimeUnit::Minute,
+        },
+    }]);
+
+    let outcome = ratelimits.check_limit(
+        "provider".to_string(),
+        Header {
+            key: "key".to_string(),
+            value: "value".to_string(),
+        },
+        NonZero::new(5000).unwrap(),
+    );
+
+    assert!(outcome.is_ok());
+}
+
+/// A limit configured for one header value does not apply to a different value of that header.
+#[test]
+fn specific_limit_does_not_catch_non_specific_value() {
+    let ratelimits = RatelimitMap::new(vec![Ratelimit {
+        model: "provider".to_string(),
+        selector: configuration::Header {
+            key: "key".to_string(),
+            value: Some("value".to_string()),
+        },
+        limit: Limit {
+            tokens: 200,
+            unit: TimeUnit::Second,
+        },
+    }]);
+
+    let outcome = ratelimits.check_limit(
+        "provider".to_string(),
+        Header {
+            key: "key".to_string(),
+            value: "not-the-correct-value".to_string(),
+        },
+        NonZero::new(5000).unwrap(),
+    );
+
+    assert!(outcome.is_ok());
+}
+
+/// A request larger than the limit configured for its exact header key/value is rejected.
+#[test]
+fn specific_limit_is_hit() {
+    let ratelimits = RatelimitMap::new(vec![Ratelimit {
+        model: "provider".to_string(),
+        selector: configuration::Header {
+            key: "key".to_string(),
+            value: Some("value".to_string()),
+        },
+        limit: Limit {
+            tokens: 200,
+            unit: TimeUnit::Hour,
+        },
+    }]);
+
+    let outcome = ratelimits.check_limit(
+        "provider".to_string(),
+        Header {
+            key: "key".to_string(),
+            value: "value".to_string(),
+        },
+        NonZero::new(5000).unwrap(),
+    );
+
+    assert!(outcome.is_err());
+}
+
+/// A limit configured without a header value is tracked separately for every header value.
+#[test]
+fn non_specific_key_has_different_limits_for_different_values() {
+    let ratelimits = RatelimitMap::new(vec![Ratelimit {
+        model: "provider".to_string(),
+        selector: configuration::Header {
+            key: "only-key".to_string(),
+            value: None,
+        },
+        limit: Limit {
+            tokens: 100,
+            unit: TimeUnit::Hour,
+        },
+    }]);
+
+    // Checks `tokens` against the limit tracked for the given value of "only-key".
+    let consume = |value: &str, tokens: u32| {
+        ratelimits.check_limit(
+            "provider".to_string(),
+            Header {
+                key: "only-key".to_string(),
+                value: value.to_string(),
+            },
+            NonZero::new(tokens).unwrap(),
+        )
+    };
+
+    // value1 takes 50.
+    assert!(consume("value1", 50).is_ok());
+
+    // value2 can take 60 because it has its own limit of 100.
+    assert!(consume("value2", 60).is_ok());
+
+    // However, value1 cannot take another 70: 50 + 70 = 120 exceeds its 100 per hour.
+    assert!(consume("value1", 70).is_err());
+}
+
+/// Two providers may use the same selector, each with its own independent limit.
+#[test]
+fn different_provider_can_have_different_limits_with_the_same_keys() {
+    // Both providers select on the same header key/value but allow different amounts.
+    let limit_for = |provider: &str, tokens| Ratelimit {
+        model: provider.to_string(),
+        selector: configuration::Header {
+            key: "key".to_string(),
+            value: Some("value".to_string()),
+        },
+        limit: Limit {
+            tokens,
+            unit: TimeUnit::Hour,
+        },
+    };
+    let ratelimits = RatelimitMap::new(vec![
+        limit_for("first_provider", 100),
+        limit_for("second_provider", 200),
+    ]);
+
+    let consume = |provider: &str, tokens: u32| {
+        ratelimits.check_limit(
+            provider.to_string(),
+            Header {
+                key: "key".to_string(),
+                value: "value".to_string(),
+            },
+            NonZero::new(tokens).unwrap(),
+        )
+    };
+
+    // Each provider can spend exactly its own limit.
+    assert!(consume("first_provider", 100).is_ok());
+    assert!(consume("second_provider", 200).is_ok());
+
+    // After that, even a single additional token is rejected for either provider.
+    assert!(consume("first_provider", 1).is_err());
+    assert!(consume("second_provider", 1).is_err());
+}
+
+// These tests use the publicly exposed static singleton. Whichever test runs first decides the
+// singleton's configuration, so no assertion here may depend on the configured limits.
+#[cfg(test)]
+mod test {
+    use super::ratelimits;
+    use configuration::{Limit, Ratelimit, TimeUnit};
+    use public_types::configuration;
+    use std::thread;
+
+    /// An empty rate limit configuration is accepted.
+    #[test]
+    fn make_ratelimits_optional() {
+        let ratelimits_config = Vec::new();
+
+        // Initialize in the main thread.
+        ratelimits(Some(ratelimits_config));
+    }
+
+    /// Every thread observes the very same singleton instance.
+    #[test]
+    fn different_threads_have_same_ratelimit_data_structure() {
+        let ratelimits_config = Some(vec![Ratelimit {
+            model: String::from("provider"),
+            selector: configuration::Header {
+                key: String::from("key"),
+                value: Some(String::from("value")),
+            },
+            limit: Limit {
+                tokens: 200,
+                unit: TimeUnit::Hour,
+            },
+        }]);
+
+        // Initialize (or fetch, if another test initialized first) in this thread.
+        let from_this_thread = ratelimits(ratelimits_config);
+
+        // Fetch the singleton in a different thread. The thread is joined so that a panic
+        // inside it fails the test instead of going unnoticed.
+        let from_other_thread = thread::spawn(|| ratelimits(None))
+            .join()
+            .expect("the spawned thread must not panic");
+
+        assert!(std::ptr::eq(from_this_thread, from_other_thread));
+    }
+}
--- a/crates/prompt_gateway/src/routing.rs
+++ b/crates/prompt_gateway/src/routing.rs
@ -0,0 +1,50 @@
+use std::rc::Rc;
+
+use crate::llm_providers::LlmProviders;
+use log::debug;
+use public_types::configuration::LlmProvider;
+use rand::{seq::IteratorRandom, thread_rng};
+
+/// A caller-supplied preference for which LLM provider should serve a request.
+#[derive(Debug)]
+pub enum ProviderHint {
+    /// Use the provider configured as default.
+    Default,
+    /// Use the provider with the given name.
+    Name(String),
+}
+
+impl From<String> for ProviderHint {
+    /// Maps the literal `"default"` to `ProviderHint::Default`; any other string is a name.
+    fn from(value: String) -> Self {
+        if value == "default" {
+            Self::Default
+        } else {
+            Self::Name(value)
+        }
+    }
+}
+
+/// Picks the LLM provider for a request.
+///
+/// Resolution order: the provider selected by `provider_hint`, then the configured default
+/// provider, then a randomly chosen provider.
+///
+/// # Panics
+///
+/// Panics if `llm_providers` contains no provider at all.
+pub fn get_llm_provider(
+    llm_providers: &LlmProviders,
+    provider_hint: Option<ProviderHint>,
+) -> Rc<LlmProvider> {
+    let hinted_provider = provider_hint.and_then(|hint| match hint {
+        ProviderHint::Default => llm_providers.default(),
+        // FIXME: should a non-existent name in the hint be more explicit? i.e, return a BAD_REQUEST?
+        ProviderHint::Name(name) => llm_providers.get(&name),
+    });
+
+    if let Some(provider) = hinted_provider {
+        return provider;
+    }
+
+    // Look the default up once instead of testing it and then fetching it again.
+    if let Some(default_provider) = llm_providers.default() {
+        debug!("no llm provider found for hint, using default llm provider");
+        return default_provider;
+    }
+
+    debug!("no default llm found, using random llm provider");
+    let mut rng = thread_rng();
+    let (_, provider) = llm_providers
+        .iter()
+        .choose(&mut rng)
+        .expect("There should always be at least one llm provider");
+    Rc::clone(provider)
+}
--- a/crates/prompt_gateway/src/stream_context.rs
+++ b/crates/prompt_gateway/src/stream_context.rs
--- a/crates/prompt_gateway/src/tokenizer.rs
+++ b/crates/prompt_gateway/src/tokenizer.rs
@ -0,0 +1,39 @@
+use log::debug;
+
+/// Errors that can occur while counting tokens.
+#[derive(Debug, PartialEq, Eq)]
+#[allow(dead_code)]
+pub enum Error {
+    /// No tokenizer is available for the requested model name.
+    UnknownModel,
+    /// Not constructed anywhere in this module.
+    FailedToTokenize,
+}
+
+/// Counts the tokens `text` encodes to under the tokenizer of `model_name`.
+///
+/// # Errors
+///
+/// Returns `Error::UnknownModel` when no tokenizer can be obtained for `model_name`.
+#[allow(dead_code)]
+pub fn token_count(model_name: &str, text: &str) -> Result<usize, Error> {
+    debug!("getting token count model={}", model_name);
+    // Consideration: is it more expensive to instantiate the BPE object every time, or to contend the singleton?
+    match tiktoken_rs::get_bpe_from_model(model_name) {
+        Ok(bpe) => Ok(bpe.encode_ordinary(text).len()),
+        Err(_) => Err(Error::UnknownModel),
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    /// The sample sentence is expected to count as eight tokens for a known model.
+    #[test]
+    fn encode_ordinary() {
+        let tokens = token_count("gpt-3.5-turbo", "How many tokens does this sentence have?")
+            .expect("correct tokenization");
+        assert_eq!(8, tokens);
+    }
+
+    /// An unrecognized model name is reported as `Error::UnknownModel`.
+    #[test]
+    fn unrecognized_model() {
+        let error = token_count("unknown", "").expect_err("unknown model");
+        assert_eq!(Error::UnknownModel, error);
+    }
+}