split wasm filter

2026-05-27 14:17:15 +02:00 · 2024-10-15 17:29:00 -07:00 · 2024-10-15 17:29:00 -07:00 · 0e04b09f56
commit 0e04b09f56
parent b1746b38b4
44 changed files with 6009 additions and 272 deletions
--- a/.dockerignore
+++ b/.dockerignore
@ -0,0 +1 @@
 crates/*/target*
--- a/.github/workflows/checks.yml
+++ b/.github/workflows/checks.yml
@ -12,13 +12,24 @@ jobs:
    steps:
      - name: Setup | Checkout
        uses: actions/checkout@v4
      - name: Setup | Rust
        run: rustup toolchain install stable --profile minimal
      - name: Setup | Install wasm toolchain
        run: rustup target add wasm32-wasi
-      - name: Build wasm module
+
-        run: cd arch && cargo build --release --target=wasm32-wasi
+      - name: Build wasm module for prompt_gateway
-      - name: Run Tests on arch
+        run: cd crates/prompt_gateway && cargo build --release --target=wasm32-wasi
-        run: cd arch && cargo test
+
-      - name: Run Tests on public_types
+      - name: Run Tests on public_types crate
-        run: cd public_types && cargo test
+        run: cd crates/public_types && cargo test
      - name: Run Tests on prompt_gateway crate
        run: cd crates/prompt_gateway && cargo test
      - name: Build wasm module for llm_gateway
        run: cd crates/llm_gateway && cargo build --release --target=wasm32-wasi
      - name: Run Tests on llm_gateway crate
        run: cd crates/llm_gateway && cargo test
--- a/.gitignore
+++ b/.gitignore
@ -1,6 +1,4 @@
 arch/target
 arch/qdrant_data/
 public_types/target
 /venv/
 __pycache__
 grafana-data
@ -31,3 +29,4 @@ model_server/build
 model_server/dist
 arch_logs/
 dist/
 crates/*/target/
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -8,23 +8,27 @@ repos:
      - id: trailing-whitespace
  - repo: local
    hooks:
      - id: cargo-fmt
        name: cargo-fmt
        language: system
        types: [file, rust]
-        entry: bash -c "cd arch && cargo fmt -p intelligent-prompt-gateway -- --check"
+        entry: bash -c "cd crates/llm_gateway && cargo fmt -- --check"
      - id: cargo-clippy
        name: cargo-clippy
        language: system
        types: [file, rust]
-        entry: bash -c "cd arch && cargo clippy -p intelligent-prompt-gateway --all"
+        entry: bash -c "cd crates/llm_gateway && cargo clippy --all"
      - id: cargo-test
        name: cargo-test
        language: system
        types: [file, rust]
        # --lib is to only test the library, since when integration tests are made,
        # they will be in a seperate tests directory
-        entry: bash -c "cd arch && cargo test -p intelligent-prompt-gateway --lib"
+        entry: bash -c "cd crates/llm_gateway && cargo test --lib"
  - repo: https://github.com/psf/black
    rev: 23.1.0
    hooks:
--- a/arch/Dockerfile
+++ b/arch/Dockerfile
@ -2,19 +2,18 @@
 FROM rust:1.80.0 as builder
 RUN rustup -v target add wasm32-wasi
 WORKDIR /arch
-COPY arch/src /arch/src
+COPY crates .
 COPY arch/Cargo.toml /arch/
 COPY arch/Cargo.lock /arch/
 COPY public_types /public_types
-RUN cargo build --release --target wasm32-wasi
+RUN cd prompt_gateway && cargo build --release --target wasm32-wasi
 RUN cd llm_gateway && cargo build --release --target wasm32-wasi
 # copy built filter into envoy image
 FROM envoyproxy/envoy:v1.31-latest as envoy
 #Build config generator, so that we have a single build image for both Rust and Python
 FROM python:3-slim as arch
-COPY --from=builder /arch/target/wasm32-wasi/release/intelligent_prompt_gateway.wasm /etc/envoy/proxy-wasm-plugins/intelligent_prompt_gateway.wasm
+COPY --from=builder /arch/prompt_gateway/target/wasm32-wasi/release/prompt_gateway.wasm /etc/envoy/proxy-wasm-plugins/prompt_gateway.wasm
 COPY --from=builder /arch/llm_gateway/target/wasm32-wasi/release/llm_gateway.wasm /etc/envoy/proxy-wasm-plugins/llm_gateway.wasm
 COPY --from=envoy /usr/local/bin/envoy /usr/local/bin/envoy
 WORKDIR /config
 COPY arch/requirements.txt .
--- a/arch/envoy.template.yaml
+++ b/arch/envoy.template.yaml
@ -90,7 +90,7 @@ static_resources:
                            runtime: "envoy.wasm.runtime.v8"
                            code:
                              local:
-                                filename: "/etc/envoy/proxy-wasm-plugins/intelligent_prompt_gateway.wasm"
+                                filename: "/etc/envoy/proxy-wasm-plugins/prompt_gateway.wasm"
                  - name: envoy.filters.http.router
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
@ -250,7 +250,7 @@ static_resources:
                            runtime: "envoy.wasm.runtime.v8"
                            code:
                              local:
-                                filename: "/etc/envoy/proxy-wasm-plugins/intelligent_prompt_gateway.wasm"
+                                filename: "/etc/envoy/proxy-wasm-plugins/prompt_gateway.wasm"
                  - name: envoy.filters.http.router
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
--- a/arch/grafana/datasource.yaml
+++ b/arch/grafana/datasource.yaml
@ -1,9 +0,0 @@
 apiVersion: 1
 datasources:
 - name: Prometheus
  type: prometheus
  url: http://prometheus:9090
  isDefault: true
  access: proxy
  editable: true
--- a/arch/prometheus/prometheus.yaml
+++ b/arch/prometheus/prometheus.yaml
@ -1,23 +0,0 @@
 global:
  scrape_interval: 15s
  scrape_timeout: 10s
  evaluation_interval: 15s
 alerting:
  alertmanagers:
  - static_configs:
    - targets: []
    scheme: http
    timeout: 10s
    api_version: v1
 scrape_configs:
 - job_name: envoy
  honor_timestamps: true
  scrape_interval: 15s
  scrape_timeout: 10s
  metrics_path: /stats
  scheme: http
  static_configs:
  - targets:
    - envoy:9901
  params:
    format: ['prometheus']
--- a/crates/llm_gateway/Cargo.lock
+++ b/crates/llm_gateway/Cargo.lock
@ -753,29 +753,6 @@ dependencies = [
 "serde",
 ]
 [[package]]
 name = "intelligent-prompt-gateway"
 version = "0.1.0"
 dependencies = [
 "acap",
 "derivative",
 "governor",
 "http",
 "log",
 "md5",
 "proxy-wasm",
 "proxy-wasm-test-framework",
 "public_types",
 "rand",
 "serde",
 "serde_json",
 "serde_yaml",
 "serial_test",
 "sha2",
 "thiserror",
 "tiktoken-rs",
 ]
 [[package]]
 name = "itertools"
 version = "0.12.1"
@ -860,6 +837,29 @@ version = "0.4.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89"
 [[package]]
 name = "llm_gateway"
 version = "0.1.0"
 dependencies = [
 "acap",
 "derivative",
 "governor",
 "http",
 "log",
 "md5",
 "proxy-wasm",
 "proxy-wasm-test-framework",
 "public_types",
 "rand",
 "serde",
 "serde_json",
 "serde_yaml",
 "serial_test",
 "sha2",
 "thiserror",
 "tiktoken-rs",
 ]
 [[package]]
 name = "lock_api"
 version = "0.4.12"
@ -1098,9 +1098,14 @@ dependencies = [
 name = "public_types"
 version = "0.1.0"
 dependencies = [
 "derivative",
 "duration-string",
 "governor",
 "log",
 "proxy-wasm",
 "serde",
 "serde_yaml",
 "thiserror",
 ]
 [[package]]
--- a/crates/llm_gateway/Cargo.toml
+++ b/crates/llm_gateway/Cargo.toml
@ -1,5 +1,5 @@
 [package]
-name = "intelligent-prompt-gateway"
+name = "llm_gateway"
 version = "0.1.0"
 authors = ["Katanemo Inc <info@katanemo.com>"]
 edition = "2021"
--- a/crates/llm_gateway/src/filter_context.rs
+++ b/crates/llm_gateway/src/filter_context.rs
@ -1,11 +1,5 @@
 use crate::consts::{
    ARCH_INTERNAL_CLUSTER_NAME, ARCH_UPSTREAM_HOST_HEADER, DEFAULT_EMBEDDING_MODEL,
    MODEL_SERVER_NAME,
 };
 use crate::http::{CallArgs, Client};
 use crate::llm_providers::LlmProviders;
 use crate::ratelimit;
 use crate::stats::{Counter, Gauge, IncrementingMetric};
 use crate::stream_context::StreamContext;
 use log::debug;
 use proxy_wasm::traits::*;
@ -14,9 +8,18 @@ use public_types::common_types::EmbeddingType;
 use public_types::configuration::{
    Configuration, GatewayMode, Overrides, PromptGuards, PromptTarget,
 };
 use public_types::consts::ARCH_INTERNAL_CLUSTER_NAME;
 use public_types::consts::ARCH_UPSTREAM_HOST_HEADER;
 use public_types::consts::DEFAULT_EMBEDDING_MODEL;
 use public_types::consts::MODEL_SERVER_NAME;
 use public_types::embeddings::{
    CreateEmbeddingRequest, CreateEmbeddingRequestInput, CreateEmbeddingResponse,
 };
 use public_types::http::CallArgs;
 use public_types::http::Client;
 use public_types::stats::Counter;
 use public_types::stats::Gauge;
 use public_types::stats::IncrementingMetric;
 use std::cell::RefCell;
 use std::collections::hash_map::Entry;
 use std::collections::HashMap;
--- a/crates/llm_gateway/src/lib.rs
+++ b/crates/llm_gateway/src/lib.rs
@ -2,13 +2,10 @@ use filter_context::FilterContext;
 use proxy_wasm::traits::*;
 use proxy_wasm::types::*;
 mod consts;
 mod filter_context;
 mod http;
 mod llm_providers;
 mod ratelimit;
 mod routing;
 mod stats;
 mod stream_context;
 mod tokenizer;
--- a/crates/llm_gateway/src/llm_providers.rs
+++ b/crates/llm_gateway/src/llm_providers.rs
--- a/crates/llm_gateway/src/ratelimit.rs
+++ b/crates/llm_gateway/src/ratelimit.rs
--- a/crates/llm_gateway/src/routing.rs
+++ b/crates/llm_gateway/src/routing.rs
--- a/crates/llm_gateway/src/stream_context.rs
+++ b/crates/llm_gateway/src/stream_context.rs
@ -1,16 +1,6 @@
 use crate::consts::{
    ARCH_FC_MODEL_NAME, ARCH_FC_REQUEST_TIMEOUT_MS, ARCH_INTERNAL_CLUSTER_NAME,
    ARCH_LLM_UPSTREAM_LISTENER, ARCH_MESSAGES_KEY, ARCH_MODEL_PREFIX, ARCH_PROVIDER_HINT_HEADER,
    ARCH_ROUTING_HEADER, ARCH_STATE_HEADER, ARCH_UPSTREAM_HOST_HEADER, ARC_FC_CLUSTER,
    CHAT_COMPLETIONS_PATH, DEFAULT_EMBEDDING_MODEL, DEFAULT_HALLUCINATED_THRESHOLD,
    DEFAULT_INTENT_MODEL, DEFAULT_PROMPT_TARGET_THRESHOLD, GPT_35_TURBO, MODEL_SERVER_NAME,
    RATELIMIT_SELECTOR_HEADER_KEY, REQUEST_ID_HEADER, SYSTEM_ROLE, USER_ROLE,
 };
 use crate::filter_context::{EmbeddingsStore, WasmMetrics};
 use crate::http::{CallArgs, Client, ClientError};
 use crate::llm_providers::LlmProviders;
 use crate::ratelimit::Header;
 use crate::stats::IncrementingMetric;
 use crate::{ratelimit, routing, tokenizer};
 use acap::cos;
 use http::StatusCode;
@ -29,9 +19,19 @@ use public_types::common_types::{
 };
 use public_types::configuration::{GatewayMode, LlmProvider};
 use public_types::configuration::{Overrides, PromptGuards, PromptTarget};
 use public_types::consts::{
    ARCH_FC_MODEL_NAME, ARCH_FC_REQUEST_TIMEOUT_MS, ARCH_INTERNAL_CLUSTER_NAME,
    ARCH_LLM_UPSTREAM_LISTENER, ARCH_MESSAGES_KEY, ARCH_MODEL_PREFIX, ARCH_PROVIDER_HINT_HEADER,
    ARCH_ROUTING_HEADER, ARCH_STATE_HEADER, ARCH_UPSTREAM_HOST_HEADER, ARC_FC_CLUSTER,
    CHAT_COMPLETIONS_PATH, DEFAULT_EMBEDDING_MODEL, DEFAULT_HALLUCINATED_THRESHOLD,
    DEFAULT_INTENT_MODEL, DEFAULT_PROMPT_TARGET_THRESHOLD, GPT_35_TURBO, MODEL_SERVER_NAME,
    RATELIMIT_SELECTOR_HEADER_KEY, REQUEST_ID_HEADER, SYSTEM_ROLE, USER_ROLE,
 };
 use public_types::embeddings::{
    CreateEmbeddingRequest, CreateEmbeddingRequestInput, CreateEmbeddingResponse,
 };
 use public_types::http::{CallArgs, Client, ClientError};
 use public_types::stats::Gauge;
 use serde_json::Value;
 use sha2::{Digest, Sha256};
 use std::cell::RefCell;
@ -40,6 +40,8 @@ use std::num::NonZero;
 use std::rc::Rc;
 use std::time::Duration;
 use public_types::stats::IncrementingMetric;
 #[derive(Debug, Clone)]
 enum ResponseHandlerType {
    GetEmbeddings,
@ -753,10 +755,8 @@ impl StreamContext {
                        }
                    }
                }
-            } else {
+            } else if let Some(user_message) = callout_context.user_message.as_ref() {
-                if let Some(user_message) = callout_context.user_message.as_ref() {
+                user_messages.push(user_message.clone());
                    user_messages.push(user_message.clone());
                }
            }
            let user_messages_str = user_messages.join(", ");
            debug!("user messages: {}", user_messages_str);
@ -1570,7 +1570,7 @@ impl Client for StreamContext {
        &self.callouts
    }
-    fn active_http_calls(&self) -> &crate::stats::Gauge {
+    fn active_http_calls(&self) -> &Gauge {
        &self.metrics.active_http_calls
    }
 }
--- a/crates/llm_gateway/src/tokenizer.rs
+++ b/crates/llm_gateway/src/tokenizer.rs
--- a/crates/llm_gateway/tests/integration.rs
+++ b/crates/llm_gateway/tests/integration.rs
--- a/crates/prompt_gateway/Cargo.lock
+++ b/crates/prompt_gateway/Cargo.lock
--- a/crates/prompt_gateway/Cargo.toml
+++ b/crates/prompt_gateway/Cargo.toml
@ -0,0 +1,29 @@
 [package]
 name = "prompt_gateway"
 version = "0.1.0"
 authors = ["Katanemo Inc <info@katanemo.com>"]
 edition = "2021"
 [lib]
 crate-type = ["cdylib"]
 [dependencies]
 proxy-wasm = "0.2.1"
 log = "0.4"
 serde = { version = "1.0", features = ["derive"] }
 serde_yaml = "0.9.34"
 serde_json = "1.0"
 md5 = "0.7.0"
 public_types = { path = "../public_types" }
 http = "1.1.0"
 governor = { version = "0.6.3", default-features = false, features = ["no_std"]}
 tiktoken-rs = "0.5.9"
 acap = "0.3.0"
 rand = "0.8.5"
 thiserror = "1.0.64"
 derivative = "2.2.0"
 sha2 = "0.10.8"
 [dev-dependencies]
 proxy-wasm-test-framework = { git = "https://github.com/katanemo/test-framework.git", branch = "new" }
 serial_test = "3.1.1"
--- a/crates/prompt_gateway/src/filter_context.rs
+++ b/crates/prompt_gateway/src/filter_context.rs
@ -0,0 +1,324 @@
 use crate::llm_providers::LlmProviders;
 use crate::ratelimit;
 use crate::stream_context::StreamContext;
 use log::debug;
 use proxy_wasm::traits::*;
 use proxy_wasm::types::*;
 use public_types::common_types::EmbeddingType;
 use public_types::configuration::{
    Configuration, GatewayMode, Overrides, PromptGuards, PromptTarget,
 };
 use public_types::consts::ARCH_INTERNAL_CLUSTER_NAME;
 use public_types::consts::ARCH_UPSTREAM_HOST_HEADER;
 use public_types::consts::DEFAULT_EMBEDDING_MODEL;
 use public_types::consts::MODEL_SERVER_NAME;
 use public_types::embeddings::{
    CreateEmbeddingRequest, CreateEmbeddingRequestInput, CreateEmbeddingResponse,
 };
 use public_types::http::CallArgs;
 use public_types::http::Client;
 use public_types::stats::Counter;
 use public_types::stats::Gauge;
 use public_types::stats::IncrementingMetric;
 use std::cell::RefCell;
 use std::collections::hash_map::Entry;
 use std::collections::HashMap;
 use std::rc::Rc;
 use std::time::Duration;
 #[derive(Copy, Clone, Debug)]
 pub struct WasmMetrics {
    pub active_http_calls: Gauge,
    pub ratelimited_rq: Counter,
 }
 impl WasmMetrics {
    fn new() -> WasmMetrics {
        WasmMetrics {
            active_http_calls: Gauge::new(String::from("active_http_calls")),
            ratelimited_rq: Counter::new(String::from("ratelimited_rq")),
        }
    }
 }
 pub type EmbeddingTypeMap = HashMap<EmbeddingType, Vec<f64>>;
 pub type EmbeddingsStore = HashMap<String, EmbeddingTypeMap>;
 #[derive(Debug)]
 pub struct FilterCallContext {
    pub prompt_target_name: String,
    pub embedding_type: EmbeddingType,
 }
 #[derive(Debug)]
 pub struct FilterContext {
    metrics: Rc<WasmMetrics>,
    // callouts stores token_id to request mapping that we use during #on_http_call_response to match the response to the request.
    callouts: RefCell<HashMap<u32, FilterCallContext>>,
    overrides: Rc<Option<Overrides>>,
    system_prompt: Rc<Option<String>>,
    prompt_targets: Rc<HashMap<String, PromptTarget>>,
    mode: GatewayMode,
    prompt_guards: Rc<PromptGuards>,
    llm_providers: Option<Rc<LlmProviders>>,
    embeddings_store: Option<Rc<EmbeddingsStore>>,
    temp_embeddings_store: EmbeddingsStore,
 }
 impl FilterContext {
    pub fn new() -> FilterContext {
        FilterContext {
            callouts: RefCell::new(HashMap::new()),
            metrics: Rc::new(WasmMetrics::new()),
            system_prompt: Rc::new(None),
            prompt_targets: Rc::new(HashMap::new()),
            overrides: Rc::new(None),
            prompt_guards: Rc::new(PromptGuards::default()),
            mode: GatewayMode::Prompt,
            llm_providers: None,
            embeddings_store: Some(Rc::new(HashMap::new())),
            temp_embeddings_store: HashMap::new(),
        }
    }
    fn process_prompt_targets(&self) {
        for values in self.prompt_targets.iter() {
            let prompt_target = values.1;
            self.schedule_embeddings_call(
                &prompt_target.name,
                &prompt_target.description,
                EmbeddingType::Description,
            );
        }
    }
    fn schedule_embeddings_call(
        &self,
        prompt_target_name: &str,
        input: &str,
        embedding_type: EmbeddingType,
    ) {
        let embeddings_input = CreateEmbeddingRequest {
            input: Box::new(CreateEmbeddingRequestInput::String(String::from(input))),
            model: String::from(DEFAULT_EMBEDDING_MODEL),
            encoding_format: None,
            dimensions: None,
            user: None,
        };
        let json_data = serde_json::to_string(&embeddings_input).unwrap();
        let call_args = CallArgs::new(
            ARCH_INTERNAL_CLUSTER_NAME,
            "/embeddings",
            vec![
                (ARCH_UPSTREAM_HOST_HEADER, MODEL_SERVER_NAME),
                (":method", "POST"),
                (":path", "/embeddings"),
                (":authority", MODEL_SERVER_NAME),
                ("content-type", "application/json"),
                ("x-envoy-upstream-rq-timeout-ms", "60000"),
            ],
            Some(json_data.as_bytes()),
            vec![],
            Duration::from_secs(60),
        );
        let call_context = crate::filter_context::FilterCallContext {
            prompt_target_name: String::from(prompt_target_name),
            embedding_type,
        };
        if let Err(error) = self.http_call(call_args, call_context) {
            panic!("{error}")
        }
    }
    fn embedding_response_handler(
        &mut self,
        body_size: usize,
        embedding_type: EmbeddingType,
        prompt_target_name: String,
    ) {
        let prompt_target = self
            .prompt_targets
            .get(&prompt_target_name)
            .unwrap_or_else(|| {
                panic!(
                    "Received embeddings response for unknown prompt target name={}",
                    prompt_target_name
                )
            });
        let body = self
            .get_http_call_response_body(0, body_size)
            .expect("No body in response");
        if !body.is_empty() {
            let mut embedding_response: CreateEmbeddingResponse =
                match serde_json::from_slice(&body) {
                    Ok(response) => response,
                    Err(e) => {
                        panic!(
                            "Error deserializing embedding response. body: {:?}: {:?}",
                            String::from_utf8(body).unwrap(),
                            e
                        );
                    }
                };
            let embeddings = embedding_response.data.remove(0).embedding;
            debug!(
                    "Adding embeddings for prompt target name: {:?}, description: {:?}, embedding type: {:?}",
                    prompt_target.name,
                    prompt_target.description,
                    embedding_type
                );
            let entry = self.temp_embeddings_store.entry(prompt_target_name);
            match entry {
                Entry::Occupied(_) => {
                    entry.and_modify(|e| {
                        if let Entry::Vacant(e) = e.entry(embedding_type) {
                            e.insert(embeddings);
                        } else {
                            panic!(
                                "Duplicate {:?} for prompt target with name=\"{}\"",
                                &embedding_type, prompt_target.name
                            )
                        }
                    });
                }
                Entry::Vacant(_) => {
                    entry.or_insert(HashMap::from([(embedding_type, embeddings)]));
                }
            }
            if self.prompt_targets.len() == self.temp_embeddings_store.len() {
                self.embeddings_store =
                    Some(Rc::new(std::mem::take(&mut self.temp_embeddings_store)))
            }
        }
    }
 }
 impl Client for FilterContext {
    type CallContext = FilterCallContext;
    fn callouts(&self) -> &RefCell<HashMap<u32, Self::CallContext>> {
        &self.callouts
    }
    fn active_http_calls(&self) -> &Gauge {
        &self.metrics.active_http_calls
    }
 }
 impl Context for FilterContext {
    fn on_http_call_response(
        &mut self,
        token_id: u32,
        _num_headers: usize,
        body_size: usize,
        _num_trailers: usize,
    ) {
        debug!(
            "filter_context: on_http_call_response called with token_id: {:?}",
            token_id
        );
        let callout_data = self
            .callouts
            .borrow_mut()
            .remove(&token_id)
            .expect("invalid token_id");
        self.metrics.active_http_calls.increment(-1);
        self.embedding_response_handler(
            body_size,
            callout_data.embedding_type,
            callout_data.prompt_target_name,
        )
    }
 }
 // RootContext allows the Rust code to reach into the Envoy Config
 impl RootContext for FilterContext {
    fn on_configure(&mut self, _: usize) -> bool {
        let config_bytes = self
            .get_plugin_configuration()
            .expect("Arch config cannot be empty");
        let config: Configuration = match serde_yaml::from_slice(&config_bytes) {
            Ok(config) => config,
            Err(err) => panic!("Invalid arch config \"{:?}\"", err),
        };
        self.overrides = Rc::new(config.overrides);
        let mut prompt_targets = HashMap::new();
        for pt in config.prompt_targets {
            prompt_targets.insert(pt.name.clone(), pt.clone());
        }
        self.system_prompt = Rc::new(config.system_prompt);
        self.prompt_targets = Rc::new(prompt_targets);
        self.mode = config.mode.unwrap_or_default();
        ratelimit::ratelimits(Some(config.ratelimits.unwrap_or_default()));
        if let Some(prompt_guards) = config.prompt_guards {
            self.prompt_guards = Rc::new(prompt_guards)
        }
        match config.llm_providers.try_into() {
            Ok(llm_providers) => self.llm_providers = Some(Rc::new(llm_providers)),
            Err(err) => panic!("{err}"),
        }
        true
    }
    fn create_http_context(&self, context_id: u32) -> Option<Box<dyn HttpContext>> {
        debug!(
            "||| create_http_context called with context_id: {:?} |||",
            context_id
        );
        // No StreamContext can be created until the Embedding Store is fully initialized.
        let embedding_store = match self.mode {
            GatewayMode::Llm => None,
            GatewayMode::Prompt => Some(Rc::clone(self.embeddings_store.as_ref().unwrap())),
        };
        Some(Box::new(StreamContext::new(
            context_id,
            Rc::clone(&self.metrics),
            Rc::clone(&self.system_prompt),
            Rc::clone(&self.prompt_targets),
            Rc::clone(&self.prompt_guards),
            Rc::clone(&self.overrides),
            Rc::clone(
                self.llm_providers
                    .as_ref()
                    .expect("LLM Providers must exist when Streams are being created"),
            ),
            embedding_store,
            self.mode.clone(),
        )))
    }
    fn get_type(&self) -> Option<ContextType> {
        Some(ContextType::HttpContext)
    }
    fn on_vm_start(&mut self, _: usize) -> bool {
        self.set_tick_period(Duration::from_secs(1));
        true
    }
    fn on_tick(&mut self) {
        debug!("starting up arch filter in mode: {:?}", self.mode);
        if self.mode == GatewayMode::Prompt {
            self.process_prompt_targets();
        }
        self.set_tick_period(Duration::from_secs(0));
    }
 }
--- a/crates/prompt_gateway/src/lib.rs
+++ b/crates/prompt_gateway/src/lib.rs
@ -0,0 +1,17 @@
 use filter_context::FilterContext;
 use proxy_wasm::traits::*;
 use proxy_wasm::types::*;
 mod filter_context;
 mod llm_providers;
 mod ratelimit;
 mod routing;
 mod stream_context;
 mod tokenizer;
 proxy_wasm::main! {{
    proxy_wasm::set_log_level(LogLevel::Trace);
    proxy_wasm::set_root_context(|_| -> Box<dyn RootContext> {
        Box::new(FilterContext::new())
    });
 }}
--- a/crates/prompt_gateway/src/llm_providers.rs
+++ b/crates/prompt_gateway/src/llm_providers.rs
@ -0,0 +1,69 @@
 use public_types::configuration::LlmProvider;
 use std::collections::HashMap;
 use std::rc::Rc;
 #[derive(Debug)]
 pub struct LlmProviders {
    providers: HashMap<String, Rc<LlmProvider>>,
    default: Option<Rc<LlmProvider>>,
 }
 impl LlmProviders {
    pub fn iter(&self) -> std::collections::hash_map::Iter<'_, String, Rc<LlmProvider>> {
        self.providers.iter()
    }
    pub fn default(&self) -> Option<Rc<LlmProvider>> {
        self.default.as_ref().map(|rc| rc.clone())
    }
    pub fn get(&self, name: &str) -> Option<Rc<LlmProvider>> {
        self.providers.get(name).cloned()
    }
 }
 #[derive(thiserror::Error, Debug)]
 pub enum LlmProvidersNewError {
    #[error("There must be at least one LLM Provider")]
    EmptySource,
    #[error("There must be at most one default LLM Provider")]
    MoreThanOneDefault,
    #[error("\'{0}\' is not a unique name")]
    DuplicateName(String),
 }
 impl TryFrom<Vec<LlmProvider>> for LlmProviders {
    type Error = LlmProvidersNewError;
    fn try_from(llm_providers_config: Vec<LlmProvider>) -> Result<Self, Self::Error> {
        if llm_providers_config.is_empty() {
            return Err(LlmProvidersNewError::EmptySource);
        }
        let mut llm_providers = LlmProviders {
            providers: HashMap::new(),
            default: None,
        };
        for llm_provider in llm_providers_config {
            let llm_provider: Rc<LlmProvider> = Rc::new(llm_provider);
            if llm_provider.default.unwrap_or_default() {
                match llm_providers.default {
                    Some(_) => return Err(LlmProvidersNewError::MoreThanOneDefault),
                    None => llm_providers.default = Some(Rc::clone(&llm_provider)),
                }
            }
            // Insert and check that there is no other provider with the same name.
            let name = llm_provider.name.clone();
            if llm_providers
                .providers
                .insert(name.clone(), llm_provider)
                .is_some()
            {
                return Err(LlmProvidersNewError::DuplicateName(name));
            }
        }
        Ok(llm_providers)
    }
 }
--- a/crates/prompt_gateway/src/ratelimit.rs
+++ b/crates/prompt_gateway/src/ratelimit.rs
@ -0,0 +1,450 @@
 use governor::{DefaultKeyedRateLimiter, InsufficientCapacity, Quota};
 use log::debug;
 use public_types::configuration;
 use public_types::configuration::{Limit, Ratelimit, TimeUnit};
 use std::fmt::Display;
 use std::num::{NonZero, NonZeroU32};
 use std::sync::RwLock;
 use std::{collections::HashMap, sync::OnceLock};
 pub type RatelimitData = RwLock<RatelimitMap>;
 pub fn ratelimits(ratelimits_config: Option<Vec<Ratelimit>>) -> &'static RatelimitData {
    static RATELIMIT_DATA: OnceLock<RatelimitData> = OnceLock::new();
    RATELIMIT_DATA.get_or_init(|| {
        RwLock::new(RatelimitMap::new(
            ratelimits_config.expect("The initialization call has to have passed a config"),
        ))
    })
 }
 // The Data Structure is laid out in the following way:
 // Provider -> Hash { Header -> Limit }.
 // If the Header used to configure the given Limit:
 //   a) Has None value, then there will be N Limit keyed by the Header value.
 //   b) Has Some() value, then there will be 1 Limit keyed by the empty string.
 // It would have been nicer to use a non-keyed limit for b). However, the type system made that option a nightmare.
 pub struct RatelimitMap {
    datastore: HashMap<String, HashMap<configuration::Header, DefaultKeyedRateLimiter<String>>>,
 }
 // This version of Header demands that the user passes a header value to match on.
 #[derive(Debug, Clone)]
 pub struct Header {
    pub key: String,
    pub value: String,
 }
 impl Display for Header {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{self:?}")
    }
 }
 impl From<Header> for configuration::Header {
    fn from(header: Header) -> Self {
        Self {
            key: header.key,
            value: Some(header.value),
        }
    }
 }
 #[derive(Debug, thiserror::Error)]
 pub enum Error {
    #[error("exceeded limit provider={provider}, selector={selector}, tokens_used={tokens_used}")]
    ExceededLimit {
        provider: String,
        selector: Header,
        tokens_used: NonZeroU32,
    },
 }
 impl RatelimitMap {
    // n.b new is private so that the only access to the Ratelimits can be done via the static
    // reference inside a RwLock via ratelimit::ratelimits().
    fn new(ratelimits_config: Vec<Ratelimit>) -> Self {
        let mut new_ratelimit_map = RatelimitMap {
            datastore: HashMap::new(),
        };
        for ratelimit_config in ratelimits_config {
            let limit = DefaultKeyedRateLimiter::keyed(get_quota(ratelimit_config.limit));
            match new_ratelimit_map.datastore.get_mut(&ratelimit_config.model) {
                Some(limits) => match limits.get_mut(&ratelimit_config.selector) {
                    Some(_) => {
                        panic!("repeated selector. Selectors per provider must be unique")
                    }
                    None => {
                        limits.insert(ratelimit_config.selector, limit);
                    }
                },
                None => {
                    // The provider has not been seen before.
                    // Insert the provider and a new HashMap with the specified limit
                    let new_hash_map = HashMap::from([(ratelimit_config.selector, limit)]);
                    new_ratelimit_map
                        .datastore
                        .insert(ratelimit_config.model, new_hash_map);
                }
            }
        }
        new_ratelimit_map
    }
    #[allow(unused)]
    pub fn check_limit(
        &self,
        provider: String,
        selector: Header,
        tokens_used: NonZeroU32,
    ) -> Result<(), Error> {
        debug!(
            "Checking limit for provider={}, with selector={:?}, consuming tokens={:?}",
            provider, selector, tokens_used
        );
        let provider_limits = match self.datastore.get(&provider) {
            None => {
                // No limit configured for this provider, hence ok.
                return Ok(());
            }
            Some(limit) => limit,
        };
        let mut config_selector = configuration::Header::from(selector.clone());
        let (limit, limit_key) = match provider_limits.get(&config_selector) {
            // This is a specific limit, i.e one that was configured with both key, and value.
            // Therefore, the key for the internal limit does not matter, and hence the empty string is always returned.
            Some(limit) => (limit, String::from("")),
            None => {
                // Unwrap is ok here because we _know_ the value exists.
                let header_key = config_selector.value.take().unwrap();
                // Search for less specific limit, i.e, one that was configured without a value, therefore every Header
                // value has its own key in the internal limit.
                match provider_limits.get(&config_selector) {
                    Some(limit) => (limit, header_key),
                    // No limit for that header key, value pair exists within that provider limits.
                    None => {
                        return Ok(());
                    }
                }
            }
        };
        match limit.check_key_n(&limit_key, tokens_used) {
            Ok(Ok(())) => Ok(()),
            Ok(Err(_)) | Err(InsufficientCapacity(_)) => Err(Error::ExceededLimit {
                provider,
                selector,
                tokens_used,
            }),
        }
    }
 }
 fn get_quota(limit: Limit) -> Quota {
    let tokens = NonZero::new(limit.tokens).expect("Limit's tokens must be positive");
    match limit.unit {
        TimeUnit::Second => Quota::per_second(tokens),
        TimeUnit::Minute => Quota::per_minute(tokens),
        TimeUnit::Hour => Quota::per_hour(tokens),
    }
 }
 // The following tests are inside the ratelimit module in order to access RatelimitMap::new() in order to provide
 // different configuration values per test.
 #[test]
 fn non_existent_provider_is_ok() {
    let ratelimits_config = vec![Ratelimit {
        model: String::from("provider"),
        selector: configuration::Header {
            key: String::from("only-key"),
            value: None,
        },
        limit: Limit {
            tokens: 100,
            unit: TimeUnit::Minute,
        },
    }];
    let ratelimits = RatelimitMap::new(ratelimits_config);
    assert!(ratelimits
        .check_limit(
            String::from("non-existent-provider"),
            Header {
                key: String::from("key"),
                value: String::from("value"),
            },
            NonZero::new(5000).unwrap(),
        )
        .is_ok())
 }
 #[test]
 fn non_existent_key_is_ok() {
    let ratelimits_config = vec![Ratelimit {
        model: String::from("provider"),
        selector: configuration::Header {
            key: String::from("only-key"),
            value: None,
        },
        limit: Limit {
            tokens: 100,
            unit: TimeUnit::Minute,
        },
    }];
    let ratelimits = RatelimitMap::new(ratelimits_config);
    assert!(ratelimits
        .check_limit(
            String::from("provider"),
            Header {
                key: String::from("key"),
                value: String::from("value"),
            },
            NonZero::new(5000).unwrap(),
        )
        .is_ok())
 }
 #[test]
 fn specific_limit_does_not_catch_non_specific_value() {
    let ratelimits_config = vec![Ratelimit {
        model: String::from("provider"),
        selector: configuration::Header {
            key: String::from("key"),
            value: Some(String::from("value")),
        },
        limit: Limit {
            tokens: 200,
            unit: TimeUnit::Second,
        },
    }];
    let ratelimits = RatelimitMap::new(ratelimits_config);
    assert!(ratelimits
        .check_limit(
            String::from("provider"),
            Header {
                key: String::from("key"),
                value: String::from("not-the-correct-value"),
            },
            NonZero::new(5000).unwrap(),
        )
        .is_ok())
 }
 #[test]
 fn specific_limit_is_hit() {
    let ratelimits_config = vec![Ratelimit {
        model: String::from("provider"),
        selector: configuration::Header {
            key: String::from("key"),
            value: Some(String::from("value")),
        },
        limit: Limit {
            tokens: 200,
            unit: TimeUnit::Hour,
        },
    }];
    let ratelimits = RatelimitMap::new(ratelimits_config);
    assert!(ratelimits
        .check_limit(
            String::from("provider"),
            Header {
                key: String::from("key"),
                value: String::from("value"),
            },
            NonZero::new(5000).unwrap(),
        )
        .is_err())
 }
 #[test]
 fn non_specific_key_has_different_limits_for_different_values() {
    let ratelimits_config = vec![Ratelimit {
        model: String::from("provider"),
        selector: configuration::Header {
            key: String::from("only-key"),
            value: None,
        },
        limit: Limit {
            tokens: 100,
            unit: TimeUnit::Hour,
        },
    }];
    let ratelimits = RatelimitMap::new(ratelimits_config);
    // Value1 takes 50.
    assert!(ratelimits
        .check_limit(
            String::from("provider"),
            Header {
                key: String::from("only-key"),
                value: String::from("value1"),
            },
            NonZero::new(50).unwrap(),
        )
        .is_ok());
    // value2 takes 60 because it has its own 100 limit
    assert!(ratelimits
        .check_limit(
            String::from("provider"),
            Header {
                key: String::from("only-key"),
                value: String::from("value2"),
            },
            NonZero::new(60).unwrap(),
        )
        .is_ok());
    // However value1 cannot take more than 100 per hour which 50+70 = 120
    assert!(ratelimits
        .check_limit(
            String::from("provider"),
            Header {
                key: String::from("only-key"),
                value: String::from("value1"),
            },
            NonZero::new(70).unwrap(),
        )
        .is_err())
 }
 #[test]
 fn different_provider_can_have_different_limits_with_the_same_keys() {
    let ratelimits_config = vec![
        Ratelimit {
            model: String::from("first_provider"),
            selector: configuration::Header {
                key: String::from("key"),
                value: Some(String::from("value")),
            },
            limit: Limit {
                tokens: 100,
                unit: TimeUnit::Hour,
            },
        },
        Ratelimit {
            model: String::from("second_provider"),
            selector: configuration::Header {
                key: String::from("key"),
                value: Some(String::from("value")),
            },
            limit: Limit {
                tokens: 200,
                unit: TimeUnit::Hour,
            },
        },
    ];
    let ratelimits = RatelimitMap::new(ratelimits_config);
    assert!(ratelimits
        .check_limit(
            String::from("first_provider"),
            Header {
                key: String::from("key"),
                value: String::from("value"),
            },
            NonZero::new(100).unwrap(),
        )
        .is_ok());
    assert!(ratelimits
        .check_limit(
            String::from("second_provider"),
            Header {
                key: String::from("key"),
                value: String::from("value"),
            },
            NonZero::new(200).unwrap(),
        )
        .is_ok());
    assert!(ratelimits
        .check_limit(
            String::from("first_provider"),
            Header {
                key: String::from("key"),
                value: String::from("value"),
            },
            NonZero::new(1).unwrap(),
        )
        .is_err());
    assert!(ratelimits
        .check_limit(
            String::from("second_provider"),
            Header {
                key: String::from("key"),
                value: String::from("value"),
            },
            NonZero::new(1).unwrap(),
        )
        .is_err());
 }
 // These tests use the publicly exposed static singleton, thus the same configuration is used in every test.
 // If more tests are written here, move the initial call out of the test.
 #[cfg(test)]
 mod test {
    use super::ratelimits;
    use configuration::{Limit, Ratelimit, TimeUnit};
    use public_types::configuration;
    use std::num::NonZero;
    use std::thread;
    #[test]
    fn make_ratelimits_optional() {
        let ratelimits_config = Vec::new();
        // Initialize in the main thread.
        ratelimits(Some(ratelimits_config));
    }
    #[test]
    fn different_threads_have_same_ratelimit_data_structure() {
        let ratelimits_config = Some(vec![Ratelimit {
            model: String::from("provider"),
            selector: configuration::Header {
                key: String::from("key"),
                value: Some(String::from("value")),
            },
            limit: Limit {
                tokens: 200,
                unit: TimeUnit::Hour,
            },
        }]);
        // Initialize in the main thread.
        ratelimits(ratelimits_config);
        // Use the singleton in a different thread.
        thread::spawn(|| {
            let ratelimits = ratelimits(None);
            assert!(ratelimits
                .read()
                .unwrap()
                .check_limit(
                    String::from("provider"),
                    super::Header {
                        key: String::from("key"),
                        value: String::from("value"),
                    },
                    NonZero::new(5000).unwrap(),
                )
                .is_err())
        });
    }
 }
--- a/crates/prompt_gateway/src/routing.rs
+++ b/crates/prompt_gateway/src/routing.rs
@ -0,0 +1,50 @@
 use std::rc::Rc;
 use crate::llm_providers::LlmProviders;
 use log::debug;
 use public_types::configuration::LlmProvider;
 use rand::{seq::IteratorRandom, thread_rng};
 #[derive(Debug)]
 pub enum ProviderHint {
    Default,
    Name(String),
 }
 impl From<String> for ProviderHint {
    fn from(value: String) -> Self {
        match value.as_str() {
            "default" => ProviderHint::Default,
            _ => ProviderHint::Name(value),
        }
    }
 }
 pub fn get_llm_provider(
    llm_providers: &LlmProviders,
    provider_hint: Option<ProviderHint>,
 ) -> Rc<LlmProvider> {
    let maybe_provider = provider_hint.and_then(|hint| match hint {
        ProviderHint::Default => llm_providers.default(),
        // FIXME: should a non-existent name in the hint be more explicit? i.e, return a BAD_REQUEST?
        ProviderHint::Name(name) => llm_providers.get(&name),
    });
    if let Some(provider) = maybe_provider {
        return provider;
    }
    if llm_providers.default().is_some() {
        debug!("no llm provider found for hint, using default llm provider");
        return llm_providers.default().unwrap();
    }
    debug!("no default llm found, using random llm provider");
    let mut rng = thread_rng();
    llm_providers
        .iter()
        .choose(&mut rng)
        .expect("There should always be at least one llm provider")
        .1
        .clone()
 }
--- a/crates/prompt_gateway/src/stream_context.rs
+++ b/crates/prompt_gateway/src/stream_context.rs
--- a/crates/prompt_gateway/src/tokenizer.rs
+++ b/crates/prompt_gateway/src/tokenizer.rs
@ -0,0 +1,39 @@
 use log::debug;
 #[derive(Debug, PartialEq, Eq)]
 #[allow(dead_code)]
 pub enum Error {
    UnknownModel,
    FailedToTokenize,
 }
 #[allow(dead_code)]
 pub fn token_count(model_name: &str, text: &str) -> Result<usize, Error> {
    debug!("getting token count model={}", model_name);
    // Consideration: is it more expensive to instantiate the BPE object every time, or to contend the singleton?
    let bpe = tiktoken_rs::get_bpe_from_model(model_name).map_err(|_| Error::UnknownModel)?;
    Ok(bpe.encode_ordinary(text).len())
 }
 #[cfg(test)]
 mod test {
    use super::*;
    #[test]
    fn encode_ordinary() {
        let model_name = "gpt-3.5-turbo";
        let text = "How many tokens does this sentence have?";
        assert_eq!(
            8,
            token_count(model_name, text).expect("correct tokenization")
        );
    }
    #[test]
    fn unrecognized_model() {
        assert_eq!(
            Error::UnknownModel,
            token_count("unknown", "").expect_err("unknown model")
        )
    }
 }
--- a/crates/prompt_gateway/tests/integration.rs
+++ b/crates/prompt_gateway/tests/integration.rs
@ -0,0 +1,805 @@
 use http::StatusCode;
 use proxy_wasm_test_framework::tester::{self, Tester};
 use proxy_wasm_test_framework::types::{
    Action, BufferType, LogLevel, MapType, MetricType, ReturnType,
 };
 use public_types::common_types::open_ai::{ChatCompletionsResponse, Choice, Message, Usage};
 use public_types::common_types::open_ai::{FunctionCallDetail, ToolCall, ToolType};
 use public_types::common_types::{HallucinationClassificationResponse, PromptGuardResponse};
 use public_types::embeddings::{
    create_embedding_response, embedding, CreateEmbeddingResponse, CreateEmbeddingResponseUsage,
    Embedding,
 };
 use public_types::{common_types::ZeroShotClassificationResponse, configuration::Configuration};
 use serde_yaml::Value;
 use serial_test::serial;
 use std::collections::HashMap;
 use std::path::Path;
 fn wasm_module() -> String {
    let wasm_file = Path::new("target/wasm32-wasi/release/intelligent_prompt_gateway.wasm");
    assert!(
        wasm_file.exists(),
        "Run `cargo build --release --target=wasm32-wasi` first"
    );
    wasm_file.to_str().unwrap().to_string()
 }
 fn request_headers_expectations(module: &mut Tester, http_context: i32) {
    module
        .call_proxy_on_request_headers(http_context, 0, false)
        .expect_get_header_map_value(
            Some(MapType::HttpRequestHeaders),
            Some("x-arch-llm-provider-hint"),
        )
        .returning(Some("default"))
        .expect_log(Some(LogLevel::Debug), None)
        .expect_add_header_map_value(
            Some(MapType::HttpRequestHeaders),
            Some("x-arch-upstream"),
            Some("arch_llm_listener"),
        )
        .expect_add_header_map_value(
            Some(MapType::HttpRequestHeaders),
            Some("x-arch-llm-provider"),
            Some("open-ai-gpt-4"),
        )
        .expect_replace_header_map_value(
            Some(MapType::HttpRequestHeaders),
            Some("Authorization"),
            Some("Bearer secret_key"),
        )
        .expect_remove_header_map_value(Some(MapType::HttpRequestHeaders), Some("content-length"))
        .expect_get_header_map_value(
            Some(MapType::HttpRequestHeaders),
            Some("x-arch-ratelimit-selector"),
        )
        .returning(Some("selector-key"))
        .expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some("selector-key"))
        .returning(Some("selector-value"))
        .expect_get_header_map_pairs(Some(MapType::HttpRequestHeaders))
        .returning(None)
        .expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some(":path"))
        .returning(Some("/v1/chat/completions"))
        .expect_log(Some(LogLevel::Debug), None)
        .expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some("x-request-id"))
        .returning(None)
        .execute_and_expect(ReturnType::Action(Action::Continue))
        .unwrap();
 }
 fn normal_flow(module: &mut Tester, filter_context: i32, http_context: i32) {
    module
        .call_proxy_on_context_create(http_context, filter_context)
        .expect_log(Some(LogLevel::Debug), None)
        .execute_and_expect(ReturnType::None)
        .unwrap();
    request_headers_expectations(module, http_context);
    // Request Body
    let chat_completions_request_body = "\
 {\
    \"messages\": [\
    {\
        \"role\": \"system\",\
        \"content\": \"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.\"\
    },\
    {\
        \"role\": \"user\",\
        \"content\": \"Compose a poem that explains the concept of recursion in programming.\"\
    }\
    ],\
    \"model\": \"gpt-4\"\
 }";
    module
        .call_proxy_on_request_body(
            http_context,
            chat_completions_request_body.len() as i32,
            true,
        )
        .expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
        .returning(Some(chat_completions_request_body))
        // The actual call is not important in this test, we just need to grab the token_id
        .expect_http_call(
            Some("arch_internal"),
            Some(vec![
                ("x-arch-upstream", "model_server"),
                (":method", "POST"),
                (":path", "/guard"),
                (":authority", "model_server"),
                ("content-type", "application/json"),
                ("x-envoy-max-retries", "3"),
                ("x-envoy-upstream-rq-timeout-ms", "60000"),
            ]),
            None,
            None,
            None,
        )
        .returning(Some(1))
        .expect_log(Some(LogLevel::Debug), None)
        .expect_metric_increment("active_http_calls", 1)
        .execute_and_expect(ReturnType::Action(Action::Pause))
        .unwrap();
    let prompt_guard_response = PromptGuardResponse {
        toxic_prob: None,
        toxic_verdict: None,
        jailbreak_prob: None,
        jailbreak_verdict: None,
    };
    let prompt_guard_response_buffer = serde_json::to_string(&prompt_guard_response).unwrap();
    module
        .call_proxy_on_http_call_response(
            http_context,
            1,
            0,
            prompt_guard_response_buffer.len() as i32,
            0,
        )
        .expect_metric_increment("active_http_calls", -1)
        .expect_get_buffer_bytes(Some(BufferType::HttpCallResponseBody))
        .returning(Some(&prompt_guard_response_buffer))
        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Debug), None)
        .expect_http_call(
            Some("arch_internal"),
            Some(vec![
                ("x-arch-upstream", "model_server"),
                (":method", "POST"),
                (":path", "/embeddings"),
                (":authority", "model_server"),
                ("content-type", "application/json"),
                ("x-envoy-max-retries", "3"),
                ("x-envoy-upstream-rq-timeout-ms", "60000"),
            ]),
            None,
            None,
            None,
        )
        .returning(Some(2))
        .expect_metric_increment("active_http_calls", 1)
        .expect_log(Some(LogLevel::Debug), None)
        .execute_and_expect(ReturnType::None)
        .unwrap();
    let embedding_response = CreateEmbeddingResponse {
        data: vec![Embedding {
            index: 0,
            embedding: vec![],
            object: embedding::Object::default(),
        }],
        model: String::from("test"),
        object: create_embedding_response::Object::default(),
        usage: Box::new(CreateEmbeddingResponseUsage::new(0, 0)),
    };
    let embeddings_response_buffer = serde_json::to_string(&embedding_response).unwrap();
    module
        .call_proxy_on_http_call_response(
            http_context,
            2,
            0,
            embeddings_response_buffer.len() as i32,
            0,
        )
        .expect_metric_increment("active_http_calls", -1)
        .expect_get_buffer_bytes(Some(BufferType::HttpCallResponseBody))
        .returning(Some(&embeddings_response_buffer))
        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Debug), None)
        .expect_http_call(
            Some("arch_internal"),
            Some(vec![
                ("x-arch-upstream", "model_server"),
                (":method", "POST"),
                (":path", "/zeroshot"),
                (":authority", "model_server"),
                ("content-type", "application/json"),
                ("x-envoy-max-retries", "3"),
                ("x-envoy-upstream-rq-timeout-ms", "60000"),
            ]),
            None,
            None,
            None,
        )
        .returning(Some(3))
        .expect_metric_increment("active_http_calls", 1)
        .expect_log(Some(LogLevel::Debug), None)
        .execute_and_expect(ReturnType::None)
        .unwrap();
    let zero_shot_response = ZeroShotClassificationResponse {
        predicted_class: "weather_forecast".to_string(),
        predicted_class_score: 0.1,
        scores: HashMap::new(),
        model: "test-model".to_string(),
    };
    let zeroshot_intent_detection_buffer = serde_json::to_string(&zero_shot_response).unwrap();
    module
        .call_proxy_on_http_call_response(
            http_context,
            3,
            0,
            zeroshot_intent_detection_buffer.len() as i32,
            0,
        )
        .expect_metric_increment("active_http_calls", -1)
        .expect_get_buffer_bytes(Some(BufferType::HttpCallResponseBody))
        .returning(Some(&zeroshot_intent_detection_buffer))
        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Info), None)
        .expect_http_call(
            Some("arch_internal"),
            Some(vec![
                (":method", "POST"),
                ("x-arch-upstream", "arch_fc"),
                (":path", "/v1/chat/completions"),
                (":authority", "arch_fc"),
                ("content-type", "application/json"),
                ("x-envoy-max-retries", "3"),
                ("x-envoy-upstream-rq-timeout-ms", "120000"),
            ]),
            None,
            None,
            None,
        )
        .returning(Some(4))
        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Debug), None)
        .expect_metric_increment("active_http_calls", 1)
        .execute_and_expect(ReturnType::None)
        .unwrap();
 }
 fn setup_filter(module: &mut Tester, config: &str) -> i32 {
    let filter_context = 1;
    module
        .call_proxy_on_context_create(filter_context, 0)
        .expect_metric_creation(MetricType::Gauge, "active_http_calls")
        .expect_metric_creation(MetricType::Counter, "ratelimited_rq")
        .execute_and_expect(ReturnType::None)
        .unwrap();
    module
        .call_proxy_on_configure(filter_context, config.len() as i32)
        .expect_get_buffer_bytes(Some(BufferType::PluginConfiguration))
        .returning(Some(config))
        .execute_and_expect(ReturnType::Bool(true))
        .unwrap();
    module
        .call_proxy_on_tick(filter_context)
        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Debug), None)
        .expect_http_call(
            Some("arch_internal"),
            Some(vec![
                ("x-arch-upstream", "model_server"),
                (":method", "POST"),
                (":path", "/embeddings"),
                (":authority", "model_server"),
                ("content-type", "application/json"),
                ("x-envoy-upstream-rq-timeout-ms", "60000"),
            ]),
            None,
            None,
            None,
        )
        .returning(Some(101))
        .expect_metric_increment("active_http_calls", 1)
        .expect_set_tick_period_millis(Some(0))
        .execute_and_expect(ReturnType::None)
        .unwrap();
    let embedding_response = CreateEmbeddingResponse {
        data: vec![Embedding {
            embedding: vec![],
            index: 0,
            object: embedding::Object::default(),
        }],
        model: String::from("test"),
        object: create_embedding_response::Object::default(),
        usage: Box::new(CreateEmbeddingResponseUsage {
            prompt_tokens: 0,
            total_tokens: 0,
        }),
    };
    let embedding_response_str = serde_json::to_string(&embedding_response).unwrap();
    module
        .call_proxy_on_http_call_response(
            filter_context,
            101,
            0,
            embedding_response_str.len() as i32,
            0,
        )
        .expect_log(
            Some(LogLevel::Debug),
            Some(
                format!(
                    "filter_context: on_http_call_response called with token_id: {:?}",
                    101
                )
                .as_str(),
            ),
        )
        .expect_metric_increment("active_http_calls", -1)
        .expect_get_buffer_bytes(Some(BufferType::HttpCallResponseBody))
        .returning(Some(&embedding_response_str))
        .expect_log(Some(LogLevel::Debug), None)
        .execute_and_expect(ReturnType::None)
        .unwrap();
    filter_context
 }
 fn default_config() -> &'static str {
    r#"
 version: "0.1-beta"
 listener:
  address: 0.0.0.0
  port: 10000
  message_format: huggingface
  connect_timeout: 0.005s
 endpoints:
  api_server:
    endpoint: api_server:80
    connect_timeout: 0.005s
 llm_providers:
  - name: open-ai-gpt-4
    provider: openai
    access_key: secret_key
    model: gpt-4
    default: true
 overrides:
  # confidence threshold for prompt target intent matching
  prompt_target_intent_matching_threshold: 0.6
 system_prompt: |
  You are a helpful assistant.
 prompt_guards:
  input_guards:
    jailbreak:
      on_exception:
        message: "Looks like you're curious about my abilities, but I can only provide assistance within my programmed parameters."
 prompt_targets:
  - name: weather_forecast
    description: This function provides realtime weather forecast information for a given city.
    parameters:
      - name: city
        required: true
        description: The city for which the weather forecast is requested.
      - name: days
        description: The number of days for which the weather forecast is requested.
      - name: units
        description: The units in which the weather forecast is requested.
    endpoint:
      name: api_server
      path: /weather
    system_prompt: |
      You are a helpful weather forecaster. Use weater data that is provided to you. Please following following guidelines when responding to user queries:
      - Use farenheight for temperature
      - Use miles per hour for wind speed
 ratelimits:
  - model: gpt-4
    selector:
      key: selector-key
      value: selector-value
    limit:
      tokens: 1
      unit: minute
 "#
 }
 #[test]
 #[serial]
 fn successful_request_to_open_ai_chat_completions() {
    let args = tester::MockSettings {
        wasm_path: wasm_module(),
        quiet: false,
        allow_unexpected: false,
    };
    let mut module = tester::mock(args).unwrap();
    module
        .call_start()
        .execute_and_expect(ReturnType::None)
        .unwrap();
    // Setup Filter
    let filter_context = setup_filter(&mut module, default_config());
    // Setup HTTP Stream
    let http_context = 2;
    module
        .call_proxy_on_context_create(http_context, filter_context)
        .expect_log(Some(LogLevel::Debug), None)
        .execute_and_expect(ReturnType::None)
        .unwrap();
    request_headers_expectations(&mut module, http_context);
    // Request Body
    let chat_completions_request_body = "\
    {\
        \"messages\": [\
        {\
            \"role\": \"system\",\
            \"content\": \"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.\"\
        },\
        {\
            \"role\": \"user\",\
            \"content\": \"Compose a poem that explains the concept of recursion in programming.\"\
        }\
        ],\
        \"model\": \"gpt-4\"\
    }";
    module
        .call_proxy_on_request_body(
            http_context,
            chat_completions_request_body.len() as i32,
            true,
        )
        .expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
        .returning(Some(chat_completions_request_body))
        .expect_log(Some(LogLevel::Debug), None)
        .expect_http_call(Some("arch_internal"), None, None, None, None)
        .returning(Some(4))
        .expect_metric_increment("active_http_calls", 1)
        .execute_and_expect(ReturnType::Action(Action::Pause))
        .unwrap();
 }
 #[test]
 #[serial]
 fn bad_request_to_open_ai_chat_completions() {
    let args = tester::MockSettings {
        wasm_path: wasm_module(),
        quiet: false,
        allow_unexpected: false,
    };
    let mut module = tester::mock(args).unwrap();
    module
        .call_start()
        .execute_and_expect(ReturnType::None)
        .unwrap();
    // Setup Filter
    let filter_context = setup_filter(&mut module, default_config());
    // Setup HTTP Stream
    let http_context = 2;
    module
        .call_proxy_on_context_create(http_context, filter_context)
        .expect_log(Some(LogLevel::Debug), None)
        .execute_and_expect(ReturnType::None)
        .unwrap();
    request_headers_expectations(&mut module, http_context);
    // Request Body
    let incomplete_chat_completions_request_body = "\
    {\
        \"messages\": [\
        {\
            \"role\": \"system\",\
        },\
        {\
            \"role\": \"user\",\
            \"content\": \"Compose a poem that explains the concept of recursion in programming.\"\
        }\
        ]\
    }";
    module
        .call_proxy_on_request_body(
            http_context,
            incomplete_chat_completions_request_body.len() as i32,
            true,
        )
        .expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
        .returning(Some(incomplete_chat_completions_request_body))
        .expect_log(Some(LogLevel::Debug), None)
        .expect_send_local_response(
            Some(StatusCode::BAD_REQUEST.as_u16().into()),
            None,
            None,
            None,
        )
        .execute_and_expect(ReturnType::Action(Action::Pause))
        .unwrap();
 }
 #[test]
 #[serial]
 fn request_ratelimited() {
    let args = tester::MockSettings {
        wasm_path: wasm_module(),
        quiet: false,
        allow_unexpected: false,
    };
    let mut module = tester::mock(args).unwrap();
    module
        .call_start()
        .execute_and_expect(ReturnType::None)
        .unwrap();
    // Setup Filter
    let filter_context = setup_filter(&mut module, default_config());
    // Setup HTTP Stream
    let http_context = 2;
    normal_flow(&mut module, filter_context, http_context);
    let arch_fc_resp = ChatCompletionsResponse {
        usage: Some(Usage {
            completion_tokens: 0,
        }),
        choices: vec![Choice {
            finish_reason: "test".to_string(),
            index: 0,
            message: Message {
                role: "system".to_string(),
                content: None,
                tool_calls: Some(vec![ToolCall {
                    id: String::from("test"),
                    tool_type: ToolType::Function,
                    function: FunctionCallDetail {
                        name: String::from("weather_forecast"),
                        arguments: HashMap::from([(
                            String::from("city"),
                            Value::String(String::from("seattle")),
                        )]),
                    },
                }]),
                model: None,
            },
        }],
        model: String::from("test"),
        metadata: None,
    };
    let arch_fc_resp_str = serde_json::to_string(&arch_fc_resp).unwrap();
    module
        .call_proxy_on_http_call_response(http_context, 4, 0, arch_fc_resp_str.len() as i32, 0)
        .expect_metric_increment("active_http_calls", -1)
        .expect_get_buffer_bytes(Some(BufferType::HttpCallResponseBody))
        .returning(Some(&arch_fc_resp_str))
        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Debug), None)
        .expect_http_call(
            Some("arch_internal"),
            Some(vec![
                ("x-arch-upstream", "model_server"),
                (":method", "POST"),
                (":path", "/hallucination"),
                (":authority", "model_server"),
                ("content-type", "application/json"),
                ("x-envoy-max-retries", "3"),
                ("x-envoy-upstream-rq-timeout-ms", "60000"),
            ]),
            None,
            None,
            None,
        )
        .returning(Some(5))
        .expect_metric_increment("active_http_calls", 1)
        .execute_and_expect(ReturnType::None)
        .unwrap();
    let hallucatination_body = HallucinationClassificationResponse {
        params_scores: HashMap::from([("city".to_string(), 0.99)]),
        model: "nli-model".to_string(),
    };
    let body_text = serde_json::to_string(&hallucatination_body).unwrap();
    module
        .call_proxy_on_http_call_response(http_context, 5, 0, body_text.len() as i32, 0)
        .expect_metric_increment("active_http_calls", -1)
        .expect_get_buffer_bytes(Some(BufferType::HttpCallResponseBody))
        .returning(Some(&body_text))
        .expect_log(Some(LogLevel::Debug), None)
        .expect_http_call(
            Some("arch_internal"),
            Some(vec![
                ("x-arch-upstream", "api_server"),
                (":method", "POST"),
                (":path", "/weather"),
                (":authority", "api_server"),
                ("content-type", "application/json"),
                ("x-envoy-max-retries", "3"),
            ]),
            None,
            None,
            None,
        )
        .returning(Some(6))
        .expect_metric_increment("active_http_calls", 1)
        .execute_and_expect(ReturnType::None)
        .unwrap();
    let body_text = String::from("test body");
    module
        .call_proxy_on_http_call_response(http_context, 6, 0, body_text.len() as i32, 0)
        .expect_metric_increment("active_http_calls", -1)
        .expect_get_buffer_bytes(Some(BufferType::HttpCallResponseBody))
        .returning(Some(&body_text))
        .expect_get_header_map_value(Some(MapType::HttpCallResponseHeaders), Some(":status"))
        .returning(Some("200"))
        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Debug), None)
        .expect_send_local_response(
            Some(StatusCode::TOO_MANY_REQUESTS.as_u16().into()),
            None,
            None,
            None,
        )
        .expect_metric_increment("ratelimited_rq", 1)
        .execute_and_expect(ReturnType::None)
        .unwrap();
 }
 #[test]
 #[serial]
 fn request_not_ratelimited() {
    let args = tester::MockSettings {
        wasm_path: wasm_module(),
        quiet: false,
        allow_unexpected: false,
    };
    let mut module = tester::mock(args).unwrap();
    module
        .call_start()
        .execute_and_expect(ReturnType::None)
        .unwrap();
    // Setup Filter
    let mut config: Configuration = serde_yaml::from_str(default_config()).unwrap();
    config.ratelimits.as_mut().unwrap()[0].limit.tokens += 1000;
    let config_str = serde_json::to_string(&config).unwrap();
    let filter_context = setup_filter(&mut module, &config_str);
    // Setup HTTP Stream
    let http_context = 2;
    normal_flow(&mut module, filter_context, http_context);
    let arch_fc_resp = ChatCompletionsResponse {
        usage: Some(Usage {
            completion_tokens: 0,
        }),
        choices: vec![Choice {
            finish_reason: "test".to_string(),
            index: 0,
            message: Message {
                role: "system".to_string(),
                content: None,
                tool_calls: Some(vec![ToolCall {
                    id: String::from("test"),
                    tool_type: ToolType::Function,
                    function: FunctionCallDetail {
                        name: String::from("weather_forecast"),
                        arguments: HashMap::from([(
                            String::from("city"),
                            Value::String(String::from("seattle")),
                        )]),
                    },
                }]),
                model: None,
            },
        }],
        model: String::from("test"),
        metadata: None,
    };
    let arch_fc_resp_str = serde_json::to_string(&arch_fc_resp).unwrap();
    module
        .call_proxy_on_http_call_response(http_context, 4, 0, arch_fc_resp_str.len() as i32, 0)
        .expect_metric_increment("active_http_calls", -1)
        .expect_get_buffer_bytes(Some(BufferType::HttpCallResponseBody))
        .returning(Some(&arch_fc_resp_str))
        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Debug), None)
        .expect_http_call(
            Some("arch_internal"),
            Some(vec![
                ("x-arch-upstream", "model_server"),
                (":method", "POST"),
                (":path", "/hallucination"),
                (":authority", "model_server"),
                ("content-type", "application/json"),
                ("x-envoy-max-retries", "3"),
                ("x-envoy-upstream-rq-timeout-ms", "60000"),
            ]),
            None,
            None,
            None,
        )
        .returning(Some(5))
        .expect_metric_increment("active_http_calls", 1)
        .execute_and_expect(ReturnType::None)
        .unwrap();
    // hallucination should return that parameters were not halliucinated
    //     prompt: str
    // parameters: dict
    // model: str
    let hallucatination_body = HallucinationClassificationResponse {
        params_scores: HashMap::from([("city".to_string(), 0.99)]),
        model: "nli-model".to_string(),
    };
    let body_text = serde_json::to_string(&hallucatination_body).unwrap();
    module
        .call_proxy_on_http_call_response(http_context, 5, 0, body_text.len() as i32, 0)
        .expect_metric_increment("active_http_calls", -1)
        .expect_get_buffer_bytes(Some(BufferType::HttpCallResponseBody))
        .returning(Some(&body_text))
        .expect_log(Some(LogLevel::Debug), None)
        .expect_http_call(
            Some("arch_internal"),
            Some(vec![
                ("x-arch-upstream", "api_server"),
                (":method", "POST"),
                (":path", "/weather"),
                (":authority", "api_server"),
                ("content-type", "application/json"),
                ("x-envoy-max-retries", "3"),
            ]),
            None,
            None,
            None,
        )
        .returning(Some(6))
        .expect_metric_increment("active_http_calls", 1)
        .execute_and_expect(ReturnType::None)
        .unwrap();
    let body_text = String::from("test body");
    module
        .call_proxy_on_http_call_response(http_context, 6, 0, body_text.len() as i32, 0)
        .expect_metric_increment("active_http_calls", -1)
        .expect_get_buffer_bytes(Some(BufferType::HttpCallResponseBody))
        .returning(Some(&body_text))
        .expect_get_header_map_value(Some(MapType::HttpCallResponseHeaders), Some(":status"))
        .returning(Some("200"))
        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Debug), None)
        .expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None)
        .execute_and_expect(ReturnType::None)
        .unwrap();
 }
--- a/crates/public_types/Cargo.lock
+++ b/crates/public_types/Cargo.lock
@ -0,0 +1,382 @@
 # This file is automatically @generated by Cargo.
 # It is not intended for manual editing.
 version = 3
 [[package]]
 name = "ahash"
 version = "0.3.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e8fd72866655d1904d6b0997d0b07ba561047d070fbe29de039031c641b61217"
 [[package]]
 name = "ahash"
 version = "0.8.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011"
 dependencies = [
 "cfg-if",
 "once_cell",
 "version_check",
 "zerocopy",
 ]
 [[package]]
 name = "allocator-api2"
 version = "0.2.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5c6cb57a04249c6480766f7f7cef5467412af1490f8d1e243141daddada3264f"
 [[package]]
 name = "autocfg"
 version = "1.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
 [[package]]
 name = "cfg-if"
 version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
 [[package]]
 name = "derivative"
 version = "2.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fcc3dd5e9e9c0b295d6e1e4d811fb6f157d5ffd784b8d202fc62eac8035a770b"
 dependencies = [
 "proc-macro2",
 "quote",
 "syn 1.0.109",
 ]
 [[package]]
 name = "diff"
 version = "0.1.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8"
 [[package]]
 name = "duration-string"
 version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6fcc1d9ae294a15ed05aeae8e11ee5f2b3fe971c077d45a42fb20825fba6ee13"
 dependencies = [
 "serde",
 ]
 [[package]]
 name = "equivalent"
 version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
 [[package]]
 name = "governor"
 version = "0.6.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "68a7f542ee6b35af73b06abc0dad1c1bae89964e4e253bc4b587b91c9637867b"
 dependencies = [
 "cfg-if",
 "no-std-compat",
 "nonzero_ext",
 "portable-atomic",
 "smallvec",
 "spinning_top",
 ]
 [[package]]
 name = "hashbrown"
 version = "0.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e91b62f79061a0bc2e046024cb7ba44b08419ed238ecbd9adbd787434b9e8c25"
 dependencies = [
 "ahash 0.3.8",
 "autocfg",
 ]
 [[package]]
 name = "hashbrown"
 version = "0.14.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
 dependencies = [
 "ahash 0.8.11",
 "allocator-api2",
 ]
 [[package]]
 name = "indexmap"
 version = "2.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "68b900aa2f7301e21c36462b170ee99994de34dff39a4a6a528e80e7376d07e5"
 dependencies = [
 "equivalent",
 "hashbrown 0.14.5",
 ]
 [[package]]
 name = "itoa"
 version = "1.0.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
 [[package]]
 name = "lock_api"
 version = "0.4.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17"
 dependencies = [
 "autocfg",
 "scopeguard",
 ]
 [[package]]
 name = "log"
 version = "0.4.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
 [[package]]
 name = "memchr"
 version = "2.7.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
 [[package]]
 name = "no-std-compat"
 version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b93853da6d84c2e3c7d730d6473e8817692dd89be387eb01b94d7f108ecb5b8c"
 dependencies = [
 "hashbrown 0.8.2",
 ]
 [[package]]
 name = "nonzero_ext"
 version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "38bf9645c8b145698bb0b18a4637dcacbc421ea49bef2317e4fd8065a387cf21"
 [[package]]
 name = "once_cell"
 version = "1.20.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
 [[package]]
 name = "portable-atomic"
 version = "1.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cc9c68a3f6da06753e9335d63e27f6b9754dd1920d941135b7ea8224f141adb2"
 [[package]]
 name = "pretty_assertions"
 version = "1.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3ae130e2f271fbc2ac3a40fb1d07180839cdbbe443c7a27e1e3c13c5cac0116d"
 dependencies = [
 "diff",
 "yansi",
 ]
 [[package]]
 name = "proc-macro2"
 version = "1.0.86"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77"
 dependencies = [
 "unicode-ident",
 ]
 [[package]]
 name = "proxy-wasm"
 version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "14a5a4df5a1ab77235e36a0a0f638687ee1586d21ee9774037693001e94d4e11"
 dependencies = [
 "hashbrown 0.14.5",
 "log",
 ]
 [[package]]
 name = "public_types"
 version = "0.1.0"
 dependencies = [
 "derivative",
 "duration-string",
 "governor",
 "log",
 "pretty_assertions",
 "proxy-wasm",
 "serde",
 "serde_json",
 "serde_yaml",
 "thiserror",
 ]
 [[package]]
 name = "quote"
 version = "1.0.37"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
 dependencies = [
 "proc-macro2",
 ]
 [[package]]
 name = "ryu"
 version = "1.0.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
 [[package]]
 name = "scopeguard"
 version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
 [[package]]
 name = "serde"
 version = "1.0.210"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a"
 dependencies = [
 "serde_derive",
 ]
 [[package]]
 name = "serde_derive"
 version = "1.0.210"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f"
 dependencies = [
 "proc-macro2",
 "quote",
 "syn 2.0.77",
 ]
 [[package]]
 name = "serde_json"
 version = "1.0.128"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6ff5456707a1de34e7e37f2a6fd3d3f808c318259cbd01ab6377795054b483d8"
 dependencies = [
 "itoa",
 "memchr",
 "ryu",
 "serde",
 ]
 [[package]]
 name = "serde_yaml"
 version = "0.9.34+deprecated"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47"
 dependencies = [
 "indexmap",
 "itoa",
 "ryu",
 "serde",
 "unsafe-libyaml",
 ]
 [[package]]
 name = "smallvec"
 version = "1.13.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
 [[package]]
 name = "spinning_top"
 version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d96d2d1d716fb500937168cc09353ffdc7a012be8475ac7308e1bdf0e3923300"
 dependencies = [
 "lock_api",
 ]
 [[package]]
 name = "syn"
 version = "1.0.109"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
 dependencies = [
 "proc-macro2",
 "quote",
 "unicode-ident",
 ]
 [[package]]
 name = "syn"
 version = "2.0.77"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9f35bcdf61fd8e7be6caf75f429fdca8beb3ed76584befb503b1569faee373ed"
 dependencies = [
 "proc-macro2",
 "quote",
 "unicode-ident",
 ]
 [[package]]
 name = "thiserror"
 version = "1.0.64"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d50af8abc119fb8bb6dbabcfa89656f46f84aa0ac7688088608076ad2b459a84"
 dependencies = [
 "thiserror-impl",
 ]
 [[package]]
 name = "thiserror-impl"
 version = "1.0.64"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "08904e7672f5eb876eaaf87e0ce17857500934f4981c4a0ab2b4aa98baac7fc3"
 dependencies = [
 "proc-macro2",
 "quote",
 "syn 2.0.77",
 ]
 [[package]]
 name = "unicode-ident"
 version = "1.0.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe"
 [[package]]
 name = "unsafe-libyaml"
 version = "0.2.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861"
 [[package]]
 name = "version_check"
 version = "0.9.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
 [[package]]
 name = "yansi"
 version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049"
 [[package]]
 name = "zerocopy"
 version = "0.7.35"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
 dependencies = [
 "zerocopy-derive",
 ]
 [[package]]
 name = "zerocopy-derive"
 version = "0.7.35"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
 dependencies = [
 "proc-macro2",
 "quote",
 "syn 2.0.77",
 ]
--- a/crates/public_types/Cargo.toml
+++ b/crates/public_types/Cargo.toml
@ -7,6 +7,11 @@ edition = "2021"
 serde = { version = "1.0", features = ["derive"] }
 serde_yaml = "0.9.34"
 duration-string = { version = "0.3.0", features = ["serde"] }
 proxy-wasm = "0.2.1"
 governor = { version = "0.6.3", default-features = false, features = ["no_std"]}
 log = "0.4"
 derivative = "2.2.0"
 thiserror = "1.0.64"
 [dev-dependencies]
 pretty_assertions = "1.4.1"
--- a/crates/public_types/src/common_types.rs
+++ b/crates/public_types/src/common_types.rs
--- a/crates/public_types/src/configuration.rs
+++ b/crates/public_types/src/configuration.rs
--- a/crates/public_types/src/consts.rs
+++ b/crates/public_types/src/consts.rs
--- a/crates/public_types/src/embeddings/create_embedding_request.rs
+++ b/crates/public_types/src/embeddings/create_embedding_request.rs
--- a/crates/public_types/src/embeddings/create_embedding_request_input.rs
+++ b/crates/public_types/src/embeddings/create_embedding_request_input.rs
--- a/crates/public_types/src/embeddings/create_embedding_response.rs
+++ b/crates/public_types/src/embeddings/create_embedding_response.rs
--- a/crates/public_types/src/embeddings/create_embedding_response_usage.rs
+++ b/crates/public_types/src/embeddings/create_embedding_response_usage.rs
--- a/crates/public_types/src/embeddings/embedding.rs
+++ b/crates/public_types/src/embeddings/embedding.rs
--- a/crates/public_types/src/embeddings/mod.rs
+++ b/crates/public_types/src/embeddings/mod.rs
--- a/crates/public_types/src/http.rs
+++ b/crates/public_types/src/http.rs
--- a/crates/public_types/src/lib.rs
+++ b/crates/public_types/src/lib.rs
@ -3,3 +3,6 @@
 pub mod common_types;
 pub mod configuration;
 pub mod embeddings;
 pub mod consts;
 pub mod http;
 pub mod stats;
--- a/crates/public_types/src/stats.rs
+++ b/crates/public_types/src/stats.rs
--- a/gateway.code-workspace
+++ b/gateway.code-workspace
@ -5,8 +5,16 @@
 			"path": "."
 		},
    {
-      "name": "arch",
+      "name": "public_types",
-      "path": "arch"
+      "path": "crates/public_types"
    },
    {
      "name": "prompt_gateway",
      "path": "crates/prompt_gateway"
    },
    {
      "name": "llm_gateway",
      "path": "crates/prompt_gateway"
    },
    {
      "name": "arch/tools",
--- a/public_types/Cargo.lock
+++ b/public_types/Cargo.lock
@ -1,171 +0,0 @@
 # This file is automatically @generated by Cargo.
 # It is not intended for manual editing.
 version = 3
 [[package]]
 name = "diff"
 version = "0.1.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8"
 [[package]]
 name = "duration-string"
 version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6fcc1d9ae294a15ed05aeae8e11ee5f2b3fe971c077d45a42fb20825fba6ee13"
 dependencies = [
 "serde",
 ]
 [[package]]
 name = "equivalent"
 version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
 [[package]]
 name = "hashbrown"
 version = "0.14.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
 [[package]]
 name = "indexmap"
 version = "2.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "68b900aa2f7301e21c36462b170ee99994de34dff39a4a6a528e80e7376d07e5"
 dependencies = [
 "equivalent",
 "hashbrown",
 ]
 [[package]]
 name = "itoa"
 version = "1.0.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
 [[package]]
 name = "memchr"
 version = "2.7.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
 [[package]]
 name = "pretty_assertions"
 version = "1.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3ae130e2f271fbc2ac3a40fb1d07180839cdbbe443c7a27e1e3c13c5cac0116d"
 dependencies = [
 "diff",
 "yansi",
 ]
 [[package]]
 name = "proc-macro2"
 version = "1.0.86"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77"
 dependencies = [
 "unicode-ident",
 ]
 [[package]]
 name = "public_types"
 version = "0.1.0"
 dependencies = [
 "duration-string",
 "pretty_assertions",
 "serde",
 "serde_json",
 "serde_yaml",
 ]
 [[package]]
 name = "quote"
 version = "1.0.37"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
 dependencies = [
 "proc-macro2",
 ]
 [[package]]
 name = "ryu"
 version = "1.0.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
 [[package]]
 name = "serde"
 version = "1.0.210"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a"
 dependencies = [
 "serde_derive",
 ]
 [[package]]
 name = "serde_derive"
 version = "1.0.210"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f"
 dependencies = [
 "proc-macro2",
 "quote",
 "syn",
 ]
 [[package]]
 name = "serde_json"
 version = "1.0.128"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6ff5456707a1de34e7e37f2a6fd3d3f808c318259cbd01ab6377795054b483d8"
 dependencies = [
 "itoa",
 "memchr",
 "ryu",
 "serde",
 ]
 [[package]]
 name = "serde_yaml"
 version = "0.9.34+deprecated"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47"
 dependencies = [
 "indexmap",
 "itoa",
 "ryu",
 "serde",
 "unsafe-libyaml",
 ]
 [[package]]
 name = "syn"
 version = "2.0.77"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9f35bcdf61fd8e7be6caf75f429fdca8beb3ed76584befb503b1569faee373ed"
 dependencies = [
 "proc-macro2",
 "quote",
 "unicode-ident",
 ]
 [[package]]
 name = "unicode-ident"
 version = "1.0.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe"
 [[package]]
 name = "unsafe-libyaml"
 version = "0.2.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861"
 [[package]]
 name = "yansi"
 version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049"