Mirror of https://github.com/katanemo/plano.git, synced 2026-05-06 06:12:59 +02:00
Split arch wasm filter code into prompt and llm gateway filters (#190)
This commit is contained in: parent 8e54ac20d8, commit 21e7fe2cef
13 changed files with 696 additions and 2801 deletions
@ -1,322 +0,0 @@ filter_context.rs (deleted)
use crate::stream_context::StreamContext;
use common::common_types::EmbeddingType;
use common::configuration::{Configuration, GatewayMode, Overrides, PromptGuards, PromptTarget};
use common::consts::ARCH_INTERNAL_CLUSTER_NAME;
use common::consts::ARCH_UPSTREAM_HOST_HEADER;
use common::consts::DEFAULT_EMBEDDING_MODEL;
use common::consts::MODEL_SERVER_NAME;
use common::embeddings::{
    CreateEmbeddingRequest, CreateEmbeddingRequestInput, CreateEmbeddingResponse,
};
use common::http::CallArgs;
use common::http::Client;
use common::llm_providers::LlmProviders;
use common::ratelimit;
use common::stats::Counter;
use common::stats::Gauge;
use common::stats::IncrementingMetric;
use log::debug;
use proxy_wasm::traits::*;
use proxy_wasm::types::*;
use std::cell::RefCell;
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::rc::Rc;
use std::time::Duration;

#[derive(Copy, Clone, Debug)]
pub struct WasmMetrics {
    pub active_http_calls: Gauge,
    pub ratelimited_rq: Counter,
}

impl WasmMetrics {
    fn new() -> WasmMetrics {
        WasmMetrics {
            active_http_calls: Gauge::new(String::from("active_http_calls")),
            ratelimited_rq: Counter::new(String::from("ratelimited_rq")),
        }
    }
}

pub type EmbeddingTypeMap = HashMap<EmbeddingType, Vec<f64>>;
pub type EmbeddingsStore = HashMap<String, EmbeddingTypeMap>;

#[derive(Debug)]
pub struct FilterCallContext {
    pub prompt_target_name: String,
    pub embedding_type: EmbeddingType,
}

#[derive(Debug)]
pub struct FilterContext {
    metrics: Rc<WasmMetrics>,
    // callouts stores token_id to request mapping that we use during #on_http_call_response to match the response to the request.
    callouts: RefCell<HashMap<u32, FilterCallContext>>,
    overrides: Rc<Option<Overrides>>,
    system_prompt: Rc<Option<String>>,
    prompt_targets: Rc<HashMap<String, PromptTarget>>,
    mode: GatewayMode,
    prompt_guards: Rc<PromptGuards>,
    llm_providers: Option<Rc<LlmProviders>>,
    embeddings_store: Option<Rc<EmbeddingsStore>>,
    temp_embeddings_store: EmbeddingsStore,
}

impl FilterContext {
    pub fn new() -> FilterContext {
        FilterContext {
            callouts: RefCell::new(HashMap::new()),
            metrics: Rc::new(WasmMetrics::new()),
            system_prompt: Rc::new(None),
            prompt_targets: Rc::new(HashMap::new()),
            overrides: Rc::new(None),
            prompt_guards: Rc::new(PromptGuards::default()),
            mode: GatewayMode::Prompt,
            llm_providers: None,
            embeddings_store: Some(Rc::new(HashMap::new())),
            temp_embeddings_store: HashMap::new(),
        }
    }

    fn process_prompt_targets(&self) {
        for values in self.prompt_targets.iter() {
            let prompt_target = values.1;
            self.schedule_embeddings_call(
                &prompt_target.name,
                &prompt_target.description,
                EmbeddingType::Description,
            );
        }
    }

    fn schedule_embeddings_call(
        &self,
        prompt_target_name: &str,
        input: &str,
        embedding_type: EmbeddingType,
    ) {
        let embeddings_input = CreateEmbeddingRequest {
            input: Box::new(CreateEmbeddingRequestInput::String(String::from(input))),
            model: String::from(DEFAULT_EMBEDDING_MODEL),
            encoding_format: None,
            dimensions: None,
            user: None,
        };
        let json_data = serde_json::to_string(&embeddings_input).unwrap();

        let call_args = CallArgs::new(
            ARCH_INTERNAL_CLUSTER_NAME,
            "/embeddings",
            vec![
                (ARCH_UPSTREAM_HOST_HEADER, MODEL_SERVER_NAME),
                (":method", "POST"),
                (":path", "/embeddings"),
                (":authority", MODEL_SERVER_NAME),
                ("content-type", "application/json"),
                ("x-envoy-upstream-rq-timeout-ms", "60000"),
            ],
            Some(json_data.as_bytes()),
            vec![],
            Duration::from_secs(60),
        );

        let call_context = crate::filter_context::FilterCallContext {
            prompt_target_name: String::from(prompt_target_name),
            embedding_type,
        };

        if let Err(error) = self.http_call(call_args, call_context) {
            panic!("{error}")
        }
    }

    fn embedding_response_handler(
        &mut self,
        body_size: usize,
        embedding_type: EmbeddingType,
        prompt_target_name: String,
    ) {
        let prompt_target = self
            .prompt_targets
            .get(&prompt_target_name)
            .unwrap_or_else(|| {
                panic!(
                    "Received embeddings response for unknown prompt target name={}",
                    prompt_target_name
                )
            });

        let body = self
            .get_http_call_response_body(0, body_size)
            .expect("No body in response");
        if !body.is_empty() {
            let mut embedding_response: CreateEmbeddingResponse =
                match serde_json::from_slice(&body) {
                    Ok(response) => response,
                    Err(e) => {
                        panic!(
                            "Error deserializing embedding response. body: {:?}: {:?}",
                            String::from_utf8(body).unwrap(),
                            e
                        );
                    }
                };

            let embeddings = embedding_response.data.remove(0).embedding;
            debug!(
                "Adding embeddings for prompt target name: {:?}, description: {:?}, embedding type: {:?}",
                prompt_target.name, prompt_target.description, embedding_type
            );

            let entry = self.temp_embeddings_store.entry(prompt_target_name);
            match entry {
                Entry::Occupied(_) => {
                    entry.and_modify(|e| {
                        if let Entry::Vacant(e) = e.entry(embedding_type) {
                            e.insert(embeddings);
                        } else {
                            panic!(
                                "Duplicate {:?} for prompt target with name=\"{}\"",
                                &embedding_type, prompt_target.name
                            )
                        }
                    });
                }
                Entry::Vacant(_) => {
                    entry.or_insert(HashMap::from([(embedding_type, embeddings)]));
                }
            }

            if self.prompt_targets.len() == self.temp_embeddings_store.len() {
                self.embeddings_store =
                    Some(Rc::new(std::mem::take(&mut self.temp_embeddings_store)))
            }
        }
    }
}

impl Client for FilterContext {
    type CallContext = FilterCallContext;

    fn callouts(&self) -> &RefCell<HashMap<u32, Self::CallContext>> {
        &self.callouts
    }

    fn active_http_calls(&self) -> &Gauge {
        &self.metrics.active_http_calls
    }
}

impl Context for FilterContext {
    fn on_http_call_response(
        &mut self,
        token_id: u32,
        _num_headers: usize,
        body_size: usize,
        _num_trailers: usize,
    ) {
        debug!(
            "filter_context: on_http_call_response called with token_id: {:?}",
            token_id
        );
        let callout_data = self
            .callouts
            .borrow_mut()
            .remove(&token_id)
            .expect("invalid token_id");

        self.metrics.active_http_calls.increment(-1);

        self.embedding_response_handler(
            body_size,
            callout_data.embedding_type,
            callout_data.prompt_target_name,
        )
    }
}

// RootContext allows the Rust code to reach into the Envoy Config
impl RootContext for FilterContext {
    fn on_configure(&mut self, _: usize) -> bool {
        let config_bytes = self
            .get_plugin_configuration()
            .expect("Arch config cannot be empty");

        let config: Configuration = match serde_yaml::from_slice(&config_bytes) {
            Ok(config) => config,
            Err(err) => panic!("Invalid arch config \"{:?}\"", err),
        };

        self.overrides = Rc::new(config.overrides);

        let mut prompt_targets = HashMap::new();
        for pt in config.prompt_targets {
            prompt_targets.insert(pt.name.clone(), pt.clone());
        }
        self.system_prompt = Rc::new(config.system_prompt);
        self.prompt_targets = Rc::new(prompt_targets);
        self.mode = config.mode.unwrap_or_default();

        ratelimit::ratelimits(Some(config.ratelimits.unwrap_or_default()));

        if let Some(prompt_guards) = config.prompt_guards {
            self.prompt_guards = Rc::new(prompt_guards)
        }

        match config.llm_providers.try_into() {
            Ok(llm_providers) => self.llm_providers = Some(Rc::new(llm_providers)),
            Err(err) => panic!("{err}"),
        }

        true
    }

    fn create_http_context(&self, context_id: u32) -> Option<Box<dyn HttpContext>> {
        debug!(
            "||| create_http_context called with context_id: {:?} |||",
            context_id
        );

        // No StreamContext can be created until the Embedding Store is fully initialized.
        let embedding_store = match self.mode {
            GatewayMode::Llm => None,
            GatewayMode::Prompt => Some(Rc::clone(self.embeddings_store.as_ref().unwrap())),
        };
        Some(Box::new(StreamContext::new(
            context_id,
            Rc::clone(&self.metrics),
            Rc::clone(&self.system_prompt),
            Rc::clone(&self.prompt_targets),
            Rc::clone(&self.prompt_guards),
            Rc::clone(&self.overrides),
            Rc::clone(
                self.llm_providers
                    .as_ref()
                    .expect("LLM Providers must exist when Streams are being created"),
            ),
            embedding_store,
            self.mode.clone(),
        )))
    }

    fn get_type(&self) -> Option<ContextType> {
        Some(ContextType::HttpContext)
    }

    fn on_vm_start(&mut self, _: usize) -> bool {
        self.set_tick_period(Duration::from_secs(1));
        true
    }

    fn on_tick(&mut self) {
        debug!("starting up arch filter in mode: {:?}", self.mode);
        if self.mode == GatewayMode::Prompt {
            self.process_prompt_targets();
        }

        self.set_tick_period(Duration::from_secs(0));
    }
}
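The deleted context drives its embedding bootstrap through the proxy-wasm callout pattern that the repo's common::http::Client trait wraps: dispatch a call, record the returned token id against per-call state, and look that state back up in on_http_call_response. A minimal sketch of the same round trip against the raw proxy-wasm SDK; the PendingCall type, the "arch_internal" cluster name, and the "model_server" authority are invented stand-ins for CallArgs/FilterCallContext above:

use log::debug;
use proxy_wasm::traits::Context;
use std::cell::RefCell;
use std::collections::HashMap;
use std::time::Duration;

// Invented per-call state; FilterCallContext plays this role above.
struct PendingCall {
    purpose: &'static str,
}

struct CalloutSketch {
    callouts: RefCell<HashMap<u32, PendingCall>>,
}

impl CalloutSketch {
    fn schedule_call(&self) {
        // dispatch_http_call returns a token id that identifies the eventual response.
        let token_id = self
            .dispatch_http_call(
                "arch_internal", // invented upstream cluster name
                vec![
                    (":method", "POST"),
                    (":path", "/embeddings"),
                    (":authority", "model_server"),
                ],
                Some("{}".as_bytes()),
                vec![],
                Duration::from_secs(60),
            )
            .expect("failed to dispatch http call");
        self.callouts
            .borrow_mut()
            .insert(token_id, PendingCall { purpose: "embeddings" });
    }
}

impl Context for CalloutSketch {
    fn on_http_call_response(&mut self, token_id: u32, _: usize, _: usize, _: usize) {
        // The token id minted at dispatch time routes the response back to its state.
        let pending = self
            .callouts
            .borrow_mut()
            .remove(&token_id)
            .expect("invalid token_id");
        debug!("response received for {} callout", pending.purpose);
    }
}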
@ -1,13 +1,13 @@ main.rs
-use filter_context::FilterContext;
+use llm_filter_context::LlmGatewayFilterContext;
 use proxy_wasm::traits::*;
 use proxy_wasm::types::*;

-mod filter_context;
-mod stream_context;
+mod llm_filter_context;
+mod llm_stream_context;

 proxy_wasm::main! {{
     proxy_wasm::set_log_level(LogLevel::Trace);
     proxy_wasm::set_root_context(|_| -> Box<dyn RootContext> {
-        Box::new(FilterContext::new())
+        Box::new(LlmGatewayFilterContext::new())
     });
 }}
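Only the llm_gateway half of the split is visible in this view (the remaining large diff is suppressed at the bottom of the page). Going by the commit title, the prompt-side crate presumably keeps a mirrored entrypoint built from the removed lines; a hypothetical sketch, with the module layout assumed:

use filter_context::FilterContext;
use proxy_wasm::traits::*;
use proxy_wasm::types::*;

mod filter_context;
mod stream_context;

proxy_wasm::main! {{
    proxy_wasm::set_log_level(LogLevel::Trace);
    proxy_wasm::set_root_context(|_| -> Box<dyn RootContext> {
        // Root context of the prompt gateway filter (assumed placement).
        Box::new(FilterContext::new())
    });
}}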
crates/llm_gateway/src/llm_filter_context.rs (new file, 108 lines)
@ -0,0 +1,108 @@
use crate::llm_stream_context::LlmGatewayStreamContext;
use common::configuration::Configuration;
use common::http::Client;
use common::llm_providers::LlmProviders;
use common::ratelimit;
use common::stats::Counter;
use common::stats::Gauge;
use log::debug;
use proxy_wasm::traits::*;
use proxy_wasm::types::*;
use std::cell::RefCell;
use std::collections::HashMap;
use std::rc::Rc;

#[derive(Copy, Clone, Debug)]
pub struct WasmMetrics {
    pub active_http_calls: Gauge,
    pub ratelimited_rq: Counter,
}

impl WasmMetrics {
    fn new() -> WasmMetrics {
        WasmMetrics {
            active_http_calls: Gauge::new(String::from("active_http_calls")),
            ratelimited_rq: Counter::new(String::from("ratelimited_rq")),
        }
    }
}

#[derive(Debug)]
pub struct FilterCallContext {}

#[derive(Debug)]
pub struct LlmGatewayFilterContext {
    metrics: Rc<WasmMetrics>,
    // callouts stores a token_id-to-request mapping that we use during #on_http_call_response to match the response to the request.
    callouts: RefCell<HashMap<u32, FilterCallContext>>,
    llm_providers: Option<Rc<LlmProviders>>,
}

impl LlmGatewayFilterContext {
    pub fn new() -> LlmGatewayFilterContext {
        LlmGatewayFilterContext {
            callouts: RefCell::new(HashMap::new()),
            metrics: Rc::new(WasmMetrics::new()),
            llm_providers: None,
        }
    }
}

impl Client for LlmGatewayFilterContext {
    type CallContext = FilterCallContext;

    fn callouts(&self) -> &RefCell<HashMap<u32, Self::CallContext>> {
        &self.callouts
    }

    fn active_http_calls(&self) -> &Gauge {
        &self.metrics.active_http_calls
    }
}

impl Context for LlmGatewayFilterContext {}

// RootContext allows the Rust code to reach into the Envoy Config
impl RootContext for LlmGatewayFilterContext {
    fn on_configure(&mut self, _: usize) -> bool {
        let config_bytes = self
            .get_plugin_configuration()
            .expect("Arch config cannot be empty");

        let config: Configuration = match serde_yaml::from_slice(&config_bytes) {
            Ok(config) => config,
            Err(err) => panic!("Invalid arch config \"{:?}\"", err),
        };

        ratelimit::ratelimits(Some(config.ratelimits.unwrap_or_default()));

        match config.llm_providers.try_into() {
            Ok(llm_providers) => self.llm_providers = Some(Rc::new(llm_providers)),
            Err(err) => panic!("{err}"),
        }

        true
    }

    fn create_http_context(&self, context_id: u32) -> Option<Box<dyn HttpContext>> {
        debug!(
            "||| create_http_context called with context_id: {:?} |||",
            context_id
        );

        Some(Box::new(LlmGatewayStreamContext::new(
            context_id,
            Rc::clone(&self.metrics),
            Rc::clone(
                self.llm_providers
                    .as_ref()
                    .expect("LLM Providers must exist when Streams are being created"),
            ),
        )))
    }

    fn get_type(&self) -> Option<ContextType> {
        Some(ContextType::HttpContext)
    }
}
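on_configure above deserializes the Envoy plugin configuration straight into the shared Configuration struct with serde_yaml; only ratelimits and llm_providers matter to this filter. A rough sketch of parsing such a config the same way; the YAML field names and provider shape are assumptions for illustration, not taken from this diff:

// Hypothetical config; the stream context reads llm_provider name/model/access_key,
// so the YAML presumably carries fields along these lines.
const SAMPLE_CONFIG: &str = r#"
llm_providers:
  - name: openai
    model: gpt-4o-mini
    access_key: $OPENAI_API_KEY
ratelimits: []
"#;

fn main() {
    // In the filter this buffer comes from get_plugin_configuration().
    let config: serde_yaml::Value =
        serde_yaml::from_str(SAMPLE_CONFIG).expect("invalid arch config");
    println!("{:?}", config["llm_providers"]);
}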
crates/llm_gateway/src/llm_stream_context.rs (new file, 421 lines)
@ -0,0 +1,421 @@
use crate::llm_filter_context::WasmMetrics;
use common::common_types::open_ai::{
    ArchState, ChatCompletionChunkResponse, ChatCompletionsRequest, ChatCompletionsResponse,
    Message, ToolCall, ToolCallState,
};
use common::configuration::LlmProvider;
use common::consts::{
    ARCH_PROVIDER_HINT_HEADER, ARCH_ROUTING_HEADER, ARCH_STATE_HEADER, CHAT_COMPLETIONS_PATH,
    RATELIMIT_SELECTOR_HEADER_KEY, REQUEST_ID_HEADER, USER_ROLE,
};
use common::llm_providers::LlmProviders;
use common::ratelimit::Header;
use common::{ratelimit, routing, tokenizer};
use http::StatusCode;
use log::debug;
use proxy_wasm::traits::*;
use proxy_wasm::types::*;
use serde_json::Value;
use sha2::{Digest, Sha256};
use std::num::NonZero;
use std::rc::Rc;

use common::stats::IncrementingMetric;

#[derive(thiserror::Error, Debug)]
pub enum ServerError {
    #[error(transparent)]
    Deserialization(serde_json::Error),
    #[error("{0}")]
    LogicError(String),
    #[error(transparent)]
    ExceededRatelimit(ratelimit::Error),
    #[error("{why}")]
    BadRequest { why: String },
}

pub struct LlmGatewayStreamContext {
    context_id: u32,
    metrics: Rc<WasmMetrics>,
    tool_calls: Option<Vec<ToolCall>>,
    tool_call_response: Option<String>,
    arch_state: Option<Vec<ArchState>>,
    request_body_size: usize,
    ratelimit_selector: Option<Header>,
    streaming_response: bool,
    user_prompt: Option<Message>,
    response_tokens: usize,
    is_chat_completions_request: bool,
    chat_completions_request: Option<ChatCompletionsRequest>,
    llm_providers: Rc<LlmProviders>,
    llm_provider: Option<Rc<LlmProvider>>,
    request_id: Option<String>,
}

impl LlmGatewayStreamContext {
    #[allow(clippy::too_many_arguments)]
    pub fn new(context_id: u32, metrics: Rc<WasmMetrics>, llm_providers: Rc<LlmProviders>) -> Self {
        LlmGatewayStreamContext {
            context_id,
            metrics,
            chat_completions_request: None,
            tool_calls: None,
            tool_call_response: None,
            arch_state: None,
            request_body_size: 0,
            ratelimit_selector: None,
            streaming_response: false,
            user_prompt: None,
            response_tokens: 0,
            is_chat_completions_request: false,
            llm_providers,
            llm_provider: None,
            request_id: None,
        }
    }

    fn llm_provider(&self) -> &LlmProvider {
        self.llm_provider
            .as_ref()
            .expect("the provider should be set when asked for it")
    }

    fn select_llm_provider(&mut self) {
        let provider_hint = self
            .get_http_request_header(ARCH_PROVIDER_HINT_HEADER)
            .map(|provider_name| provider_name.into());

        debug!("llm provider hint: {:?}", provider_hint);
        self.llm_provider = Some(routing::get_llm_provider(
            &self.llm_providers,
            provider_hint,
        ));
        debug!("selected llm: {}", self.llm_provider.as_ref().unwrap().name);
    }

    fn modify_auth_headers(&mut self) -> Result<(), ServerError> {
        let llm_provider_api_key_value =
            self.llm_provider()
                .access_key
                .as_ref()
                .ok_or(ServerError::BadRequest {
                    why: format!(
                        "No access key configured for selected LLM Provider \"{}\"",
                        self.llm_provider()
                    ),
                })?;

        let authorization_header_value = format!("Bearer {}", llm_provider_api_key_value);

        self.set_http_request_header("Authorization", Some(&authorization_header_value));

        Ok(())
    }

    fn delete_content_length_header(&mut self) {
        // Remove the Content-Length header because further body manipulations in the gateway logic will invalidate it.
        // Servers generally throw away requests whose body length does not match the Content-Length header.
        // However, a missing Content-Length header is not grounds for bad requests given that intermediary hops could
        // manipulate the body in benign ways, e.g., compression.
        self.set_http_request_header("content-length", None);
    }

    fn save_ratelimit_header(&mut self) {
        self.ratelimit_selector = self
            .get_http_request_header(RATELIMIT_SELECTOR_HEADER_KEY)
            .and_then(|key| {
                self.get_http_request_header(&key)
                    .map(|value| Header { key, value })
            });
    }

    fn send_server_error(&self, error: ServerError, override_status_code: Option<StatusCode>) {
        debug!("server error occurred: {}", error);
        self.send_http_response(
            override_status_code
                .unwrap_or(StatusCode::INTERNAL_SERVER_ERROR)
                .as_u16()
                .into(),
            vec![],
            Some(format!("{error}").as_bytes()),
        );
    }

    fn enforce_ratelimits(
        &mut self,
        model: &str,
        json_string: &str,
    ) -> Result<(), ratelimit::Error> {
        if let Some(selector) = self.ratelimit_selector.take() {
            // Tokenize and Ratelimit.
            if let Ok(token_count) = tokenizer::token_count(model, json_string) {
                ratelimit::ratelimits(None).read().unwrap().check_limit(
                    model.to_owned(),
                    selector,
                    NonZero::new(token_count as u32).unwrap(),
                )?;
            }
        }
        Ok(())
    }
}
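save_ratelimit_header above performs a two-step lookup: the selector header's value names a second request header, and that second header's key/value pair becomes the rate-limit key. The same indirection expressed over a plain map, with invented header names:

use std::collections::HashMap;

// Stand-in for common::ratelimit::Header.
#[derive(Debug)]
struct Header {
    key: String,
    value: String,
}

// Resolve the selector the way save_ratelimit_header does, but over a map.
fn resolve_selector(headers: &HashMap<&str, &str>, selector_header: &str) -> Option<Header> {
    let key = headers.get(selector_header)?; // e.g. "x-arch-ratelimit-selector: user-id"
    let value = headers.get(*key)?;          // e.g. "user-id: alice"
    Some(Header {
        key: key.to_string(),
        value: value.to_string(),
    })
}

fn main() {
    let headers = HashMap::from([
        ("x-arch-ratelimit-selector", "user-id"), // invented header names
        ("user-id", "alice"),
    ]);
    // Limits are then tracked per (key, value) pair, i.e. per user here.
    println!("{:?}", resolve_selector(&headers, "x-arch-ratelimit-selector"));
}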
// HttpContext is the trait that allows the Rust code to interact with HTTP objects.
impl HttpContext for LlmGatewayStreamContext {
    // Envoy's HTTP model is event driven. The WASM ABI has given implementors events to hook onto
    // the lifecycle of the http request and response.
    fn on_http_request_headers(&mut self, _num_headers: usize, _end_of_stream: bool) -> Action {
        self.select_llm_provider();
        self.add_http_request_header(ARCH_ROUTING_HEADER, &self.llm_provider().name);

        if let Err(error) = self.modify_auth_headers() {
            self.send_server_error(error, Some(StatusCode::BAD_REQUEST));
        }
        self.delete_content_length_header();
        self.save_ratelimit_header();

        self.is_chat_completions_request =
            self.get_http_request_header(":path").unwrap_or_default() == CHAT_COMPLETIONS_PATH;

        debug!(
            "on_http_request_headers S[{}] req_headers={:?}",
            self.context_id,
            self.get_http_request_headers()
        );

        self.request_id = self.get_http_request_header(REQUEST_ID_HEADER);

        Action::Continue
    }

    fn on_http_request_body(&mut self, body_size: usize, end_of_stream: bool) -> Action {
        // Let the client send the gateway all the data before sending to the LLM provider.
        // TODO: consider a streaming API.
        if !end_of_stream {
            return Action::Pause;
        }

        if body_size == 0 {
            return Action::Continue;
        }

        self.request_body_size = body_size;

        // Deserialize body into spec.
        // Currently OpenAI API.
        let mut deserialized_body: ChatCompletionsRequest =
            match self.get_http_request_body(0, body_size) {
                Some(body_bytes) => match serde_json::from_slice(&body_bytes) {
                    Ok(deserialized) => deserialized,
                    Err(e) => {
                        self.send_server_error(
                            ServerError::Deserialization(e),
                            Some(StatusCode::BAD_REQUEST),
                        );
                        return Action::Pause;
                    }
                },
                None => {
                    self.send_server_error(
                        ServerError::LogicError(format!(
                            "Failed to obtain body bytes even though body_size is {}",
                            body_size
                        )),
                        None,
                    );
                    return Action::Pause;
                }
            };
        self.is_chat_completions_request = true;

        // remove metadata from the request body
        deserialized_body.metadata = None;
        // delete model key from message array
        for message in deserialized_body.messages.iter_mut() {
            message.model = None;
        }

        // override model name from the llm provider
        deserialized_body
            .model
            .clone_from(&self.llm_provider.as_ref().unwrap().model);
        let chat_completion_request_str = serde_json::to_string(&deserialized_body).unwrap();

        // enforce ratelimits on ingress
        if let Err(e) =
            self.enforce_ratelimits(&deserialized_body.model, &chat_completion_request_str)
        {
            self.send_server_error(
                ServerError::ExceededRatelimit(e),
                Some(StatusCode::TOO_MANY_REQUESTS),
            );
            self.metrics.ratelimited_rq.increment(1);
            return Action::Continue;
        }

        debug!(
            "arch => {:?}, body: {}",
            deserialized_body.model, chat_completion_request_str
        );
        self.set_http_request_body(0, body_size, chat_completion_request_str.as_bytes());

        Action::Continue
    }
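Before proxying, on_http_request_body rewrites the client's JSON: it drops the request-level metadata, clears any per-message model field, and overwrites the top-level model with the selected provider's model before re-serializing. The same rewrite expressed over a raw serde_json::Value so each field manipulation is visible; the sample body and model names are invented:

use serde_json::{json, Value};

fn main() {
    // Invented client request body.
    let mut body: Value = json!({
        "model": "client-chosen-model",
        "metadata": { "trace": "abc" },
        "messages": [{ "role": "user", "content": "hi", "model": "stale" }]
    });

    // Drop request-level metadata, mirroring `deserialized_body.metadata = None`.
    body.as_object_mut().unwrap().remove("metadata");

    // Clear per-message model keys.
    if let Some(messages) = body["messages"].as_array_mut() {
        for message in messages {
            message.as_object_mut().unwrap().remove("model");
        }
    }

    // Override the model with the selected provider's model (name invented).
    body["model"] = json!("provider-configured-model");

    println!("{}", serde_json::to_string(&body).unwrap());
}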
    fn on_http_response_body(&mut self, body_size: usize, end_of_stream: bool) -> Action {
        debug!(
            "recv [S={}] bytes={} end_stream={}",
            self.context_id, body_size, end_of_stream
        );

        if !self.is_chat_completions_request {
            if let Some(body_str) = self
                .get_http_response_body(0, body_size)
                .and_then(|bytes| String::from_utf8(bytes).ok())
            {
                debug!("recv [S={}] body_str={}", self.context_id, body_str);
            }
            return Action::Continue;
        }

        if !end_of_stream {
            return Action::Pause;
        }

        let body = self
            .get_http_response_body(0, body_size)
            .expect("cant get response body");

        if self.streaming_response {
            let body_str = String::from_utf8(body).expect("body is not utf-8");
            debug!("streaming response");
            let chat_completions_data = match body_str.split_once("data: ") {
                Some((_, chat_completions_data)) => chat_completions_data,
                None => {
                    self.send_server_error(
                        ServerError::LogicError(String::from("parsing error in streaming data")),
                        None,
                    );
                    return Action::Pause;
                }
            };

            let chat_completions_chunk_response: ChatCompletionChunkResponse =
                match serde_json::from_str(chat_completions_data) {
                    Ok(de) => de,
                    Err(_) => {
                        if chat_completions_data != "[NONE]" {
                            self.send_server_error(
                                ServerError::LogicError(String::from(
                                    "error in streaming response",
                                )),
                                None,
                            );
                            return Action::Continue;
                        }
                        return Action::Continue;
                    }
                };

            if let Some(content) = chat_completions_chunk_response
                .choices
                .first()
                .unwrap()
                .delta
                .content
                .as_ref()
            {
                let model = &chat_completions_chunk_response.model;
                let token_count = tokenizer::token_count(model, content).unwrap_or(0);
                self.response_tokens += token_count;
            }
        } else {
            debug!("non streaming response");
            let chat_completions_response: ChatCompletionsResponse =
                match serde_json::from_slice(&body) {
                    Ok(de) => de,
                    Err(e) => {
                        debug!("invalid response: {}", String::from_utf8_lossy(&body));
                        self.send_server_error(ServerError::Deserialization(e), None);
                        return Action::Pause;
                    }
                };

            if chat_completions_response.usage.is_some() {
                self.response_tokens += chat_completions_response
                    .usage
                    .as_ref()
                    .unwrap()
                    .completion_tokens;
            }

            if let Some(tool_calls) = self.tool_calls.as_ref() {
                if !tool_calls.is_empty() {
                    if self.arch_state.is_none() {
                        self.arch_state = Some(Vec::new());
                    }

                    // compute sha hash from message history
                    let mut hasher = Sha256::new();
                    let prompts: Vec<String> = self
                        .chat_completions_request
                        .as_ref()
                        .unwrap()
                        .messages
                        .iter()
                        .filter(|msg| msg.role == USER_ROLE)
                        .map(|msg| msg.content.clone().unwrap())
                        .collect();
                    let prompts_merged = prompts.join("#.#");
                    hasher.update(prompts_merged.clone());
                    let hash_key = hasher.finalize();
                    // convert hash to hex string
                    let hash_key_str = format!("{:x}", hash_key);
                    debug!("hash key: {}, prompts: {}", hash_key_str, prompts_merged);

                    // create new tool call state
                    let tool_call_state = ToolCallState {
                        key: hash_key_str,
                        message: self.user_prompt.clone(),
                        tool_call: tool_calls[0].function.clone(),
                        tool_response: self.tool_call_response.clone().unwrap(),
                    };

                    // push tool call state to arch state
                    self.arch_state
                        .as_mut()
                        .unwrap()
                        .push(ArchState::ToolCall(vec![tool_call_state]));

                    let mut data: Value = serde_json::from_slice(&body).unwrap();
                    // use serde_json::Value to manipulate the json object and ensure that we don't lose any data
                    if let Value::Object(ref mut map) = data {
                        // serialize arch state and add to metadata
                        let arch_state_str = serde_json::to_string(&self.arch_state).unwrap();
                        debug!("arch_state: {}", arch_state_str);
                        let metadata = map
                            .entry("metadata")
                            .or_insert(Value::Object(serde_json::Map::new()));
                        metadata.as_object_mut().unwrap().insert(
                            ARCH_STATE_HEADER.to_string(),
                            serde_json::Value::String(arch_state_str),
                        );

                        let data_serialized = serde_json::to_string(&data).unwrap();
                        debug!("arch => user: {}", data_serialized);
                        self.set_http_response_body(0, body_size, data_serialized.as_bytes());
                    };
                }
            }
        }

        debug!(
            "recv [S={}] total_tokens={} end_stream={}",
            self.context_id, self.response_tokens, end_of_stream
        );

        // TODO: ratelimit based on response tokens.
        Action::Continue
    }
}

impl Context for LlmGatewayStreamContext {}
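The tool-call branch above derives its state key by joining the user-role prompts with the literal "#.#" separator and hashing the result with SHA-256, rendered as lowercase hex. The derivation in isolation, with sample prompts invented:

use sha2::{Digest, Sha256};

fn main() {
    // Sample message history; only user-role content feeds the key.
    let prompts = vec!["what's the weather in seattle", "and in portland?"];

    // Same derivation as the filter: join with "#.#", SHA-256, lowercase hex.
    let prompts_merged = prompts.join("#.#");
    let mut hasher = Sha256::new();
    hasher.update(prompts_merged.as_bytes());
    let hash_key_str = format!("{:x}", hasher.finalize());

    // The same history always maps to the same key, so stored tool-call
    // state can be matched back to a conversation on later requests.
    println!("key={hash_key_str}");
}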
File diff suppressed because it is too large