Merge branch 'main' into adil/developer_error

2026-06-17 15:25:17 +02:00 · 2024-10-18 12:57:04 -07:00 · 2024-10-18 12:57:04 -07:00 · e9c6796948
commit e9c6796948
parent acb7dd2be5 c6ba28dfcc
30 changed files with 2704 additions and 390 deletions
--- a/crates/Cargo.lock
+++ b/crates/Cargo.lock
--- a/crates/Cargo.toml
+++ b/crates/Cargo.toml
@ -0,0 +1,3 @@
+[workspace] # single Cargo workspace covering the crates listed in `members`
+resolver = "2" # opt in to Cargo's version 2 feature resolver
+members = ["llm_gateway", "prompt_gateway", "common"] # paths are relative to this file
--- a/crates/common/Cargo.toml
+++ b/crates/common/Cargo.toml
@ -14,6 +14,7 @@ derivative = "2.2.0"
 thiserror = "1.0.64"
 tiktoken-rs = "0.5.9"
 rand = "0.8.5"
+serde_json = "1.0"

 [dev-dependencies]
 pretty_assertions = "1.4.1"
--- a/crates/common/src/configuration.rs
+++ b/crates/common/src/configuration.rs
@ -229,20 +229,6 @@ mod test {
        let config: super::Configuration = serde_yaml::from_str(&ref_config).unwrap();
        assert_eq!(config.version, "v0.1");

-        let open_ai_provider = config
-            .llm_providers
-            .iter()
-            .find(|p| p.name.to_lowercase() == "openai")
-            .unwrap();
-        assert_eq!(open_ai_provider.name.to_lowercase(), "openai");
-        assert_eq!(
-            open_ai_provider.access_key,
-            Some("OPENAI_API_KEY".to_string())
-        );
-        assert_eq!(open_ai_provider.model, "gpt-4o");
-        assert_eq!(open_ai_provider.default, Some(true));
-        assert_eq!(open_ai_provider.stream, Some(true));
-
        let prompt_guards = config.prompt_guards.as_ref().unwrap();
        let input_guards = &prompt_guards.input_guards;
        let jailbreak_guard = input_guards.get(&GuardType::Jailbreak).unwrap();
--- a/crates/common/src/errors.rs
+++ b/crates/common/src/errors.rs
@ -0,0 +1,39 @@
+use proxy_wasm::types::Status;
+
+use crate::ratelimit;
+
+#[derive(thiserror::Error, Debug)]
+pub enum ClientError { // errors raised when dispatching an HTTP call (see the message below)
+    #[error("Error dispatching HTTP call to `{upstream_name}/{path}`, error: {internal_status:?}")]
+    DispatchError { // all three fields are interpolated into the message above
+        upstream_name: String, // upstream the call was addressed to
+        path: String, // path requested on that upstream
+        internal_status: Status, // proxy_wasm `Status` for the failure, printed with `{:?}`
+    },
+}
+
+#[derive(thiserror::Error, Debug)]
+pub enum ServerError { // error type exposed from the `common` crate's `errors` module
+    #[error(transparent)] // transparent: Display and source() are forwarded to the wrapped error
+    HttpDispatch(ClientError), // wraps a `ClientError` from an HTTP dispatch
+    #[error(transparent)]
+    Deserialization(serde_json::Error), // serde_json error; by name, raised while parsing - confirm at call sites
+    #[error(transparent)]
+    Serialization(serde_json::Error), // serde_json error; by name, raised while encoding - confirm at call sites
+    #[error("{0}")]
+    LogicError(String), // free-form message, displayed verbatim
+    #[error("upstream error response authority={authority}, path={path}, status={status}")]
+    Upstream { // an upstream answered with an error response (per the message above)
+        authority: String, // interpolated as `authority=` in the message
+        path: String, // interpolated as `path=` in the message
+        status: String, // carried as text, not as a numeric status code
+    },
+    #[error("jailbreak detected: {0}")]
+    Jailbreak(String), // payload is appended after "jailbreak detected: "
+    #[error("{why}")]
+    NoMessagesFound { why: String }, // `why` is the entire displayed message
+    #[error(transparent)]
+    ExceededRatelimit(ratelimit::Error), // wraps the error type of `crate::ratelimit`
+    #[error("{why}")]
+    BadRequest { why: String }, // `why` is the entire displayed message
+}
--- a/crates/common/src/http.rs
+++ b/crates/common/src/http.rs
@ -1,4 +1,4 @@
-use crate::stats::{Gauge, IncrementingMetric};
+use crate::{errors::ClientError, stats::{Gauge, IncrementingMetric}};
 use derivative::Derivative;
 use log::debug;
 use proxy_wasm::{traits::Context, types::Status};
@ -37,16 +37,6 @@ impl<'a> CallArgs<'a> {
    }
 }

-#[derive(thiserror::Error, Debug)]
-pub enum ClientError {
-    #[error("Error dispatching HTTP call to `{upstream_name}/{path}`, error: {internal_status:?}")]
-    DispatchError {
-        upstream_name: String,
-        path: String,
-        internal_status: Status,
-    },
-}
-
 pub trait Client: Context {
    type CallContext: Debug;

--- a/crates/common/src/lib.rs
+++ b/crates/common/src/lib.rs
@ -10,3 +10,4 @@ pub mod ratelimit;
 pub mod routing;
 pub mod stats;
 pub mod tokenizer;
+pub mod errors;
--- a/crates/llm_gateway/Cargo.lock
+++ b/crates/llm_gateway/Cargo.lock
@ -228,6 +228,7 @@ dependencies = [
 "proxy-wasm",
 "rand",
 "serde",
+ "serde_json",
 "serde_yaml",
 "thiserror",
 "tiktoken-rs",
--- a/crates/llm_gateway/src/llm_filter_context.rs
+++ b/crates/llm_gateway/src/llm_filter_context.rs
@ -1,4 +1,4 @@
-use crate::llm_stream_context::LlmGatewayStreamContext;
+use crate::stream_context::StreamContext;
 use common::configuration::Configuration;
 use common::http::Client;
 use common::llm_providers::LlmProviders;
@ -28,19 +28,19 @@ impl WasmMetrics {
 }

 #[derive(Debug)]
-pub struct FilterCallContext {}
+pub struct CallContext {}

 #[derive(Debug)]
-pub struct LlmGatewayFilterContext {
+pub struct FilterContext {
    metrics: Rc<WasmMetrics>,
    // callouts stores token_id to request mapping that we use during #on_http_call_response to match the response to the request.
-    callouts: RefCell<HashMap<u32, FilterCallContext>>,
+    callouts: RefCell<HashMap<u32, CallContext>>,
    llm_providers: Option<Rc<LlmProviders>>,
 }

-impl LlmGatewayFilterContext {
-    pub fn new() -> LlmGatewayFilterContext {
-        LlmGatewayFilterContext {
+impl FilterContext {
+    pub fn new() -> FilterContext {
+        FilterContext {
            callouts: RefCell::new(HashMap::new()),
            metrics: Rc::new(WasmMetrics::new()),
            llm_providers: None,
@ -48,8 +48,8 @@ impl LlmGatewayFilterContext {
    }
 }

-impl Client for LlmGatewayFilterContext {
-    type CallContext = FilterCallContext;
+impl Client for FilterContext {
+    type CallContext = CallContext;

    fn callouts(&self) -> &RefCell<HashMap<u32, Self::CallContext>> {
        &self.callouts
@ -60,10 +60,10 @@ impl Client for LlmGatewayFilterContext {
    }
 }

-impl Context for LlmGatewayFilterContext {}
+impl Context for FilterContext {}

 // RootContext allows the Rust code to reach into the Envoy Config
-impl RootContext for LlmGatewayFilterContext {
+impl RootContext for FilterContext {
    fn on_configure(&mut self, _: usize) -> bool {
        let config_bytes = self
            .get_plugin_configuration()
@ -90,8 +90,7 @@ impl RootContext for LlmGatewayFilterContext {
            context_id
        );

-        // No StreamContext can be created until the Embedding Store is fully initialized.
-        Some(Box::new(LlmGatewayStreamContext::new(
+        Some(Box::new(StreamContext::new(
            context_id,
            Rc::clone(&self.metrics),
            Rc::clone(
--- a/crates/llm_gateway/src/lib.rs
+++ b/crates/llm_gateway/src/lib.rs
@ -1,13 +1,13 @@
-use llm_filter_context::LlmGatewayFilterContext;
+use filter_context::FilterContext;
 use proxy_wasm::traits::*;
 use proxy_wasm::types::*;

-mod llm_filter_context;
-mod llm_stream_context;
+mod filter_context;
+mod stream_context;

 proxy_wasm::main! {{
    proxy_wasm::set_log_level(LogLevel::Trace);
    proxy_wasm::set_root_context(|_| -> Box<dyn RootContext> {
-        Box::new(LlmGatewayFilterContext::new())
+        Box::new(FilterContext::new())
    });
 }}
--- a/crates/llm_gateway/src/llm_stream_context.rs
+++ b/crates/llm_gateway/src/llm_stream_context.rs
@ -1,4 +1,4 @@
-use crate::llm_filter_context::WasmMetrics;
+use crate::filter_context::WasmMetrics;
 use common::common_types::open_ai::{
    ArchState, ChatCompletionChunkResponse, ChatCompletionsRequest, ChatCompletionsResponse,
    Message, ToolCall, ToolCallState,
@ -8,6 +8,7 @@ use common::consts::{
    ARCH_PROVIDER_HINT_HEADER, ARCH_ROUTING_HEADER, ARCH_STATE_HEADER, CHAT_COMPLETIONS_PATH,
    RATELIMIT_SELECTOR_HEADER_KEY, REQUEST_ID_HEADER, USER_ROLE,
 };
+use common::errors::ServerError;
 use common::llm_providers::LlmProviders;
 use common::ratelimit::Header;
 use common::{ratelimit, routing, tokenizer};
@ -22,25 +23,12 @@ use std::rc::Rc;

 use common::stats::IncrementingMetric;

-#[derive(thiserror::Error, Debug)]
-pub enum ServerError {
-    #[error(transparent)]
-    Deserialization(serde_json::Error),
-    #[error("{0}")]
-    LogicError(String),
-    #[error(transparent)]
-    ExceededRatelimit(ratelimit::Error),
-    #[error("{why}")]
-    BadRequest { why: String },
-}
-
-pub struct LlmGatewayStreamContext {
+pub struct StreamContext {
    context_id: u32,
    metrics: Rc<WasmMetrics>,
    tool_calls: Option<Vec<ToolCall>>,
    tool_call_response: Option<String>,
    arch_state: Option<Vec<ArchState>>,
-    request_body_size: usize,
    ratelimit_selector: Option<Header>,
    streaming_response: bool,
    user_prompt: Option<Message>,
@ -52,17 +40,15 @@ pub struct LlmGatewayStreamContext {
    request_id: Option<String>,
 }

-impl LlmGatewayStreamContext {
-    #[allow(clippy::too_many_arguments)]
+impl StreamContext {
    pub fn new(context_id: u32, metrics: Rc<WasmMetrics>, llm_providers: Rc<LlmProviders>) -> Self {
-        LlmGatewayStreamContext {
+        StreamContext {
            context_id,
            metrics,
            chat_completions_request: None,
            tool_calls: None,
            tool_call_response: None,
            arch_state: None,
-            request_body_size: 0,
            ratelimit_selector: None,
            streaming_response: false,
            user_prompt: None,
@ -160,7 +146,7 @@ impl LlmGatewayStreamContext {
 }

 // HttpContext is the trait that allows the Rust code to interact with HTTP objects.
-impl HttpContext for LlmGatewayStreamContext {
+impl HttpContext for StreamContext {
    // Envoy's HTTP model is event driven. The WASM ABI has given implementors events to hook onto
    // the lifecycle of the http request and response.
    fn on_http_request_headers(&mut self, _num_headers: usize, _end_of_stream: bool) -> Action {
@ -198,8 +184,6 @@ impl HttpContext for LlmGatewayStreamContext {
            return Action::Continue;
        }

-        self.request_body_size = body_size;
-
        // Deserialize body into spec.
        // Currently OpenAI API.
        let mut deserialized_body: ChatCompletionsRequest =
@ -225,7 +209,6 @@ impl HttpContext for LlmGatewayStreamContext {
                    return Action::Pause;
                }
            };
-        self.is_chat_completions_request = true;

        // remove metadata from the request body
        deserialized_body.metadata = None;
@ -417,4 +400,4 @@ impl HttpContext for LlmGatewayStreamContext {
    }
 }

-impl Context for LlmGatewayStreamContext {}
+impl Context for StreamContext {}
--- a/crates/llm_gateway/tests/integration.rs
+++ b/crates/llm_gateway/tests/integration.rs
@ -7,7 +7,7 @@ use serial_test::serial;
 use std::path::Path;

 fn wasm_module() -> String {
-    let wasm_file = Path::new("target/wasm32-wasi/release/llm_gateway.wasm");
+    let wasm_file = Path::new("../target/wasm32-wasi/release/llm_gateway.wasm");
    assert!(
        wasm_file.exists(),
        "Run `cargo build --release --target=wasm32-wasi` first"
--- a/crates/prompt_gateway/Cargo.lock
+++ b/crates/prompt_gateway/Cargo.lock
@ -228,6 +228,7 @@ dependencies = [
 "proxy-wasm",
 "rand",
 "serde",
+ "serde_json",
 "serde_yaml",
 "thiserror",
 "tiktoken-rs",
--- a/crates/prompt_gateway/src/prompt_filter_context.rs
+++ b/crates/prompt_gateway/src/prompt_filter_context.rs
@ -1,6 +1,6 @@
-use crate::prompt_stream_context::PromptStreamContext;
+use crate::stream_context::StreamContext;
 use common::common_types::EmbeddingType;
-use common::configuration::{Configuration, GatewayMode, Overrides, PromptGuards, PromptTarget};
+use common::configuration::{Configuration, Overrides, PromptGuards, PromptTarget};
 use common::consts::ARCH_INTERNAL_CLUSTER_NAME;
 use common::consts::ARCH_UPSTREAM_HOST_HEADER;
 use common::consts::DEFAULT_EMBEDDING_MODEL;
@ -10,7 +10,6 @@ use common::embeddings::{
 };
 use common::http::CallArgs;
 use common::http::Client;
-use common::llm_providers::LlmProviders;
 use common::stats::Gauge;
 use common::stats::IncrementingMetric;
 use log::debug;
@ -45,31 +44,27 @@ pub struct FilterCallContext {
 }

 #[derive(Debug)]
-pub struct PromptGatewayFilterContext {
+pub struct FilterContext {
    metrics: Rc<WasmMetrics>,
    // callouts stores token_id to request mapping that we use during #on_http_call_response to match the response to the request.
    callouts: RefCell<HashMap<u32, FilterCallContext>>,
    overrides: Rc<Option<Overrides>>,
    system_prompt: Rc<Option<String>>,
    prompt_targets: Rc<HashMap<String, PromptTarget>>,
-    mode: GatewayMode,
    prompt_guards: Rc<PromptGuards>,
-    llm_providers: Option<Rc<LlmProviders>>,
    embeddings_store: Option<Rc<EmbeddingsStore>>,
    temp_embeddings_store: EmbeddingsStore,
 }

-impl PromptGatewayFilterContext {
-    pub fn new() -> PromptGatewayFilterContext {
-        PromptGatewayFilterContext {
+impl FilterContext {
+    pub fn new() -> FilterContext {
+        FilterContext {
            callouts: RefCell::new(HashMap::new()),
            metrics: Rc::new(WasmMetrics::new()),
            system_prompt: Rc::new(None),
            prompt_targets: Rc::new(HashMap::new()),
            overrides: Rc::new(None),
            prompt_guards: Rc::new(PromptGuards::default()),
-            mode: GatewayMode::Prompt,
-            llm_providers: None,
            embeddings_store: Some(Rc::new(HashMap::new())),
            temp_embeddings_store: HashMap::new(),
        }
@ -117,7 +112,7 @@ impl PromptGatewayFilterContext {
            Duration::from_secs(60),
        );

-        let call_context = crate::prompt_filter_context::FilterCallContext {
+        let call_context = crate::filter_context::FilterCallContext {
            prompt_target_name: String::from(prompt_target_name),
            embedding_type,
        };
@ -194,7 +189,7 @@ impl PromptGatewayFilterContext {
    }
 }

-impl Client for PromptGatewayFilterContext {
+impl Client for FilterContext {
    type CallContext = FilterCallContext;

    fn callouts(&self) -> &RefCell<HashMap<u32, Self::CallContext>> {
@ -206,7 +201,7 @@ impl Client for PromptGatewayFilterContext {
    }
 }

-impl Context for PromptGatewayFilterContext {
+impl Context for FilterContext {
    fn on_http_call_response(
        &mut self,
        token_id: u32,
@ -235,7 +230,7 @@ impl Context for PromptGatewayFilterContext {
 }

 // RootContext allows the Rust code to reach into the Envoy Config
-impl RootContext for PromptGatewayFilterContext {
+impl RootContext for FilterContext {
    fn on_configure(&mut self, _: usize) -> bool {
        let config_bytes = self
            .get_plugin_configuration()
@ -254,17 +249,11 @@ impl RootContext for PromptGatewayFilterContext {
        }
        self.system_prompt = Rc::new(config.system_prompt);
        self.prompt_targets = Rc::new(prompt_targets);
-        self.mode = config.mode.unwrap_or_default();

        if let Some(prompt_guards) = config.prompt_guards {
            self.prompt_guards = Rc::new(prompt_guards)
        }

-        match config.llm_providers.try_into() {
-            Ok(llm_providers) => self.llm_providers = Some(Rc::new(llm_providers)),
-            Err(err) => panic!("{err}"),
-        }
-
        true
    }

@ -274,12 +263,11 @@ impl RootContext for PromptGatewayFilterContext {
            context_id
        );

-        // No StreamContext can be created until the Embedding Store is fully initialized.
-        let embedding_store = match self.mode {
-            GatewayMode::Llm => None,
-            GatewayMode::Prompt => Some(Rc::clone(self.embeddings_store.as_ref().unwrap())),
+        let embedding_store = match self.embeddings_store.as_ref() {
+            None => return None,
+            Some(store) => Some(Rc::clone(store)),
        };
-        Some(Box::new(PromptStreamContext::new(
+        Some(Box::new(StreamContext::new(
            context_id,
            Rc::clone(&self.metrics),
            Rc::clone(&self.system_prompt),
@ -300,11 +288,8 @@ impl RootContext for PromptGatewayFilterContext {
    }

    fn on_tick(&mut self) {
-        debug!("starting up arch filter in mode: {:?}", self.mode);
-        if self.mode == GatewayMode::Prompt {
-            self.process_prompt_targets();
-        }
-
+        debug!("starting up arch filter in mode: prompt gateway mode");
+        self.process_prompt_targets();
        self.set_tick_period(Duration::from_secs(0));
    }
 }
--- a/crates/prompt_gateway/src/lib.rs
+++ b/crates/prompt_gateway/src/lib.rs
@ -1,13 +1,13 @@
-use prompt_filter_context::PromptGatewayFilterContext;
+use filter_context::FilterContext;
 use proxy_wasm::traits::*;
 use proxy_wasm::types::*;

-mod prompt_filter_context;
-mod prompt_stream_context;
+mod filter_context;
+mod stream_context;

 proxy_wasm::main! {{
    proxy_wasm::set_log_level(LogLevel::Trace);
    proxy_wasm::set_root_context(|_| -> Box<dyn RootContext> {
-        Box::new(PromptGatewayFilterContext::new())
+        Box::new(FilterContext::new())
    });
 }}
--- a/crates/prompt_gateway/src/prompt_stream_context.rs
+++ b/crates/prompt_gateway/src/prompt_stream_context.rs
@ -1,4 +1,4 @@
-use crate::prompt_filter_context::{EmbeddingsStore, WasmMetrics};
+use crate::filter_context::{EmbeddingsStore, WasmMetrics};
 use acap::cos;
 use common::common_types::open_ai::{
    ArchState, ChatCompletionTool, ChatCompletionsRequest, ChatCompletionsResponse, Choice,
@ -21,7 +21,8 @@ use common::consts::{
 use common::embeddings::{
    CreateEmbeddingRequest, CreateEmbeddingRequestInput, CreateEmbeddingResponse,
 };
-use common::http::{CallArgs, Client, ClientError};
+use common::errors::ClientError;
+use common::http::{CallArgs, Client};
 use common::stats::Gauge;
 use http::StatusCode;
 use log::{debug, info, warn};
@ -83,7 +84,7 @@ pub enum ServerError {
    NoMessagesFound { why: String },
 }

-pub struct PromptStreamContext {
+pub struct StreamContext {
    context_id: u32,
    metrics: Rc<WasmMetrics>,
    system_prompt: Rc<Option<String>>,
@ -104,8 +105,7 @@ pub struct PromptStreamContext {
    request_id: Option<String>,
 }

-impl PromptStreamContext {
-    #[allow(clippy::too_many_arguments)]
+impl StreamContext {
    pub fn new(
        context_id: u32,
        metrics: Rc<WasmMetrics>,
@ -115,7 +115,7 @@ impl PromptStreamContext {
        overrides: Rc<Option<Overrides>>,
        embeddings_store: Option<Rc<EmbeddingsStore>>,
    ) -> Self {
-        PromptStreamContext {
+        StreamContext {
            context_id,
            metrics,
            system_prompt,
@ -1049,7 +1049,7 @@ impl PromptStreamContext {
 }

 // HttpContext is the trait that allows the Rust code to interact with HTTP objects.
-impl HttpContext for PromptStreamContext {
+impl HttpContext for StreamContext {
    // Envoy's HTTP model is event driven. The WASM ABI has given implementors events to hook onto
    // the lifecycle of the http request and response.
    fn on_http_request_headers(&mut self, _num_headers: usize, _end_of_stream: bool) -> Action {
@ -1112,7 +1112,6 @@ impl HttpContext for PromptStreamContext {
                    return Action::Pause;
                }
            };
-        self.is_chat_completions_request = true;

        self.arch_state = match deserialized_body.metadata {
            Some(ref metadata) => {
@ -1363,7 +1362,7 @@ impl HttpContext for PromptStreamContext {
    }
 }

-impl Context for PromptStreamContext {
+impl Context for StreamContext {
    fn on_http_call_response(
        &mut self,
        token_id: u32,
@ -1409,7 +1408,7 @@ impl Context for PromptStreamContext {
    }
 }

-impl Client for PromptStreamContext {
+impl Client for StreamContext {
    type CallContext = StreamCallContext;

    fn callouts(&self) -> &RefCell<HashMap<u32, Self::CallContext>> {
--- a/crates/prompt_gateway/tests/integration.rs
+++ b/crates/prompt_gateway/tests/integration.rs
@ -17,7 +17,7 @@ use std::collections::HashMap;
 use std::path::Path;

 fn wasm_module() -> String {
-    let wasm_file = Path::new("target/wasm32-wasi/release/prompt_gateway.wasm");
+    let wasm_file = Path::new("../target/wasm32-wasi/release/prompt_gateway.wasm");
    assert!(
        wasm_file.exists(),
        "Run `cargo build --release --target=wasm32-wasi` first"