Use request/response types from hermesllm in the LLM gateway

2026-06-17 15:25:17 +02:00 · 2025-06-03 15:57:30 -07:00 · 2025-06-03 15:57:30 -07:00 · b0c1e97dc5
commit b0c1e97dc5
parent f10e0fcece
6 changed files with 35 additions and 14 deletions
--- a/arch/envoy.template.yaml
+++ b/arch/envoy.template.yaml
@ -773,7 +773,7 @@ static_resources:
              - endpoint:
                  address:
                    socket_address:
-                      address: 0.0.0.0
+                      address: host.docker.internal
                      port_value: 9091
                  hostname: localhost

--- a/arch/tools/cli/docker_cli.py
+++ b/arch/tools/cli/docker_cli.py
@ -64,6 +64,8 @@ def docker_start_archgw_detached(
        item for volume in volume_mappings for item in ("-v", volume)
    ]

+    print("using custom release path")
+
    options = [
        "docker",
        "run",
@ -76,6 +78,7 @@ def docker_start_archgw_detached(
        "--add-host",
        "host.docker.internal:host-gateway",
        ARCHGW_DOCKER_IMAGE,
+        "/Users/adilhafeez/src/intelligent-prompt-gateway/crates/target/wasm32-wasip1/release:/etc/envoy/proxy-wasm-plugins:ro",
    ]

    result = subprocess.run(options, capture_output=True, text=True, check=False)
--- a/crates/Cargo.lock
+++ b/crates/Cargo.lock
@ -1615,6 +1615,7 @@ dependencies = [
 "common",
 "derivative",
 "governor",
+ "hermesllm",
 "http 1.1.0",
 "log",
 "md5",
--- a/crates/hermesllm/src/providers/openai/types.rs
+++ b/crates/hermesllm/src/providers/openai/types.rs
@ -57,6 +57,12 @@ pub struct Message {
    pub content: Option<ContentType>,
 }

+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct StreamOptions {
+  pub include_usage: bool,
+}
+
 #[skip_serializing_none]
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct ChatCompletionsRequest {
@ -70,6 +76,7 @@ pub struct ChatCompletionsRequest {
    pub stop: Option<Vec<String>>,
    pub presence_penalty: Option<f32>,
    pub frequency_penalty: Option<f32>,
+    pub stream_options: Option<StreamOptions>,
 }

 impl Default for ChatCompletionsRequest {
@ -85,6 +92,7 @@ impl Default for ChatCompletionsRequest {
            stop: None,
            presence_penalty: None,
            frequency_penalty: None,
+            stream_options: None,
        }
    }
 }
@ -110,9 +118,9 @@ pub struct Choice {
 #[skip_serializing_none]
 #[derive(Debug, Clone, Deserialize, Serialize)]
 pub struct Usage {
-    pub prompt_tokens: u32,
-    pub completion_tokens: u32,
-    pub total_tokens: u32,
+    pub prompt_tokens: usize,
+    pub completion_tokens: usize,
+    pub total_tokens: usize,
 }

 #[derive(Debug, Clone)]
@ -127,6 +135,7 @@ pub struct OpenAIRequestBuilder {
    stop: Option<Vec<String>>,
    presence_penalty: Option<f32>,
    frequency_penalty: Option<f32>,
+    stream_options: Option<StreamOptions>,
 }

 impl OpenAIRequestBuilder {
@ -142,6 +151,7 @@ impl OpenAIRequestBuilder {
            stop: None,
            presence_penalty: None,
            frequency_penalty: None,
+            stream_options: None,
        }
    }

@ -185,6 +195,12 @@ impl OpenAIRequestBuilder {
        self
    }

+    pub fn stream_options(mut self, include_usage: bool) -> Self {
+        self.stream = Some(true);
+        self.stream_options = Some(StreamOptions { include_usage });
+        self
+    }
+
    pub fn build(self) -> Result<ChatCompletionsRequest, &'static str> {
        let request = ChatCompletionsRequest {
            model: self.model,
@ -197,6 +213,7 @@ impl OpenAIRequestBuilder {
            stop: self.stop,
            presence_penalty: self.presence_penalty,
            frequency_penalty: self.frequency_penalty,
+            stream_options: self.stream_options,
        };
        Ok(request)
    }
--- a/crates/llm_gateway/Cargo.toml
+++ b/crates/llm_gateway/Cargo.toml
@ -22,6 +22,7 @@ rand = "0.8.5"
 thiserror = "1.0.64"
 derivative = "2.2.0"
 sha2 = "0.10.8"
+hermesllm = { version = "0.1.0", path = "../hermesllm" }

 [dev-dependencies]
 proxy-wasm-test-framework = { git = "https://github.com/katanemo/test-framework.git", branch = "new" }
--- a/crates/llm_gateway/src/stream_context.rs
+++ b/crates/llm_gateway/src/stream_context.rs
@ -1,8 +1,5 @@
 use crate::metrics::Metrics;
-use common::api::open_ai::{
-    ChatCompletionStreamResponseServerEvents, ChatCompletionsRequest, ChatCompletionsResponse,
-    ContentType, Message, StreamOptions,
-};
+use common::api::open_ai::ChatCompletionStreamResponseServerEvents;
 use common::configuration::{LlmProvider, LlmProviderType, Overrides};
 use common::consts::{
    ARCH_PROVIDER_HINT_HEADER, ARCH_ROUTING_HEADER, CHAT_COMPLETIONS_PATH, HEALTHZ_PATH,
@ -14,6 +11,10 @@ use common::ratelimit::Header;
 use common::stats::{IncrementingMetric, RecordingMetric};
 use common::tracing::{Event, Span, TraceData, Traceparent};
 use common::{ratelimit, routing, tokenizer};
+use hermesllm::providers::openai::types::ChatCompletionsRequest;
+use hermesllm::providers::openai::types::{
+    ChatCompletionsResponse, ContentType, Message, StreamOptions,
+};
 use http::StatusCode;
 use log::{debug, info, warn};
 use proxy_wasm::hostcalls::get_current_time;
@ -302,10 +303,6 @@ impl HttpContext for StreamContext {
                }
            };

-        for message in deserialized_body.messages.iter_mut() {
-            message.model = None;
-        }
-
        self.user_message = deserialized_body
            .messages
            .iter()
@ -355,10 +352,12 @@ impl HttpContext for StreamContext {
            chat_completion_request_str
        );

-        if deserialized_body.stream {
+        if deserialized_body.stream.unwrap_or_default() {
            self.streaming_response = true;
        }
-        if deserialized_body.stream && deserialized_body.stream_options.is_none() {
+        if deserialized_body.stream.unwrap_or_default()
+            && deserialized_body.stream_options.is_none()
+        {
            deserialized_body.stream_options = Some(StreamOptions {
                include_usage: true,
            });