Use request/response types from hermesllm in the LLM gateway

This commit is contained in:
Adil Hafeez 2025-06-03 15:57:30 -07:00
parent f10e0fcece
commit b0c1e97dc5
No known key found for this signature in database
GPG key ID: 9B18EF7691369645
6 changed files with 35 additions and 14 deletions

View file

@ -773,7 +773,7 @@ static_resources:
- endpoint:
address:
socket_address:
address: 0.0.0.0
address: host.docker.internal
port_value: 9091
hostname: localhost

View file

@ -64,6 +64,8 @@ def docker_start_archgw_detached(
item for volume in volume_mappings for item in ("-v", volume)
]
print("using custom release path")
options = [
"docker",
"run",
@ -76,6 +78,7 @@ def docker_start_archgw_detached(
"--add-host",
"host.docker.internal:host-gateway",
ARCHGW_DOCKER_IMAGE,
"/Users/adilhafeez/src/intelligent-prompt-gateway/crates/target/wasm32-wasip1/release:/etc/envoy/proxy-wasm-plugins:ro",
]
result = subprocess.run(options, capture_output=True, text=True, check=False)

1
crates/Cargo.lock generated
View file

@ -1615,6 +1615,7 @@ dependencies = [
"common",
"derivative",
"governor",
"hermesllm",
"http 1.1.0",
"log",
"md5",

View file

@ -57,6 +57,12 @@ pub struct Message {
pub content: Option<ContentType>,
}
/// Options that accompany a streaming chat-completions request.
///
/// Carried in the optional `stream_options` field of `ChatCompletionsRequest`
/// and (de)serialized through the serde derives below.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StreamOptions {
/// NOTE(review): presumably asks the upstream provider to include token-usage
/// data in the streamed response — confirm against the provider's API docs.
pub include_usage: bool,
}
#[skip_serializing_none]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChatCompletionsRequest {
@ -70,6 +76,7 @@ pub struct ChatCompletionsRequest {
pub stop: Option<Vec<String>>,
pub presence_penalty: Option<f32>,
pub frequency_penalty: Option<f32>,
pub stream_options: Option<StreamOptions>,
}
impl Default for ChatCompletionsRequest {
@ -85,6 +92,7 @@ impl Default for ChatCompletionsRequest {
stop: None,
presence_penalty: None,
frequency_penalty: None,
stream_options: None,
}
}
}
@ -110,9 +118,9 @@ pub struct Choice {
/// Token counters carried on a chat-completions response.
///
/// NOTE(review): field meanings are inferred from their names (prompt,
/// completion, and total token counts) — confirm against the provider schema.
#[skip_serializing_none]
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct Usage {
// NOTE(review): each field is listed twice below, once as `u32` and once as
// `usize`. This looks like the removed and added sides of a diff rendered
// without +/- markers; a struct cannot declare the same field twice, so only
// one set (presumably the `usize` one — confirm) exists in the real file.
pub prompt_tokens: u32,
pub completion_tokens: u32,
pub total_tokens: u32,
pub prompt_tokens: usize,
pub completion_tokens: usize,
pub total_tokens: usize,
}
#[derive(Debug, Clone)]
@ -127,6 +135,7 @@ pub struct OpenAIRequestBuilder {
stop: Option<Vec<String>>,
presence_penalty: Option<f32>,
frequency_penalty: Option<f32>,
stream_options: Option<StreamOptions>,
}
impl OpenAIRequestBuilder {
@ -142,6 +151,7 @@ impl OpenAIRequestBuilder {
stop: None,
presence_penalty: None,
frequency_penalty: None,
stream_options: None,
}
}
@ -185,6 +195,12 @@ impl OpenAIRequestBuilder {
self
}
pub fn stream_options(mut self, include_usage: bool) -> Self {
self.stream = Some(true);
self.stream_options = Some(StreamOptions { include_usage });
self
}
pub fn build(self) -> Result<ChatCompletionsRequest, &'static str> {
let request = ChatCompletionsRequest {
model: self.model,
@ -197,6 +213,7 @@ impl OpenAIRequestBuilder {
stop: self.stop,
presence_penalty: self.presence_penalty,
frequency_penalty: self.frequency_penalty,
stream_options: self.stream_options,
};
Ok(request)
}

View file

@ -22,6 +22,7 @@ rand = "0.8.5"
thiserror = "1.0.64"
derivative = "2.2.0"
sha2 = "0.10.8"
hermesllm = { version = "0.1.0", path = "../hermesllm" }
[dev-dependencies]
proxy-wasm-test-framework = { git = "https://github.com/katanemo/test-framework.git", branch = "new" }

View file

@ -1,8 +1,5 @@
use crate::metrics::Metrics;
use common::api::open_ai::{
ChatCompletionStreamResponseServerEvents, ChatCompletionsRequest, ChatCompletionsResponse,
ContentType, Message, StreamOptions,
};
use common::api::open_ai::ChatCompletionStreamResponseServerEvents;
use common::configuration::{LlmProvider, LlmProviderType, Overrides};
use common::consts::{
ARCH_PROVIDER_HINT_HEADER, ARCH_ROUTING_HEADER, CHAT_COMPLETIONS_PATH, HEALTHZ_PATH,
@ -14,6 +11,10 @@ use common::ratelimit::Header;
use common::stats::{IncrementingMetric, RecordingMetric};
use common::tracing::{Event, Span, TraceData, Traceparent};
use common::{ratelimit, routing, tokenizer};
use hermesllm::providers::openai::types::ChatCompletionsRequest;
use hermesllm::providers::openai::types::{
ChatCompletionsResponse, ContentType, Message, StreamOptions,
};
use http::StatusCode;
use log::{debug, info, warn};
use proxy_wasm::hostcalls::get_current_time;
@ -302,10 +303,6 @@ impl HttpContext for StreamContext {
}
};
for message in deserialized_body.messages.iter_mut() {
message.model = None;
}
self.user_message = deserialized_body
.messages
.iter()
@ -355,10 +352,12 @@ impl HttpContext for StreamContext {
chat_completion_request_str
);
if deserialized_body.stream {
if deserialized_body.stream.unwrap_or_default() {
self.streaming_response = true;
}
if deserialized_body.stream && deserialized_body.stream_options.is_none() {
if deserialized_body.stream.unwrap_or_default()
&& deserialized_body.stream_options.is_none()
{
deserialized_body.stream_options = Some(StreamOptions {
include_usage: true,
});