Introduce hermesllm library to handle llm message translation (#501)

This commit is contained in:
Adil Hafeez 2025-06-10 12:53:27 -07:00 committed by GitHub
parent 96b583c819
commit 6c53510f49
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
33 changed files with 1693 additions and 690 deletions

View file

@ -329,7 +329,7 @@ $ archgw up --service archgw --foreground
...
```
Log level can be changed to debug to get more details. To enable debug logs edit (Dockerfile)[arch/Dockerfile], change the log level `--component-log-level wasm:info` to `--component-log-level wasm:debug`. And after that you need to rebuild docker image and restart the arch gateway using following set of commands,
Log level can be changed to debug to get more details. To enable debug logs edit (supervisord.conf)[arch/supervisord.conf], change the log level `--component-log-level wasm:info` to `--component-log-level wasm:debug`. And after that you need to rebuild docker image and restart the arch gateway using following set of commands,
```
# make sure you are at the root of the repo

View file

@ -9,7 +9,7 @@ stdout_logfile_maxbytes=0
stderr_logfile_maxbytes=0
[program:envoy]
command=/bin/sh -c "python /app/config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml 2>&1 | tee /var/log//envoy.log"
command=/bin/sh -c "python /app/config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --component-log-level wasm:info 2>&1 | tee /var/log//envoy.log"
stdout_logfile=/dev/stdout
redirect_stderr=true
stdout_logfile_maxbytes=0

1147
crates/Cargo.lock generated

File diff suppressed because it is too large Load diff

View file

@ -1,3 +1,3 @@
[workspace]
resolver = "2"
members = ["llm_gateway", "prompt_gateway", "common", "brightstaff"]
members = ["llm_gateway", "prompt_gateway", "common", "brightstaff", "hermesllm"]

View file

@ -10,6 +10,7 @@ eventsource-client = "0.15.0"
eventsource-stream = "0.2.3"
futures = "0.3.31"
futures-util = "0.3.31"
hermesllm = { version = "0.1.0", path = "../hermesllm" }
http-body = "1.0.1"
http-body-util = "0.1.3"
hyper = { version = "1.6.0", features = ["full"] }

View file

@ -1,14 +1,13 @@
use std::sync::Arc;
use bytes::Bytes;
use common::api::open_ai::ChatCompletionsRequest;
use common::consts::ARCH_PROVIDER_HINT_HEADER;
use hermesllm::providers::openai::types::ChatCompletionsRequest;
use http_body_util::combinators::BoxBody;
use http_body_util::{BodyExt, Full, StreamBody};
use hyper::body::Frame;
use hyper::header::{self};
use hyper::{Request, Response, StatusCode};
use serde_json::Value;
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;
use tokio_stream::StreamExt;
@ -32,13 +31,15 @@ pub async fn chat_completions(
let chat_request_bytes = request.collect().await?.to_bytes();
let chat_completion_request: ChatCompletionsRequest =
match serde_json::from_slice(&chat_request_bytes) {
match ChatCompletionsRequest::try_from(chat_request_bytes.as_ref()) {
Ok(request) => request,
Err(err) => {
let v: Value = serde_json::from_slice(&chat_request_bytes).unwrap();
warn!(
"arch-router request body string: {}",
String::from_utf8_lossy(&chat_request_bytes)
);
let err_msg = format!("Failed to parse request body: {}", err);
warn!("{}", err_msg);
warn!("arch-router request body: {}", v.to_string());
let mut bad_request = Response::new(full(err_msg));
*bad_request.status_mut() = StatusCode::BAD_REQUEST;
return Ok(bad_request);

View file

@ -1,6 +1,6 @@
use bytes::Bytes;
use common::api::open_ai::Models;
use common::configuration::LlmProvider;
use common::configuration::{IntoModels, LlmProvider};
use hermesllm::providers::openai::types::Models;
use http_body_util::{combinators::BoxBody, BodyExt, Full};
use hyper::{Response, StatusCode};
use serde_json;
@ -11,7 +11,7 @@ pub async fn list_models(
) -> Response<BoxBody<Bytes, hyper::Error>> {
let prov = llm_providers.clone();
let providers = (*prov).clone();
let openai_models = Models::from(providers);
let openai_models: Models = providers.into_models();
match serde_json::to_string(&openai_models) {
Ok(json) => {

View file

@ -1,10 +1,10 @@
use std::sync::Arc;
use common::{
api::open_ai::{ChatCompletionsResponse, ContentType, Message},
configuration::{LlmProvider, LlmRoute},
consts::ARCH_PROVIDER_HINT_HEADER,
};
use hermesllm::providers::openai::types::{ChatCompletionsResponse, ContentType, Message};
use hyper::header;
use thiserror::Error;
use tracing::{debug, info, warn};
@ -136,6 +136,11 @@ impl RouterService {
}
};
if chat_completion_response.choices.is_empty() {
warn!("No choices in router response: {}", body);
return Ok(None);
}
if let Some(ContentType::Text(content)) =
&chat_completion_response.choices[0].message.content
{

View file

@ -1,4 +1,4 @@
use common::api::open_ai::{ChatCompletionsRequest, Message};
use hermesllm::providers::openai::types::{ChatCompletionsRequest, Message};
use thiserror::Error;
#[derive(Debug, Error)]

View file

@ -1,8 +1,8 @@
use common::{
api::open_ai::{ChatCompletionsRequest, ContentType, Message},
configuration::LlmRoute,
consts::{SYSTEM_ROLE, TOOL_ROLE, USER_ROLE},
};
use hermesllm::providers::openai::types::{ChatCompletionsRequest, ContentType, Message};
use serde::{Deserialize, Serialize};
use tracing::{debug, warn};
@ -121,11 +121,13 @@ impl RouterModel for RouterModelV1 {
.iter()
.rev()
.map(|message| {
Message::new(
message.role.clone(),
Message {
role: message.role.clone(),
// we can unwrap here because we have already filtered out messages without content
message.content.as_ref().unwrap().to_string(),
)
content: Some(ContentType::Text(
message.content.as_ref().unwrap().to_string(),
)),
}
})
.collect::<Vec<Message>>();
@ -141,14 +143,8 @@ impl RouterModel for RouterModelV1 {
messages: vec![Message {
content: Some(ContentType::Text(messages_content)),
role: USER_ROLE.to_string(),
model: None,
tool_calls: None,
tool_call_id: None,
}],
tools: None,
stream: false,
stream_options: None,
metadata: None,
..Default::default()
}
}

View file

@ -18,6 +18,7 @@ serde_json = "1.0"
hex = "0.4.3"
urlencoding = "2.1.3"
url = "2.5.4"
hermesllm = { version = "0.1.0", path = "../hermesllm" }
[dev-dependencies]
pretty_assertions = "1.4.1"

View file

@ -1,3 +1,4 @@
use hermesllm::providers::openai::types::{ModelDetail, ModelObject, Models};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fmt::Display;
@ -206,6 +207,29 @@ pub struct LlmProvider {
pub usage: Option<String>,
}
pub trait IntoModels {
fn into_models(self) -> Models;
}
impl IntoModels for Vec<LlmProvider> {
fn into_models(self) -> Models {
let data = self
.iter()
.map(|provider| ModelDetail {
id: provider.name.clone(),
object: "model".to_string(),
created: 0,
owned_by: "system".to_string(),
})
.collect();
Models {
object: ModelObject::List,
data,
}
}
}
impl Default for LlmProvider {
fn default() -> Self {
Self {

View file

@ -1,6 +1,7 @@
use proxy_wasm::types::Status;
use crate::{api::open_ai::ChatCompletionChunkResponseError, ratelimit};
use hermesllm::providers::openai::types::OpenAIError;
#[derive(thiserror::Error, Debug)]
pub enum ClientError {
@ -39,4 +40,6 @@ pub enum ServerError {
BadRequest { why: String },
#[error("error in streaming response")]
Streaming(#[from] ChatCompletionChunkResponseError),
#[error("error parsing openai message: {0}")]
OpenAIPError(#[from] OpenAIError),
}

View file

@ -14,7 +14,7 @@ pub fn token_count(model_name: &str, text: &str) -> Result<usize, String> {
);
"gpt-4"
}
true => model_name
true => model_name,
};
// Consideration: is it more expensive to instantiate the BPE object every time, or to contend the singleton?

View file

@ -0,0 +1,10 @@
[package]
name = "hermesllm"
version = "0.1.0"
edition = "2021"
[dependencies]
serde = {version = "1.0.219", features = ["derive"]}
serde_json = "1.0.140"
serde_with = "3.12.0"
thiserror = "2.0.12"

View file

@ -0,0 +1,63 @@
# hermesllm
A Rust library for translating LLM (Large Language Model) API requests and responses between Mistral, Groq, Gemini, Deepseek, OpenAI, and other provider-compliant formats.
## Features
- Unified types for chat completions and model metadata across multiple LLM providers
- Builder-pattern API for constructing requests in an idiomatic Rust style
- Easy conversion between provider formats
- Streaming and non-streaming response support
## Supported Providers
- Mistral
- Deepseek
- Groq
- Gemini
- OpenAI
- Claude
- Github
## Installation
Add the following to your `Cargo.toml`:
```toml
[dependencies]
hermesllm = { git = "https://github.com/katanemo/archgw", subdir = "crates/hermesllm" }
```
_Replace the path with the appropriate location if using as a workspace member or published crate._
## Usage
Construct a chat completion request using the builder pattern:
```rust
use hermesllm::Provider;
use hermesllm::providers::openai::types::ChatCompletionsRequest;
let request = ChatCompletionsRequest::builder("gpt-3.5-turbo", vec![Message::new("Hi".to_string())])
.build()
.expect("Failed to build OpenAIRequest");
// Convert to bytes for a specific provider
let bytes = request.to_bytes(Provider::OpenAI)?;
```
## API Overview
- `Provider`: Enum listing all supported LLM providers.
- `ChatCompletionsRequest`: Builder-pattern struct for creating chat completion requests.
- `ChatCompletionsResponse`: Struct for parsing responses.
- Streaming support via `SseChatCompletionIter`.
- Error handling via `OpenAIError`.
## Contributing
Contributions are welcome! Please open issues or pull requests for bug fixes, new features, or provider integrations.
## License
This project is licensed under the terms of the [MIT License](../LICENSE).

View file

@ -0,0 +1,79 @@
//! hermesllm: A library for translating LLM API requests and responses
//! between Mistral, Grok, Gemini, and OpenAI-compliant formats.
use std::fmt::Display;
pub mod providers;
pub enum Provider {
Arch,
Mistral,
Deepseek,
Groq,
Gemini,
OpenAI,
Claude,
Github,
}
impl From<&str> for Provider {
fn from(value: &str) -> Self {
match value.to_lowercase().as_str() {
"arch" => Provider::Arch,
"mistral" => Provider::Mistral,
"deepseek" => Provider::Deepseek,
"groq" => Provider::Groq,
"gemini" => Provider::Gemini,
"openai" => Provider::OpenAI,
"claude" => Provider::Claude,
"github" => Provider::Github,
_ => panic!("Unknown provider: {}", value),
}
}
}
impl Display for Provider {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Provider::Arch => write!(f, "Arch"),
Provider::Mistral => write!(f, "Mistral"),
Provider::Deepseek => write!(f, "Deepseek"),
Provider::Groq => write!(f, "Groq"),
Provider::Gemini => write!(f, "Gemini"),
Provider::OpenAI => write!(f, "OpenAI"),
Provider::Claude => write!(f, "Claude"),
Provider::Github => write!(f, "Github"),
}
}
}
#[cfg(test)]
mod tests {
use crate::providers::openai::types::{ChatCompletionsRequest, Message};
#[test]
fn openai_builder() {
let request =
ChatCompletionsRequest::builder("gpt-3.5-turbo", vec![Message::new("Hi".to_string())])
.temperature(0.7)
.top_p(0.9)
.n(1)
.max_tokens(100)
.stream(false)
.stop(vec!["\n".to_string()])
.presence_penalty(0.0)
.frequency_penalty(0.0)
.build()
.expect("Failed to build OpenAIRequest");
assert_eq!(request.model, "gpt-3.5-turbo");
assert_eq!(request.temperature, Some(0.7));
assert_eq!(request.top_p, Some(0.9));
assert_eq!(request.n, Some(1));
assert_eq!(request.max_tokens, Some(100));
assert_eq!(request.stream, Some(false));
assert_eq!(request.stop, Some(vec!["\n".to_string()]));
assert_eq!(request.presence_penalty, Some(0.0));
assert_eq!(request.frequency_penalty, Some(0.0));
}
}

View file

@ -0,0 +1 @@
pub mod types;

View file

@ -0,0 +1,19 @@
use crate::providers::openai::types::{ChatCompletionsRequest, ChatCompletionsResponse};
pub use crate::providers::openai::types::{Choice, Message, Usage};
use serde::{Deserialize, Serialize};
use serde_with::skip_serializing_none;
#[skip_serializing_none]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DeepSeekRequest {
#[serde(flatten)]
pub base: ChatCompletionsRequest,
}
#[skip_serializing_none]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DeepSeekResponse {
#[serde(flatten)]
pub base: ChatCompletionsResponse,
}

View file

@ -0,0 +1,2 @@
pub mod deepseek;
pub mod openai;

View file

@ -0,0 +1,113 @@
use serde_json::Value;
use crate::providers::openai::types::{ChatCompletionsRequest, Message, StreamOptions};
#[derive(Debug, Clone)]
pub struct OpenAIRequestBuilder {
model: String,
messages: Vec<Message>,
temperature: Option<f32>,
top_p: Option<f32>,
n: Option<u32>,
max_tokens: Option<u32>,
stream: Option<bool>,
stop: Option<Vec<String>>,
presence_penalty: Option<f32>,
frequency_penalty: Option<f32>,
stream_options: Option<StreamOptions>,
tools: Option<Vec<Value>>,
}
impl OpenAIRequestBuilder {
pub fn new(model: impl Into<String>, messages: Vec<Message>) -> Self {
Self {
model: model.into(),
messages,
temperature: None,
top_p: None,
n: None,
max_tokens: None,
stream: None,
stop: None,
presence_penalty: None,
frequency_penalty: None,
stream_options: None,
tools: None,
}
}
pub fn temperature(mut self, temperature: f32) -> Self {
self.temperature = Some(temperature);
self
}
pub fn top_p(mut self, top_p: f32) -> Self {
self.top_p = Some(top_p);
self
}
pub fn n(mut self, n: u32) -> Self {
self.n = Some(n);
self
}
pub fn max_tokens(mut self, max_tokens: u32) -> Self {
self.max_tokens = Some(max_tokens);
self
}
pub fn stream(mut self, stream: bool) -> Self {
self.stream = Some(stream);
self
}
pub fn stop(mut self, stop: Vec<String>) -> Self {
self.stop = Some(stop);
self
}
pub fn presence_penalty(mut self, presence_penalty: f32) -> Self {
self.presence_penalty = Some(presence_penalty);
self
}
pub fn frequency_penalty(mut self, frequency_penalty: f32) -> Self {
self.frequency_penalty = Some(frequency_penalty);
self
}
pub fn stream_options(mut self, include_usage: bool) -> Self {
self.stream = Some(true);
self.stream_options = Some(StreamOptions { include_usage });
self
}
pub fn tools(mut self, tools: Vec<Value>) -> Self {
self.tools = Some(tools);
self
}
pub fn build(self) -> Result<ChatCompletionsRequest, &'static str> {
let request = ChatCompletionsRequest {
model: self.model,
messages: self.messages,
temperature: self.temperature,
top_p: self.top_p,
n: self.n,
max_tokens: self.max_tokens,
stream: self.stream,
stop: self.stop,
presence_penalty: self.presence_penalty,
frequency_penalty: self.frequency_penalty,
stream_options: self.stream_options,
tools: self.tools,
};
Ok(request)
}
}
impl ChatCompletionsRequest {
pub fn builder(model: impl Into<String>, messages: Vec<Message>) -> OpenAIRequestBuilder {
OpenAIRequestBuilder::new(model, messages)
}
}

View file

@ -0,0 +1,2 @@
pub mod builder;
pub mod types;

View file

@ -0,0 +1,497 @@
use std::fmt::Display;
use serde::{Deserialize, Serialize};
use serde_json::Value;
use serde_with::skip_serializing_none;
use std::convert::TryFrom;
use std::str;
use thiserror::Error;
use crate::Provider;
#[derive(Debug, Error)]
pub enum OpenAIError {
#[error("json error: {0}")]
JsonParseError(#[from] serde_json::Error),
#[error("utf8 parsing error: {0}")]
Utf8Error(#[from] std::str::Utf8Error),
#[error("invalid streaming data err {source}, data: {data}")]
InvalidStreamingData {
source: serde_json::Error,
data: String,
},
#[error("unsupported provider: {provider}")]
UnsupportedProvider { provider: String },
}
type Result<T> = std::result::Result<T, OpenAIError>;
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum MultiPartContentType {
#[serde(rename = "text")]
Text,
#[serde(rename = "image_url")]
ImageUrl,
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct MultiPartContent {
pub text: Option<String>,
#[serde(rename = "type")]
pub content_type: MultiPartContentType,
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(untagged)]
pub enum ContentType {
Text(String),
MultiPart(Vec<MultiPartContent>),
}
impl Display for ContentType {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
ContentType::Text(text) => write!(f, "{}", text),
ContentType::MultiPart(multi_part) => {
let text_parts: Vec<String> = multi_part
.iter()
.filter_map(|part| {
if part.content_type == MultiPartContentType::Text {
part.text.clone()
} else if part.content_type == MultiPartContentType::ImageUrl {
// skip image URLs or their data in text representation
None
} else {
panic!("Unsupported content type: {:?}", part.content_type);
}
})
.collect();
let combined_text = text_parts.join("\n");
write!(f, "{}", combined_text)
}
}
}
}
#[skip_serializing_none]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Message {
pub role: String,
pub content: Option<ContentType>,
}
impl Message {
pub fn new(content: String) -> Self {
Self {
role: "user".to_string(),
content: Some(ContentType::Text(content)),
}
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StreamOptions {
pub include_usage: bool,
}
#[skip_serializing_none]
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct ChatCompletionsRequest {
pub model: String,
pub messages: Vec<Message>,
pub temperature: Option<f32>,
pub top_p: Option<f32>,
pub n: Option<u32>,
pub max_tokens: Option<u32>,
pub stream: Option<bool>,
pub stop: Option<Vec<String>>,
pub presence_penalty: Option<f32>,
pub frequency_penalty: Option<f32>,
pub stream_options: Option<StreamOptions>,
pub tools: Option<Vec<Value>>,
}
impl TryFrom<&[u8]> for ChatCompletionsRequest {
type Error = OpenAIError;
fn try_from(bytes: &[u8]) -> Result<Self> {
serde_json::from_slice(bytes).map_err(OpenAIError::from)
}
}
#[skip_serializing_none]
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct ChatCompletionsResponse {
pub id: String,
pub object: String,
pub created: u64,
pub choices: Vec<Choice>,
pub usage: Option<Usage>,
}
impl TryFrom<&[u8]> for ChatCompletionsResponse {
type Error = OpenAIError;
fn try_from(bytes: &[u8]) -> Result<Self> {
serde_json::from_slice(bytes).map_err(OpenAIError::from)
}
}
impl<'a> TryFrom<(&'a [u8], &'a Provider)> for ChatCompletionsResponse {
type Error = OpenAIError;
fn try_from(input: (&'a [u8], &'a Provider)) -> Result<Self> {
// Use input.provider as needed, if necessary
serde_json::from_slice(input.0).map_err(OpenAIError::from)
}
}
impl ChatCompletionsRequest {
pub fn to_bytes(&self, provider: Provider) -> Result<Vec<u8>> {
match provider {
Provider::OpenAI
| Provider::Arch
| Provider::Deepseek
| Provider::Mistral
| Provider::Groq
| Provider::Gemini
| Provider::Claude => serde_json::to_vec(self).map_err(OpenAIError::from),
_ => Err(OpenAIError::UnsupportedProvider {
provider: provider.to_string(),
}),
}
}
}
#[skip_serializing_none]
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct Choice {
pub index: u32,
pub message: Message,
pub finish_reason: Option<String>,
}
#[skip_serializing_none]
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct Usage {
pub prompt_tokens: usize,
pub completion_tokens: usize,
pub total_tokens: usize,
}
#[skip_serializing_none]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DeltaMessage {
pub role: Option<String>,
pub content: Option<ContentType>,
}
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct StreamChoice {
pub index: u32,
pub delta: DeltaMessage,
pub finish_reason: Option<String>,
}
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct ChatCompletionStreamResponse {
pub id: String,
pub object: String,
pub created: u64,
pub model: String,
pub choices: Vec<StreamChoice>,
pub usage: Option<Usage>,
}
pub struct SseChatCompletionIter<I>
where
I: Iterator,
I::Item: AsRef<str>,
{
lines: I,
}
impl<I> SseChatCompletionIter<I>
where
I: Iterator,
I::Item: AsRef<str>,
{
pub fn new(lines: I) -> Self {
Self { lines }
}
}
impl<I> Iterator for SseChatCompletionIter<I>
where
I: Iterator,
I::Item: AsRef<str>,
{
type Item = Result<ChatCompletionStreamResponse>;
fn next(&mut self) -> Option<Self::Item> {
for line in &mut self.lines {
let line = line.as_ref();
if let Some(data) = line.strip_prefix("data: ") {
let data = data.trim();
if data == "[DONE]" {
return None;
}
if data == r#"{"type": "ping"}"# {
continue; // Skip ping messages - that is usually from anthropic
}
return Some(
serde_json::from_str::<ChatCompletionStreamResponse>(data).map_err(|e| {
OpenAIError::InvalidStreamingData {
source: e,
data: data.to_string(),
}
}),
);
}
}
None
}
}
impl<'a> TryFrom<(&'a [u8], &'a Provider)> for SseChatCompletionIter<str::Lines<'a>> {
type Error = OpenAIError;
fn try_from(input: (&'a [u8], &'a Provider)) -> Result<Self> {
let s = std::str::from_utf8(input.0)?;
// Use input.provider as needed
Ok(SseChatCompletionIter::new(s.lines()))
}
}
impl<'a> TryFrom<&'a [u8]> for SseChatCompletionIter<str::Lines<'a>> {
type Error = OpenAIError;
fn try_from(bytes: &'a [u8]) -> Result<Self> {
let s = std::str::from_utf8(bytes)?;
Ok(SseChatCompletionIter::new(s.lines()))
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelDetail {
pub id: String,
pub object: String,
pub created: usize,
pub owned_by: String,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ModelObject {
#[serde(rename = "list")]
List,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Models {
pub object: ModelObject,
pub data: Vec<ModelDetail>,
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_content_type_display() {
let text_content = ContentType::Text("Hello, world!".to_string());
assert_eq!(text_content.to_string(), "Hello, world!");
let multi_part_content = ContentType::MultiPart(vec![
MultiPartContent {
text: Some("This is a text part.".to_string()),
content_type: MultiPartContentType::Text,
},
MultiPartContent {
text: Some("https://example.com/image.png".to_string()),
content_type: MultiPartContentType::ImageUrl,
},
]);
assert_eq!(multi_part_content.to_string(), "This is a text part.");
}
#[test]
fn test_chat_completions_request_text_type_array() {
const CHAT_COMPLETIONS_REQUEST: &str = r#"
{
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": "What city do you want to know the weather for?"
},
{
"type": "text",
"text": "hello world"
}
]
}
]
}
"#;
let chat_completions_request: ChatCompletionsRequest =
serde_json::from_str(CHAT_COMPLETIONS_REQUEST).unwrap();
assert_eq!(chat_completions_request.model, "gpt-3.5-turbo");
if let Some(ContentType::MultiPart(multi_part_content)) =
chat_completions_request.messages[0].content.as_ref()
{
assert_eq!(multi_part_content.len(), 2);
assert_eq!(
multi_part_content[0].content_type,
MultiPartContentType::Text
);
assert_eq!(
multi_part_content[0].text,
Some("What city do you want to know the weather for?".to_string())
);
assert_eq!(
multi_part_content[1].content_type,
MultiPartContentType::Text
);
assert_eq!(multi_part_content[1].text, Some("hello world".to_string()));
} else {
panic!("Expected MultiPartContent");
}
}
#[test]
fn test_sse_streaming() {
let json_data = r#"data: {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1700000000,"model":"gpt-3.5-turbo","choices":[{"index":0,"delta":{"role":"assistant"},"finish_reason":null}]}
data: {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1700000000,"model":"gpt-3.5-turbo","choices":[{"index":0,"delta":{"content":"Hello, how can I help you today?"},"finish_reason":null}]}
data: [DONE]"#;
let iter = SseChatCompletionIter::new(json_data.lines());
println!("Testing SSE Streaming");
for item in iter {
match item {
Ok(response) => {
println!("Received response: {:?}", response);
if response.choices.is_empty() {
continue;
}
for choice in response.choices {
if let Some(content) = choice.delta.content {
println!("Content: {}", content);
}
}
}
Err(e) => {
println!("Error parsing JSON: {}", e);
return;
}
}
}
}
#[test]
fn test_sse_streaming_try_from_bytes() {
let json_data = r#"data: {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1700000000,"model":"gpt-3.5-turbo","choices":[{"index":0,"delta":{"role":"assistant"},"finish_reason":null}]}
data: {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1700000000,"model":"gpt-3.5-turbo","choices":[{"index":0,"delta":{"content":"Hello, how can I help you today?"},"finish_reason":null}]}
data: [DONE]"#;
let iter = SseChatCompletionIter::try_from(json_data.as_bytes())
.expect("Failed to create SSE iterator");
println!("Testing SSE Streaming");
for item in iter {
match item {
Ok(response) => {
println!("Received response: {:?}", response);
if response.choices.is_empty() {
continue;
}
for choice in response.choices {
if let Some(content) = choice.delta.content {
println!("Content: {}", content);
}
}
}
Err(e) => {
println!("Error parsing JSON: {}", e);
return;
}
}
}
}
#[test]
fn parse_chat_completions_request() {
const CHAT_COMPLETIONS_REQUEST: &str = r#"
{
"model": "None",
"messages": [
{
"role": "user",
"content": "how is the weather in seattle"
}
],
"stream": true
} "#;
let _chat_completions_request: ChatCompletionsRequest =
ChatCompletionsRequest::try_from(CHAT_COMPLETIONS_REQUEST.as_bytes())
.expect("Failed to parse ChatCompletionsRequest");
}
#[test]
fn stream_chunk_parse_claude() {
const CHUNK_RESPONSE: &str = r#"data: {"id":"msg_01DZDMxYSgq8aPQxMQoBv6Kb","choices":[{"index":0,"delta":{"role":"assistant"}}],"created":1747685264,"model":"claude-3-7-sonnet-latest","object":"chat.completion.chunk"}
data: {"type": "ping"}
data: {"id":"msg_01DZDMxYSgq8aPQxMQoBv6Kb","choices":[{"index":0,"delta":{"content":"Hello!"}}],"created":1747685264,"model":"claude-3-7-sonnet-latest","object":"chat.completion.chunk"}
data: {"id":"msg_01DZDMxYSgq8aPQxMQoBv6Kb","choices":[{"index":0,"delta":{"content":" How can I assist you today? Whether"}}],"created":1747685264,"model":"claude-3-7-sonnet-latest","object":"chat.completion.chunk"}
data: {"id":"msg_01DZDMxYSgq8aPQxMQoBv6Kb","choices":[{"index":0,"delta":{"content":" you have a question, need information"}}],"created":1747685264,"model":"claude-3-7-sonnet-latest","object":"chat.completion.chunk"}
data: {"id":"msg_01DZDMxYSgq8aPQxMQoBv6Kb","choices":[{"index":0,"delta":{"content":", or just want to chat about"}}],"created":1747685264,"model":"claude-3-7-sonnet-latest","object":"chat.completion.chunk"}
data: {"id":"msg_01DZDMxYSgq8aPQxMQoBv6Kb","choices":[{"index":0,"delta":{"content":" something, I'm here to help. What woul"}}],"created":1747685264,"model":"claude-3-7-sonnet-latest","object":"chat.completion.chunk"}
data: {"id":"msg_01DZDMxYSgq8aPQxMQoBv6Kb","choices":[{"index":0,"delta":{"content":"d you like to talk about?"}}],"created":1747685264,"model":"claude-3-7-sonnet-latest","object":"chat.completion.chunk"}
data: {"id":"msg_01DZDMxYSgq8aPQxMQoBv6Kb","choices":[{"index":0,"delta":{},"finish_reason":"stop"}],"created":1747685264,"model":"claude-3-7-sonnet-latest","object":"chat.completion.chunk"}
data: [DONE]
"#;
let iter = SseChatCompletionIter::try_from(CHUNK_RESPONSE.as_bytes());
assert!(iter.is_ok(), "Failed to create SSE iterator");
let iter: SseChatCompletionIter<str::Lines<'_>> = iter.unwrap();
let all_text: Vec<String> = iter
.map(|item| {
let response = item.expect("Failed to parse response");
response
.choices
.into_iter()
.filter_map(|choice| choice.delta.content)
.map(|content| content.to_string())
.collect::<String>()
})
.collect();
assert_eq!(
all_text.len(),
8,
"Expected 8 chunks of text, but got {}",
all_text.len()
);
assert_eq!(
all_text.join(""),
"Hello! How can I assist you today? Whether you have a question, need information, or just want to chat about something, I'm here to help. What would you like to talk about?"
);
}
}

View file

@ -22,6 +22,7 @@ rand = "0.8.5"
thiserror = "1.0.64"
derivative = "2.2.0"
sha2 = "0.10.8"
hermesllm = { version = "0.1.0", path = "../hermesllm" }
[dev-dependencies]
proxy-wasm-test-framework = { git = "https://github.com/katanemo/test-framework.git", branch = "new" }

View file

@ -1,8 +1,4 @@
use crate::metrics::Metrics;
use common::api::open_ai::{
ChatCompletionStreamResponseServerEvents, ChatCompletionsRequest, ChatCompletionsResponse,
ContentType, Message, StreamOptions,
};
use common::configuration::{LlmProvider, LlmProviderType, Overrides};
use common::consts::{
ARCH_PROVIDER_HINT_HEADER, ARCH_ROUTING_HEADER, CHAT_COMPLETIONS_PATH, HEALTHZ_PATH,
@ -14,6 +10,11 @@ use common::ratelimit::Header;
use common::stats::{IncrementingMetric, RecordingMetric};
use common::tracing::{Event, Span, TraceData, Traceparent};
use common::{ratelimit, routing, tokenizer};
use hermesllm::providers::openai::types::{ChatCompletionsRequest, SseChatCompletionIter};
use hermesllm::providers::openai::types::{
ChatCompletionsResponse, ContentType, Message, StreamOptions,
};
use hermesllm::Provider;
use http::StatusCode;
use log::{debug, info, warn};
use proxy_wasm::hostcalls::get_current_time;
@ -201,14 +202,15 @@ impl HttpContext for StreamContext {
return Action::Continue;
}
let routing_header_value = self.get_http_request_header(ARCH_ROUTING_HEADER);
let use_agent_orchestrator = match self.overrides.as_ref() {
Some(overrides) => overrides.use_agent_orchestrator.unwrap_or_default(),
None => false,
};
if let Some(routing_header_value) = routing_header_value.as_ref() {
let routing_header_value = self.get_http_request_header(ARCH_ROUTING_HEADER);
if routing_header_value.is_some() && !routing_header_value.as_ref().unwrap().is_empty() {
let routing_header_value = routing_header_value.as_ref().unwrap();
info!("routing header already set: {}", routing_header_value);
self.llm_provider = Some(Rc::new(LlmProvider {
name: routing_header_value.to_string(),
@ -284,27 +286,17 @@ impl HttpContext for StreamContext {
}
};
// Deserialize body into spec.
// Currently OpenAI API.
let mut deserialized_body: ChatCompletionsRequest =
match serde_json::from_slice(&body_bytes) {
Ok(deserialized) => deserialized,
Err(e) => {
debug!(
"on_http_request_body: request body: {}",
String::from_utf8_lossy(&body_bytes)
);
self.send_server_error(
ServerError::Deserialization(e),
Some(StatusCode::BAD_REQUEST),
);
return Action::Pause;
}
};
for message in deserialized_body.messages.iter_mut() {
message.model = None;
}
let mut deserialized_body = match ChatCompletionsRequest::try_from(body_bytes.as_slice()) {
Ok(deserialized) => deserialized,
Err(e) => {
debug!(
"on_http_request_body: request body: {}",
String::from_utf8_lossy(&body_bytes)
);
self.send_server_error(ServerError::OpenAIPError(e), Some(StatusCode::BAD_REQUEST));
return Action::Pause;
}
};
self.user_message = deserialized_body
.messages
@ -348,17 +340,12 @@ impl HttpContext for StreamContext {
model_name.unwrap_or(&"None".to_string()),
);
let chat_completion_request_str = serde_json::to_string(&deserialized_body).unwrap();
debug!(
"on_http_request_body: request body: {}",
chat_completion_request_str
);
if deserialized_body.stream {
if deserialized_body.stream.unwrap_or_default() {
self.streaming_response = true;
}
if deserialized_body.stream && deserialized_body.stream_options.is_none() {
if deserialized_body.stream.unwrap_or_default()
&& deserialized_body.stream_options.is_none()
{
deserialized_body.stream_options = Some(StreamOptions {
include_usage: true,
});
@ -387,7 +374,20 @@ impl HttpContext for StreamContext {
return Action::Continue;
}
self.set_http_request_body(0, body_size, chat_completion_request_str.as_bytes());
let llm_provider_str = self.llm_provider().provider_interface.to_string();
let hermes_llm_provider = Provider::from(llm_provider_str.as_str());
// convert chat completion request to llm provider specific request
let deserialized_body_bytes = match deserialized_body.to_bytes(hermes_llm_provider) {
Ok(bytes) => bytes,
Err(e) => {
warn!("Failed to serialize request body: {}", e);
self.send_server_error(ServerError::OpenAIPError(e), Some(StatusCode::BAD_REQUEST));
return Action::Pause;
}
};
self.set_http_request_body(0, body_size, &deserialized_body_bytes);
Action::Continue
}
@ -542,58 +542,33 @@ impl HttpContext for StreamContext {
}
};
let body_utf8 = match String::from_utf8(body) {
Ok(body_utf8) => body_utf8,
Err(e) => {
warn!("could not convert to utf8: {}", e);
return Action::Continue;
}
};
let llm_provider_str = self.llm_provider().provider_interface.to_string();
let hermes_llm_provider = Provider::from(llm_provider_str.as_str());
if self.streaming_response {
if body_utf8 == "data: [DONE]\n" {
return Action::Continue;
}
let chat_completions_chunk_response_events =
match ChatCompletionStreamResponseServerEvents::try_from(body_utf8.as_str()) {
Ok(response) => response,
match SseChatCompletionIter::try_from((body.as_slice(), &hermes_llm_provider)) {
Ok(events) => events,
Err(e) => {
warn!(
"invalid streaming response: body str: {}, {:?}",
body_utf8, e
);
warn!("could not parse response: {}", e);
return Action::Continue;
}
};
if chat_completions_chunk_response_events.events.is_empty() {
warn!(
"couldn't parse any streaming events: body str: {}",
body_utf8
);
return Action::Continue;
for event in chat_completions_chunk_response_events {
match event {
Ok(event) => {
if let Some(usage) = event.usage.as_ref() {
self.response_tokens += usage.completion_tokens;
}
}
Err(e) => {
warn!("error in response event: {}", e);
continue;
}
}
}
let model = chat_completions_chunk_response_events
.events
.first()
.unwrap()
.model
.clone();
let tokens_str = chat_completions_chunk_response_events.to_string();
let token_count =
match tokenizer::token_count(model.as_ref().unwrap().as_str(), tokens_str.as_str())
{
Ok(token_count) => token_count,
Err(e) => {
warn!("could not get token count: {:?}", e);
return Action::Continue;
}
};
self.response_tokens += token_count;
// Compute TTFT if not already recorded
if self.ttft_duration.is_none() {
// if let Some(start_time) = self.start_time {
@ -616,24 +591,26 @@ impl HttpContext for StreamContext {
}
} else {
debug!("non streaming response");
let chat_completions_response: ChatCompletionsResponse =
match serde_json::from_str(body_utf8.as_str()) {
let chat_completions_response =
match ChatCompletionsResponse::try_from((body.as_slice(), &hermes_llm_provider)) {
Ok(de) => de,
Err(err) => {
info!(
"non chat-completion compliant response received err: {}, body: {}",
err, body_utf8
Err(e) => {
warn!("could not parse response: {}", e);
debug!(
"on_http_response_body: S[{}], response body: {}",
self.context_id,
String::from_utf8_lossy(&body)
);
self.send_server_error(
ServerError::OpenAIPError(e),
Some(StatusCode::BAD_REQUEST),
);
return Action::Continue;
}
};
if chat_completions_response.usage.is_some() {
self.response_tokens += chat_completions_response
.usage
.as_ref()
.unwrap()
.completion_tokens;
if let Some(usage) = chat_completions_response.usage {
self.response_tokens += usage.completion_tokens;
}
}

View file

@ -202,20 +202,7 @@ fn llm_gateway_successful_request_to_open_ai_chat_completions() {
request_headers_expectations(&mut module, http_context);
// Request Body
let chat_completions_request_body = "\
{\
\"messages\": [\
{\
\"role\": \"system\",\
\"content\": \"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.\"\
},\
{\
\"role\": \"user\",\
\"content\": \"Compose a poem.\"\
}\
],\
\"model\": \"gpt-4\"\
}";
let chat_completions_request_body = r#"{"model":"gpt-4","messages":[{"role":"system","content":"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."},{"role":"user","content":"Compose a poem."}]}"#;
module
.call_proxy_on_request_body(
@ -229,7 +216,6 @@ fn llm_gateway_successful_request_to_open_ai_chat_completions() {
.expect_log(Some(LogLevel::Info), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_metric_record("input_sequence_length", 21)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
@ -268,18 +254,7 @@ fn llm_gateway_bad_request_to_open_ai_chat_completions() {
request_headers_expectations(&mut module, http_context);
// Request Body
let incomplete_chat_completions_request_body = "\
{\
\"messages\": [\
{\
\"role\": \"system\"\
},\
{\
\"role\": \"user\",\
\"content\": \"Compose a poem that explains the concept of recursion in programming.\"\
}\
]\
}";
let incomplete_chat_completions_request_body = r#"{"model":"gpt-1","messages":[{"role":"system","content":"Compose a poem that explains the concept of recursion in programming."}]}"#;
module
.call_proxy_on_request_body(
@ -290,7 +265,7 @@ fn llm_gateway_bad_request_to_open_ai_chat_completions() {
.expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
.returning(Some(incomplete_chat_completions_request_body))
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Info), Some("on_http_request_body: provider: open-ai-gpt-4, model requested: , model selected: gpt-4"))
.expect_log(Some(LogLevel::Info), Some("on_http_request_body: provider: open-ai-gpt-4, model requested: gpt-1, model selected: gpt-4"))
.expect_send_local_response(
Some(StatusCode::BAD_REQUEST.as_u16().into()),
None,
@ -300,8 +275,7 @@ fn llm_gateway_bad_request_to_open_ai_chat_completions() {
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_metric_record("input_sequence_length", 14)
.expect_log(Some(LogLevel::Debug), None)
.expect_metric_record("input_sequence_length", 13)
.expect_log(Some(LogLevel::Debug), None)
.execute_and_expect(ReturnType::Action(Action::Continue))
.unwrap();
@ -359,11 +333,10 @@ fn llm_gateway_request_ratelimited() {
.expect_log(Some(LogLevel::Info), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_metric_record("input_sequence_length", 107)
.expect_log(Some(LogLevel::Debug), Some("Applying ratelimit for model: gpt-4"))
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Warn), Some("server error occurred: exceeded limit provider=gpt-4, selector=Header { key: \"selector-key\", value: \"selector-value\" }, tokens_used=107"))
.expect_log(Some(LogLevel::Warn), Some(r#"server error occurred: exceeded limit provider=gpt-4, selector=Header { key: "selector-key", value: "selector-value" }, tokens_used=107"#))
.expect_send_local_response(
Some(StatusCode::TOO_MANY_REQUESTS.as_u16().into()),
None,
@ -399,20 +372,7 @@ fn llm_gateway_request_not_ratelimited() {
normal_flow(&mut module, filter_context, http_context);
// give shorter body to avoid rate limiting
let chat_completions_request_body = "\
{\
\"messages\": [\
{\
\"role\": \"system\",\
\"content\": \"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.\"\
},\
{\
\"role\": \"user\",\
\"content\": \"Compose a poem that explains the concept of recursion in programming.\"\
}\
],\
\"model\": \"gpt-4\"\
}";
let chat_completions_request_body = r#"{"model":"gpt-1","messages":[{"role":"system","content":"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."},{"role":"user","content":"Compose a poem that explains the concept of recursion in programming."}]}"#;
module
.call_proxy_on_request_body(
@ -427,7 +387,6 @@ fn llm_gateway_request_not_ratelimited() {
.expect_log(Some(LogLevel::Info), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_metric_record("input_sequence_length", 29)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
@ -460,20 +419,7 @@ fn llm_gateway_override_model_name() {
normal_flow(&mut module, filter_context, http_context);
// give shorter body to avoid rate limiting
let chat_completions_request_body = "\
{\
\"model\": \"o1-mini\",\
\"messages\": [\
{\
\"role\": \"system\",\
\"content\": \"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.\"\
},\
{\
\"role\": \"user\",\
\"content\": \"Compose a poem that explains the concept of recursion in programming.\"\
}\
]
}";
let chat_completions_request_body = r#"{"model":"gpt-1","messages":[{"role":"system","content":"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."},{"role":"user","content":"Compose a poem that explains the concept of recursion in programming."}]}"#;
module
.call_proxy_on_request_body(
@ -485,8 +431,7 @@ fn llm_gateway_override_model_name() {
.returning(Some(chat_completions_request_body))
// The actual call is not important in this test, we just need to grab the token_id
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Info), Some("on_http_request_body: provider: open-ai-gpt-4, model requested: o1-mini, model selected: gpt-4"))
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Info), Some("on_http_request_body: provider: open-ai-gpt-4, model requested: gpt-1, model selected: gpt-4"))
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_metric_record("input_sequence_length", 29)
@ -521,19 +466,7 @@ fn llm_gateway_override_use_default_model() {
normal_flow(&mut module, filter_context, http_context);
// give shorter body to avoid rate limiting
let chat_completions_request_body = "\
{\
\"messages\": [\
{\
\"role\": \"system\",\
\"content\": \"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.\"\
},\
{\
\"role\": \"user\",\
\"content\": \"Compose a poem that explains the concept of recursion in programming.\"\
}\
]
}";
let chat_completions_request_body = r#"{"model":"gpt-1","messages":[{"role":"system","content":"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."},{"role":"user","content":"Compose a poem that explains the concept of recursion in programming."}]}"#;
module
.call_proxy_on_request_body(
@ -547,14 +480,13 @@ fn llm_gateway_override_use_default_model() {
// The actual call is not important in this test, we just need to grab the token_id
.expect_log(
Some(LogLevel::Info),
Some("on_http_request_body: provider: open-ai-gpt-4, model requested: , model selected: gpt-4"),
Some("on_http_request_body: provider: open-ai-gpt-4, model requested: gpt-1, model selected: gpt-4"),
)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_metric_record("input_sequence_length", 29)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), Some("Applying ratelimit for model: gpt-4"))
.expect_log(Some(LogLevel::Debug), Some(r#"Checking limit for provider=gpt-4, with selector=Header { key: "selector-key", value: "selector-value" }, consuming tokens=29"#))
.expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None)
.execute_and_expect(ReturnType::Action(Action::Continue))
.unwrap();
@ -584,20 +516,7 @@ fn llm_gateway_override_use_model_name_none() {
normal_flow(&mut module, filter_context, http_context);
// give shorter body to avoid rate limiting
let chat_completions_request_body = "\
{\
\"model\": \"none\",\
\"messages\": [\
{\
\"role\": \"system\",\
\"content\": \"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.\"\
},\
{\
\"role\": \"user\",\
\"content\": \"Compose a poem that explains the concept of recursion in programming.\"\
}\
]
}";
let chat_completions_request_body = r#"{"model":"none","messages":[{"role":"system","content":"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."},{"role":"user","content":"Compose a poem that explains the concept of recursion in programming."}]}"#;
module
.call_proxy_on_request_body(
@ -615,7 +534,6 @@ fn llm_gateway_override_use_model_name_none() {
.expect_metric_record("input_sequence_length", 29)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_log(Some(LogLevel::Debug), None)
.expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None)
.execute_and_expect(ReturnType::Action(Action::Continue))
.unwrap();

View file

@ -312,6 +312,7 @@ fn prompt_gateway_bad_request_to_open_ai_chat_completions() {
}
#[test]
#[ignore]
#[serial]
fn prompt_gateway_request_to_llm_gateway() {
let args = tester::MockSettings {
@ -462,6 +463,7 @@ fn prompt_gateway_request_to_llm_gateway() {
}
#[test]
#[ignore]
#[serial]
fn prompt_gateway_request_no_intent_match() {
let args = tester::MockSettings {
@ -608,6 +610,7 @@ ratelimits:
}
#[test]
#[ignore]
#[serial]
fn prompt_gateway_request_no_intent_match_default_target() {
let args = tester::MockSettings {

View file

@ -7,7 +7,8 @@ Content-Type: application/json
"role": "user",
"content": "convert 100 eur"
}
]
],
"model": "none"
}
HTTP 200
[Asserts]

View file

@ -8,7 +8,8 @@ Content-Type: application/json
"content": "convert 100 eur"
}
],
"stream": true
"stream": true,
"model": "none"
}
HTTP 200
[Asserts]

View file

@ -5,9 +5,10 @@ Content-Type: application/json
"messages": [
{
"role": "user",
"content": "I am running under debt, how should I keep a tab on my expenses?"
"content": "hi"
}
]
],
"model": "none"
}
HTTP 200
[Asserts]

View file

@ -5,9 +5,10 @@ Content-Type: application/json
"messages": [
{
"role": "user",
"content": "I am running under debt, how should I keep a tab on my expenses?"
"content": "hi"
}
],
"model": "none",
"stream": true
}
HTTP 200

View file

@ -24,7 +24,7 @@ trap 'print_debug' INT TERM ERR
log starting > ../build.log
log building and running function_callling demo
log building and running function_calling demo
log ===========================================
cd ../../demos/samples_python/weather_forecast/
docker compose up weather_forecast_service --build -d

View file

@ -2,8 +2,33 @@
@openai_endpoint = https://api.openai.com
@access_key = {{$dotenv OPENAI_API_KEY}}
### openai request
POST {{openai_endpoint}}/v1/chat/completions HTTP/1.1
POST {{llm_endpoint}}/v1/chat/completions HTTP/1.1
content-type: application/json
authorization: Bearer
accept: */*
accept-encoding: deflate
user-agent: Python/3.11 aiohttp/3.11.11
content-length: 876
x-forwarded-proto: https
x-request-id: 99d7817d-a646-9497-a38d-710b1ce1325f
traceparent: 00-e4c9fc8cf9fc7714c6a15ef34852fb30-573a351a98e0cd01-01
tracestate:
x-arch-llm-provider-hint: gpt-4o-mini
{
"model": "gpt-4o-mini",
"messages": [
{
"role": "user",
"content": "### Task:\nGenerate 1-3 broad tags categorizing the main themes of the chat history, along with 1-3 more specific subtopic tags.\n\n### Guidelines:\n- Start with high-level domains (e.g. Science, Technology, Philosophy, Arts, Politics, Business, Health, Sports, Entertainment, Education)\n- Consider including relevant subfields/subdomains if they are strongly represented throughout the conversation\n- If content is too short (less than 3 messages) or too diverse, use only [\"General\"]\n- Use the chat's primary language; default to English if multilingual\n- Prioritize accuracy over specificity\n\n### Output:\nJSON format: { \"tags\": [\"tag1\", \"tag2\", \"tag3\"] }\n\n### Chat History:\n<chat_history>\nUSER: hello\nASSISTANT: Hello! How can I assist you today?\n</chat_history>"
}
],
"stream": false
}
### test
POST {{llm_endpoint}}/v1/chat/completions HTTP/1.1
Content-Type: application/json
Authorization: Bearer {{access_key}}
@ -15,7 +40,7 @@ Authorization: Bearer {{access_key}}
}
],
"model": "gpt-4o-mini",
"stream": true
"stream": false
}
### openai request (streaming)
@ -75,3 +100,48 @@ x-arch-llm-provider-hint: gpt-3.5-turbo-0125
}
]
}
### llm gateway request with function calling (default target)
POST {{llm_endpoint}}/v1/chat/completions HTTP/1.1
Content-Type: application/json
{
"stream": true,
"model": "None",
"messages": [
{
"role": "user",
"content": "how is the weather in seattle"
}
],
"tools": [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get current weather at a location.",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The location to get the weather for",
"format": "City, State"
},
"unit": {
"type": "string",
"description": "The unit to return the weather in.",
"enum": ["celsius", "fahrenheit"],
"default": "celsius"
},
"days": {
"type": "string",
"description": "The number of days for the request."
}
},
"required": ["location", "days"]
}
}
}
]
}